# FictionAgent/utils/text_utils.py
import re
from collections import Counter
from typing import Dict, List

import tiktoken
class TextUtils:
"""文本处理工具类"""
@staticmethod
def count_tokens(text: str, model: str = "gpt-4") -> int:
"""计算文本的token数量"""
try:
encoding = tiktoken.encoding_for_model(model)
return len(encoding.encode(text))
        except Exception:
            # Rough fallback: ~4 English characters per token and
            # ~1.5 Chinese characters per token (a conservative estimate).
chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
english_chars = len(text) - chinese_chars
return int(chinese_chars / 1.5 + english_chars / 4)
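
    # Illustrative check of the fallback heuristic (the tiktoken path, when
    # available, returns exact counts instead):
    #   "你好world" -> int(2 / 1.5 + 5 / 4) = 2 tokens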
@staticmethod
def clean_text(text: str) -> str:
"""清理文本"""
# 移除多余的空白
text = re.sub(r'\s+', ' ', text)
# 移除特殊字符(保留基本标点)
text = re.sub(r'[^\w\s,.!?;:\'\"()\-—《》「」『』【】\u4e00-\u9fff]', '', text)
return text.strip()
@staticmethod
def split_into_sentences(text: str) -> List[str]:
"""分割句子"""
# 支持中英文句子分割
# 英文句号、问号、感叹号
# 中文句号、问号、感叹号
sentences = re.split(r'[.!?。!?]+', text)
return [s.strip() for s in sentences if s.strip()]
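
    # Example (illustrative): split_into_sentences("Hi! 你好。Done?") returns
    # ['Hi', '你好', 'Done'].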
@staticmethod
def detect_language(text: str) -> str:
"""检测文本语言"""
# 统计中文字符
chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
# 统计英文字符
english_chars = len(re.findall(r'[a-zA-Z]', text))
total_chars = chinese_chars + english_chars
if total_chars == 0:
return "unknown"
chinese_ratio = chinese_chars / total_chars
if chinese_ratio > 0.3:
return "zh"
elif chinese_ratio < 0.1:
return "en"
else:
return "mixed"
@staticmethod
def extract_dialogues(text: str, language: str = "en") -> List[Dict]:
"""提取对话"""
dialogues = []
if language == "zh":
# 中文对话模式:引号内的内容
patterns = [
r'"([^"]+)"[,,]?\s*([^说道讲告诉问答叫喊]*(?:说|道|讲|告诉|问|答|叫|喊))',
r'"([^"]+)"',
r'「([^」]+)」',
r'『([^』]+)』'
]
else:
# 英文对话模式
patterns = [
r'"([^"]+)",?\s*([^said]*(said|asked|replied|shouted|whispered|muttered))',
r'"([^"]+)"',
r"'([^']+)',?\s*([^said]*(said|asked|replied))",
r"'([^']+)'"
]
for pattern in patterns:
matches = re.finditer(pattern, text, re.IGNORECASE)
for match in matches:
dialogue = {
'content': match.group(1),
'attribution': match.group(2) if len(match.groups()) > 1 else '',
'position': match.start()
}
dialogues.append(dialogue)
return dialogues
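
    # Example (illustrative): extract_dialogues('"Hello," he said.') yields
    # [{'content': 'Hello,', 'attribution': 'he said', 'position': 0}].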
@staticmethod
def truncate_text(text: str, max_length: int,
ellipsis: str = "...") -> str:
"""截断文本到指定长度"""
if len(text) <= max_length:
return text
return text[:max_length - len(ellipsis)] + ellipsis
@staticmethod
def extract_keywords(text: str, top_n: int = 10) -> List[str]:
"""提取关键词(简单实现)"""
# 移除标点和停用词
words = re.findall(r'\b\w+\b', text.lower())
# 简单的停用词列表
stop_words = {
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
'would', 'could', 'should', 'may', 'might', 'can', 'this', 'that',
'的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一',
'一个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有'
}
        # Drop stop words and very short words.
        filtered_words = [w for w in words if w not in stop_words and len(w) > 2]
        # Count frequencies and return the most common words.
        word_freq = Counter(filtered_words)
        return [word for word, freq in word_freq.most_common(top_n)]
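
    # Example (illustrative):
    #   extract_keywords("The cat sat on the mat with the cat", top_n=2)
    #   -> ['cat', 'sat']   (ties are broken by first occurrence)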
@staticmethod
def estimate_reading_time(text: str, wpm: int = 200) -> int:
"""估计阅读时间(分钟)"""
words = len(re.findall(r'\b\w+\b', text))
chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
# 中文按字符数/500,英文按单词数/wpm
reading_time = chinese_chars / 500 + words / wpm
return max(1, int(reading_time))
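

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only; exact token counts depend on
    # the installed tiktoken version and its model encodings).
    sample = 'The rain stopped. "Hello," he said. 他笑了。'
    print(TextUtils.detect_language(sample))
    print(TextUtils.count_tokens(sample))
    print(TextUtils.split_into_sentences(sample))
    print(TextUtils.extract_dialogues(sample, language="en"))
    print(TextUtils.estimate_reading_time(sample))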