import re
from collections import Counter
from typing import Dict, List

import tiktoken


class TextUtils:
    """Text processing utilities."""

    @staticmethod
    def count_tokens(text: str, model: str = "gpt-4") -> int:
        """Count the number of tokens in the text."""
        try:
            encoding = tiktoken.encoding_for_model(model)
            return len(encoding.encode(text))
        except Exception:
            # Fallback if tiktoken doesn't recognize the model (or fails to
            # load an encoding). Rough estimate: ~4 English characters per
            # token, ~1.5 Chinese characters per token (conservative figures).
            chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
            english_chars = len(text) - chinese_chars
            return int(chinese_chars / 1.5 + english_chars / 4)

    @staticmethod
    def clean_text(text: str) -> str:
        """Clean text: collapse whitespace and strip unusual characters."""
        # Collapse runs of whitespace into single spaces
        text = re.sub(r'\s+', ' ', text)
        # Remove special characters, keeping word characters plus basic
        # punctuation in both ASCII and full-width (CJK) forms, so that
        # sentence splitting on Chinese terminators still works afterwards
        text = re.sub(
            r'[^\w\s,.!?;:\'\"()\-—,。!?;:、“”‘’《》「」『』【】\u4e00-\u9fff]',
            '',
            text,
        )
        return text.strip()

    @staticmethod
    def split_into_sentences(text: str) -> List[str]:
        """Split text into sentences on English (.!?) and Chinese (。!?) terminators."""
        sentences = re.split(r'[.!?。!?]+', text)
        return [s.strip() for s in sentences if s.strip()]

    @staticmethod
    def detect_language(text: str) -> str:
        """Detect the dominant language of the text."""
        # Count Chinese characters and English letters
        chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
        english_chars = len(re.findall(r'[a-zA-Z]', text))

        total_chars = chinese_chars + english_chars
        if total_chars == 0:
            return "unknown"

        chinese_ratio = chinese_chars / total_chars
        if chinese_ratio > 0.3:
            return "zh"
        elif chinese_ratio < 0.1:
            return "en"
        else:
            return "mixed"

    @staticmethod
    def extract_dialogues(text: str, language: str = "en") -> List[Dict]:
        """Extract quoted dialogue, with speech attribution where present."""
        dialogues = []

        if language == "zh":
            # Chinese dialogue: quoted content, optionally followed by an
            # attribution phrase ending in a speech verb (说/道/...).
            # Full-width quotes are tried first, straight quotes as fallback.
            patterns = [
                r'“([^”]+)”[,,]?\s*([^,。“”]*?(?:说|道|讲|告诉|问|答|叫|喊))',
                r'“([^”]+)”',
                r'"([^"]+)"',
                r'「([^」]+)」',
                r'『([^』]+)』',
            ]
        else:
            # English dialogue: quoted content, optionally followed by a
            # short attribution phrase ending in a speech verb. The
            # single-quote patterns are naive about apostrophes.
            patterns = [
                r'"([^"]+)"[,.]?\s*((?:\w+\s+){0,4}?(?:said|asked|replied|shouted|whispered|muttered))',
                r'"([^"]+)"',
                r"'([^']+)'[,.]?\s*((?:\w+\s+){0,4}?(?:said|asked|replied))",
                r"'([^']+)'",
            ]

        # More specific patterns run first; skip positions they already
        # matched so the bare-quote fallbacks don't add duplicates.
        seen_positions = set()
        for pattern in patterns:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                if match.start() in seen_positions:
                    continue
                seen_positions.add(match.start())
                dialogues.append({
                    'content': match.group(1),
                    'attribution': match.group(2) if len(match.groups()) > 1 else '',
                    'position': match.start(),
                })

        dialogues.sort(key=lambda d: d['position'])
        return dialogues

    @staticmethod
    def truncate_text(text: str, max_length: int, ellipsis: str = "...") -> str:
        """Truncate text to max_length characters, appending an ellipsis."""
        if len(text) <= max_length:
            return text
        if max_length <= len(ellipsis):
            # No room for the ellipsis; hard-truncate instead
            return text[:max_length]
        return text[:max_length - len(ellipsis)] + ellipsis

    @staticmethod
    def extract_keywords(text: str, top_n: int = 10) -> List[str]:
        """Extract keywords (simple frequency-based implementation)."""
        # Tokenize on word boundaries. Chinese is not segmented, so
        # contiguous Chinese text comes back as one "word"; this method
        # works best on space-delimited languages.
        words = re.findall(r'\b\w+\b', text.lower())

        # Minimal English and Chinese stop-word list
        stop_words = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
            'for', 'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are',
            'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do',
            'does', 'did', 'will', 'would', 'could', 'should', 'may',
            'might', 'can', 'this', 'that',
            '的', '了', '在', '是', '我', '有', '和', '就', '不', '人',
            '都', '一', '一个', '上', '也', '很', '到', '说', '要', '去',
            '你', '会', '着', '没有',
        }

        # Drop stop words and very short tokens, but keep short CJK tokens,
        # which are often meaningful words on their own
        filtered_words = [
            w for w in words
            if w not in stop_words
            and (len(w) > 2 or re.search(r'[\u4e00-\u9fff]', w))
        ]

        # Count frequencies and return the most common words
        word_freq = Counter(filtered_words)
        return [word for word, _ in word_freq.most_common(top_n)]

    @staticmethod
    def estimate_reading_time(text: str, wpm: int = 200) -> int:
        """Estimate reading time in minutes."""
        # Count English words and Chinese characters separately so Chinese
        # runs are not double-counted as "words"
        english_words = len(re.findall(r'\b[a-zA-Z]+\b', text))
        chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))

        # Chinese: ~500 characters/minute; English: `wpm` words/minute
        reading_time = chinese_chars / 500 + english_words / wpm
        return max(1, int(reading_time))
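

# A minimal usage sketch (illustrative only, not part of the module's API):
# exercises the utilities above on a small, made-up mixed-language sample.
if __name__ == "__main__":
    sample = 'He smiled. "It works," she said. 这是一个中英文混合的例子。'

    print(TextUtils.count_tokens(sample))           # tiktoken count, or the heuristic fallback
    print(TextUtils.detect_language(sample))        # dominant language code
    print(TextUtils.split_into_sentences(sample))   # sentences split on .!? / 。!?
    print(TextUtils.extract_dialogues(sample))      # [{'content': 'It works,', 'attribution': 'she said', ...}]
    print(TextUtils.clean_text(sample))             # whitespace collapsed, odd characters stripped
    print(TextUtils.truncate_text(sample, 20))      # first 17 characters + "..."
    print(TextUtils.extract_keywords(sample))       # most frequent non-stop-words
    print(TextUtils.estimate_reading_time(sample))  # minutes, minimum 1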