import re
from collections import Counter
from typing import Dict, List

try:
    import tiktoken  # optional: count_tokens falls back to a heuristic without it
except ImportError:
    tiktoken = None

class TextUtils:
    """Text-processing utilities for mixed Chinese/English text."""

    @staticmethod
    def count_tokens(text: str, model: str = "gpt-4") -> int:
        """Count the number of tokens in the text."""
        if tiktoken is not None:
            try:
                encoding = tiktoken.encoding_for_model(model)
                return len(encoding.encode(text))
            except Exception:
                pass  # unknown model, missing BPE data, etc.: use the heuristic
        # Rough, deliberately conservative estimate:
        # English ~4 chars per token, Chinese ~1.5 chars per token
        chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
        english_chars = len(text) - chinese_chars
        return int(chinese_chars / 1.5 + english_chars / 4)
    
    @staticmethod
    def clean_text(text: str) -> str:
        """Normalize whitespace and strip unwanted characters."""
        # Collapse runs of whitespace into a single space
        text = re.sub(r'\s+', ' ', text)
        # Strip special characters, keeping basic ASCII punctuation,
        # full-width Chinese punctuation, CJK brackets, and Chinese characters
        text = re.sub(r'[^\w\s,.!?;:\'\"()\-—,。!?;:、“”‘’《》「」『』【】\u4e00-\u9fff]', '', text)
        return text.strip()
    
    @staticmethod
    def split_into_sentences(text: str) -> List[str]:
        """Split text into sentences."""
        # Split on both English (.!?) and Chinese (。!?) sentence terminators
        sentences = re.split(r'[.!?。!?]+', text)
        return [s.strip() for s in sentences if s.strip()]
    
    @staticmethod
    def detect_language(text: str) -> str:
        """Detect the dominant language of the text."""
        # Count Chinese characters
        chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
        # Count English letters
        english_chars = len(re.findall(r'[a-zA-Z]', text))

        total_chars = chinese_chars + english_chars

        if total_chars == 0:
            return "unknown"

        chinese_ratio = chinese_chars / total_chars

        if chinese_ratio > 0.3:
            return "zh"
        elif chinese_ratio < 0.1:
            return "en"
        else:
            return "mixed"
    
    @staticmethod
    def extract_dialogues(text: str, language: str = "en") -> List[Dict]:
        """Extract quoted dialogue, with speech attribution where possible."""
        dialogues = []

        if language == "zh":
            # Chinese dialogue: quoted content, optionally followed by a speech verb
            patterns = [
                r'"([^"]+)"[,,]?\s*([^说道讲告诉问答叫喊]*(?:说|道|讲|告诉|问|答|叫|喊))',
                r'"([^"]+)"',
                r'「([^」]+)」',
                r'『([^』]+)』'
            ]
        else:
            # English dialogue: quoted content, optionally followed by up to a
            # few words and a speech verb
            patterns = [
                r'"([^"]+)",?\s*((?:\w+\s+){0,3}(?:said|asked|replied|shouted|whispered|muttered))',
                r'"([^"]+)"',
                r"'([^']+)',?\s*((?:\w+\s+){0,3}(?:said|asked|replied))",
                r"'([^']+)'"
            ]

        # The bare-quote patterns re-match quotes already captured with
        # attribution, so deduplicate by start position
        seen_positions = set()
        for pattern in patterns:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                if match.start() in seen_positions:
                    continue
                seen_positions.add(match.start())
                dialogues.append({
                    'content': match.group(1),
                    'attribution': match.group(2) if len(match.groups()) > 1 else '',
                    'position': match.start()
                })

        return dialogues
    
    @staticmethod
    def truncate_text(text: str, max_length: int,
                      ellipsis: str = "...") -> str:
        """Truncate text to at most max_length characters, appending an ellipsis."""
        if len(text) <= max_length:
            return text

        return text[:max_length - len(ellipsis)] + ellipsis
    
    @staticmethod
    def extract_keywords(text: str, top_n: int = 10) -> List[str]:
        """Extract keywords (simple frequency-based implementation)."""
        # Tokenize on word boundaries; note this does not segment Chinese,
        # so unsegmented runs of Chinese characters count as single tokens
        words = re.findall(r'\b\w+\b', text.lower())

        # Minimal stop-word list (English and Chinese)
        stop_words = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
            'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
            'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
            'would', 'could', 'should', 'may', 'might', 'can', 'this', 'that',
            '的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一',
            '一个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有'
        }

        # Drop stop words and very short tokens
        filtered_words = [w for w in words if w not in stop_words and len(w) > 2]

        # Count frequencies and return the most common words
        word_freq = Counter(filtered_words)
        return [word for word, freq in word_freq.most_common(top_n)]
    
    @staticmethod
    def estimate_reading_time(text: str, wpm: int = 200) -> int:
        """Estimate reading time in minutes."""
        # Count English words and Chinese characters separately, so that
        # runs of Chinese characters are not also counted as words
        english_words = len(re.findall(r'\b[a-zA-Z]+\b', text))
        chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))

        # Chinese at ~500 characters per minute, English at wpm words per minute
        reading_time = chinese_chars / 500 + english_words / wpm

        return max(1, int(reading_time))
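

# A minimal smoke test, assuming the module is run directly; the sample
# strings below are illustrative inputs, not part of the original module.
if __name__ == "__main__":
    sample_en = 'Hello world. "Nice to meet you," John said.'
    sample_zh = "你好世界。今天天气很好。"

    print(TextUtils.count_tokens(sample_en))          # tiktoken count, or heuristic fallback
    print(TextUtils.detect_language(sample_en))       # -> "en"
    print(TextUtils.detect_language(sample_zh))       # -> "zh"
    print(TextUtils.split_into_sentences(sample_zh))  # -> ['你好世界', '今天天气很好']
    print(TextUtils.extract_dialogues(sample_en))     # quote content + "John said" attribution
    print(TextUtils.estimate_reading_time(sample_en * 100))  # minutes, minimum 1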