Spaces:

Eliot0110
/

Travel_Assistant

Sleeping

App Files Files Community

Eliot0110 committed on Aug 6

Commit

3327cc4

1 Parent(s): c22dbae

improve: budget mapping

Browse files

Files changed (1) hide show

modules/info_extractor.py +30 -26

modules/info_extractor.py CHANGED Viewed

@@ -756,7 +756,7 @@ class InfoExtractor:
     def _extract_budget_from_tokens(self, tokens: list) -> dict:
         result = {}
-        text = "".join(tokens).lower()
         # --- 1. 统一提取金额和货币 ---
         # 按优先级排列正则表达式，越精确的模式越靠前
@@ -788,11 +788,11 @@ class InfoExtractor:
         ]
         amount_found = False
-        for pattern, default_multiplier, default_currency in patterns:
-            match = re.search(pattern, text)
             if match:
                 # 检查是否是纯数字模式，是的话需要上下文
-                if pattern in [r'(\d+\.?\d+)\s*(千|k|万|w)', r'(\d+\.?\d+)']:
                     budget_indicators = ['预算', '花费', '费用', '成本', '开销', '支出', 'budget', 'cost']
                     if not any(indicator in text for indicator in budget_indicators):
                         continue # 如果没有上下文，则跳过纯数字匹配
@@ -800,14 +800,16 @@ class InfoExtractor:
                 groups = match.groups()
                 # 提取金额
-                amount = float(groups[0])
                 # 确定乘数
                 multiplier = 1
                 multiplier_token = ''
-                if 'multiplier' in p['groups'] and p['groups']['multiplier'] <= len(groups) and groups[p['groups']['multiplier']-1]:
-                    multiplier_token = groups[p['groups']['multiplier']-1]
                 if '十' in multiplier_token:
                     multiplier = 10
                 elif '百' in multiplier_token or 'hundred' in multiplier_token:
@@ -822,14 +824,12 @@ class InfoExtractor:
                 # 确定货币
                 currency_token = ''
-                if default_currency:
-                    result['currency'] = default_currency
-                elif len(groups) > 2 and groups[2]:
-                    currency_token = groups[2]
-                currency_map = {
-                    'rmb': 'RMB', 'usd': 'USD', 'eur': 'EUR', 'gbp': 'GBP', 'chf': 'CHF', 'jpy': 'JPY'
-                }
                 if currency_token in currency_map:
                     result['currency'] = currency_map[currency_token]
@@ -860,17 +860,21 @@ class InfoExtractor:
             result["type"] = found_type
             result["description"] = found_type_keyword # 使用找到的最匹配的关键词作为描述
-        chinese_money_mapping = {
-            '一千': 1000, '两千': 2000, '三千': 3000, '四千': 4000, '五千': 5000,
-            '六千': 6000, '七千': 7000, '八千': 8000, '九千': 9000,
-            '一万': 10000, '两万': 20000, '三万': 30000, '四万': 40000, '五万': 50000
-        }
         if not result.get("amount"):
-            for token in tokens:
-                if token in chinese_money_mapping:
-                    result["amount"] = chinese_money_mapping[token]
-                    # 这里同样不设置默认货币
                     break
         return result

     def _extract_budget_from_tokens(self, tokens: list) -> dict:
         result = {}
+        text = "".join(tokens).lower().strip()
         # --- 1. 统一提取金额和货币 ---
         # 按优先级排列正则表达式，越精确的模式越靠前
         ]
         amount_found = False
+        for p in patterns:
+            match = re.search(p['regex'], text)
             if match:
                 # 检查是否是纯数字模式，是的话需要上下文
+                if p.get('context_needed', False):
                     budget_indicators = ['预算', '花费', '费用', '成本', '开销', '支出', 'budget', 'cost']
                     if not any(indicator in text for indicator in budget_indicators):
                         continue # 如果没有上下文，则跳过纯数字匹配
                 groups = match.groups()
                 # 提取金额
+                amount_group_index = p['groups']['amount'] - 1
+                amount = float(groups[amount_group_index])
                 # 确定乘数
                 multiplier = 1
                 multiplier_token = ''
+                if 'multiplier' in p['groups']:
+                    multiplier_group_index = groups[p['groups']['multiplier']-1]
+                    if multiplier_group_index < len(groups) and groups[multiplier_group_index]:
+                        multiplier_token = groups[multiplier_group_index]
                 if '十' in multiplier_token:
                     multiplier = 10
                 elif '百' in multiplier_token or 'hundred' in multiplier_token:
                 # 确定货币
                 currency_token = ''
+                if p.get('currency'):
+                    result['currency'] = p['currency']
+                elif 'currency_str' in p['groups']:
+                    currency_group_index = p['groups']['currency_str'] - 1
+                    currency_token = groups[currency_group_index]
+                    currency_map = {'rmb': 'RMB', 'usd': 'USD', 'eur': 'EUR', 'gbp': 'GBP', 'chf': 'CHF', 'jpy': 'JPY'}
                 if currency_token in currency_map:
                     result['currency'] = currency_map[currency_token]
             result["type"] = found_type
             result["description"] = found_type_keyword # 使用找到的最匹配的关键词作为描述
         if not result.get("amount"):
+            chinese_money_mapping = {
+                '一万': 10000, '两万': 20000, '三万': 30000, '四万': 40000, '五万': 50000,
+                '一千': 1000, '两千': 2000, '三千': 3000, '四千': 4000, '五千': 5000,
+                '六千': 6000, '七千': 7000, '八千': 8000, '九千': 9000,
+                '一百': 100, '二百': 200, '三百': 300, '四百': 400, '五百': 500,
+                '六百': 600, '七百': 700, '八百': 800, '九百': 900,
+                '十': 10, '二十': 20, '三十': 30, '四十': 40, '五十': 50,
+                '六十': 60, '七十': 70, '八十': 80, '九十': 90
+            }
+            sorted_keys = sorted(chinese_money_mapping.keys(), key=len, reverse=True)
+            for name in sorted_keys:
+                if name in text:
+                    result['amount'] = chinese_money_mapping[name]
                     break
         return result