Eliot0110 committed on
Commit
3327cc4
·
1 Parent(s): c22dbae

improve: budget mapping

Browse files
Files changed (1) hide show
  1. modules/info_extractor.py +30 -26
modules/info_extractor.py CHANGED
@@ -756,7 +756,7 @@ class InfoExtractor:
756
  def _extract_budget_from_tokens(self, tokens: list) -> dict:
757
 
758
  result = {}
759
- text = "".join(tokens).lower()
760
 
761
  # --- 1. 统一提取金额和货币 ---
762
  # 按优先级排列正则表达式,越精确的模式越靠前
@@ -788,11 +788,11 @@ class InfoExtractor:
788
  ]
789
 
790
  amount_found = False
791
- for pattern, default_multiplier, default_currency in patterns:
792
- match = re.search(pattern, text)
793
  if match:
794
  # 检查是否是纯数字模式,是的话需要上下文
795
- if pattern in [r'(\d+\.?\d+)\s*(千|k|万|w)', r'(\d+\.?\d+)']:
796
  budget_indicators = ['预算', '花费', '费用', '成本', '开销', '支出', 'budget', 'cost']
797
  if not any(indicator in text for indicator in budget_indicators):
798
  continue # 如果没有上下文,则跳过纯数字匹配
@@ -800,14 +800,16 @@ class InfoExtractor:
800
  groups = match.groups()
801
 
802
  # 提取金额
803
- amount = float(groups[0])
 
804
 
805
  # 确定乘数
806
  multiplier = 1
807
  multiplier_token = ''
808
- if 'multiplier' in p['groups'] and p['groups']['multiplier'] <= len(groups) and groups[p['groups']['multiplier']-1]:
809
- multiplier_token = groups[p['groups']['multiplier']-1]
810
-
 
811
  if '十' in multiplier_token:
812
  multiplier = 10
813
  elif '百' in multiplier_token or 'hundred' in multiplier_token:
@@ -822,14 +824,12 @@ class InfoExtractor:
822
 
823
  # 确定货币
824
  currency_token = ''
825
- if default_currency:
826
- result['currency'] = default_currency
827
- elif len(groups) > 2 and groups[2]:
828
- currency_token = groups[2]
829
-
830
- currency_map = {
831
- 'rmb': 'RMB', 'usd': 'USD', 'eur': 'EUR', 'gbp': 'GBP', 'chf': 'CHF', 'jpy': 'JPY'
832
- }
833
  if currency_token in currency_map:
834
  result['currency'] = currency_map[currency_token]
835
 
@@ -860,17 +860,21 @@ class InfoExtractor:
860
  result["type"] = found_type
861
  result["description"] = found_type_keyword # 使用找到的最匹配的关键词作为描述
862
 
863
- chinese_money_mapping = {
864
- '一千': 1000, '两千': 2000, '三千': 3000, '四千': 4000, '五千': 5000,
865
- '六千': 6000, '七千': 7000, '八千': 8000, '九千': 9000,
866
- '一万': 10000, '两万': 20000, '三万': 30000, '四万': 40000, '五万': 50000
867
- }
868
-
869
  if not result.get("amount"):
870
- for token in tokens:
871
- if token in chinese_money_mapping:
872
- result["amount"] = chinese_money_mapping[token]
873
- # 这里同样不设置默认货币
 
 
 
 
 
 
 
 
 
 
874
  break
875
 
876
  return result
 
756
  def _extract_budget_from_tokens(self, tokens: list) -> dict:
757
 
758
  result = {}
759
+ text = "".join(tokens).lower().strip()
760
 
761
  # --- 1. 统一提取金额和货币 ---
762
  # 按优先级排列正则表达式,越精确的模式越靠前
 
788
  ]
789
 
790
  amount_found = False
791
+ for p in patterns:
792
+ match = re.search(p['regex'], text)
793
  if match:
794
  # 检查是否是纯数字模式,是的话需要上下文
795
+ if p.get('context_needed', False):
796
  budget_indicators = ['预算', '花费', '费用', '成本', '开销', '支出', 'budget', 'cost']
797
  if not any(indicator in text for indicator in budget_indicators):
798
  continue # 如果没有上下文,则跳过纯数字匹配
 
800
  groups = match.groups()
801
 
802
  # 提取金额
803
+ amount_group_index = p['groups']['amount'] - 1
804
+ amount = float(groups[amount_group_index])
805
 
806
  # 确定乘数
807
  multiplier = 1
808
  multiplier_token = ''
809
+ if 'multiplier' in p['groups']:
810
+ multiplier_group_index = groups[p['groups']['multiplier']-1]
811
+ if multiplier_group_index < len(groups) and groups[multiplier_group_index]:
812
+ multiplier_token = groups[multiplier_group_index]
813
  if '十' in multiplier_token:
814
  multiplier = 10
815
  elif '百' in multiplier_token or 'hundred' in multiplier_token:
 
824
 
825
  # 确定货币
826
  currency_token = ''
827
+ if p.get('currency'):
828
+ result['currency'] = p['currency']
829
+ elif 'currency_str' in p['groups']:
830
+ currency_group_index = p['groups']['currency_str'] - 1
831
+ currency_token = groups[currency_group_index]
832
+ currency_map = {'rmb': 'RMB', 'usd': 'USD', 'eur': 'EUR', 'gbp': 'GBP', 'chf': 'CHF', 'jpy': 'JPY'}
 
 
833
  if currency_token in currency_map:
834
  result['currency'] = currency_map[currency_token]
835
 
 
860
  result["type"] = found_type
861
  result["description"] = found_type_keyword # 使用找到的最匹配的关键词作为描述
862
 
 
 
 
 
 
 
863
  if not result.get("amount"):
864
+ chinese_money_mapping = {
865
+ '一万': 10000, '两万': 20000, '三万': 30000, '四万': 40000, '五万': 50000,
866
+ '一千': 1000, '两千': 2000, '三千': 3000, '四千': 4000, '五千': 5000,
867
+ '六千': 6000, '七千': 7000, '八千': 8000, '九千': 9000,
868
+ '一百': 100, '二百': 200, '三百': 300, '四百': 400, '五百': 500,
869
+ '六百': 600, '七百': 700, '八百': 800, '九百': 900,
870
+ '十': 10, '二十': 20, '三十': 30, '四十': 40, '五十': 50,
871
+ '六十': 60, '七十': 70, '八十': 80, '九十': 90
872
+ }
873
+
874
+ sorted_keys = sorted(chinese_money_mapping.keys(), key=len, reverse=True)
875
+ for name in sorted_keys:
876
+ if name in text:
877
+ result['amount'] = chinese_money_mapping[name]
878
  break
879
 
880
  return result