Spaces:
Sleeping
Sleeping
improve: budget mapping
Browse files — modules/info_extractor.py (+30 -26)
modules/info_extractor.py
CHANGED
|
@@ -756,7 +756,7 @@ class InfoExtractor:
|
|
| 756 |
def _extract_budget_from_tokens(self, tokens: list) -> dict:
|
| 757 |
|
| 758 |
result = {}
|
| 759 |
-
text = "".join(tokens).lower()
|
| 760 |
|
| 761 |
# --- 1. 统一提取金额和货币 ---
|
| 762 |
# 按优先级排列正则表达式,越精确的模式越靠前
|
|
@@ -788,11 +788,11 @@ class InfoExtractor:
|
|
| 788 |
]
|
| 789 |
|
| 790 |
amount_found = False
|
| 791 |
-
for
|
| 792 |
-
match = re.search(
|
| 793 |
if match:
|
| 794 |
# 检查是否是纯数字模式,是的话需要上下文
|
| 795 |
-
if
|
| 796 |
budget_indicators = ['预算', '花费', '费用', '成本', '开销', '支出', 'budget', 'cost']
|
| 797 |
if not any(indicator in text for indicator in budget_indicators):
|
| 798 |
continue # 如果没有上下文,则跳过纯数字匹配
|
|
@@ -800,14 +800,16 @@ class InfoExtractor:
|
|
| 800 |
groups = match.groups()
|
| 801 |
|
| 802 |
# 提取金额
|
| 803 |
-
|
|
|
|
| 804 |
|
| 805 |
# 确定乘数
|
| 806 |
multiplier = 1
|
| 807 |
multiplier_token = ''
|
| 808 |
-
if 'multiplier' in p['groups']
|
| 809 |
-
|
| 810 |
-
|
|
|
|
| 811 |
if '十' in multiplier_token:
|
| 812 |
multiplier = 10
|
| 813 |
elif '百' in multiplier_token or 'hundred' in multiplier_token:
|
|
@@ -822,14 +824,12 @@ class InfoExtractor:
|
|
| 822 |
|
| 823 |
# 确定货币
|
| 824 |
currency_token = ''
|
| 825 |
-
if
|
| 826 |
-
result['currency'] =
|
| 827 |
-
elif
|
| 828 |
-
|
| 829 |
-
|
| 830 |
-
|
| 831 |
-
'rmb': 'RMB', 'usd': 'USD', 'eur': 'EUR', 'gbp': 'GBP', 'chf': 'CHF', 'jpy': 'JPY'
|
| 832 |
-
}
|
| 833 |
if currency_token in currency_map:
|
| 834 |
result['currency'] = currency_map[currency_token]
|
| 835 |
|
|
@@ -860,17 +860,21 @@ class InfoExtractor:
|
|
| 860 |
result["type"] = found_type
|
| 861 |
result["description"] = found_type_keyword # 使用找到的最匹配的关键词作为描述
|
| 862 |
|
| 863 |
-
chinese_money_mapping = {
|
| 864 |
-
'一千': 1000, '两千': 2000, '三千': 3000, '四千': 4000, '五千': 5000,
|
| 865 |
-
'六千': 6000, '七千': 7000, '八千': 8000, '九千': 9000,
|
| 866 |
-
'一万': 10000, '两万': 20000, '三万': 30000, '四万': 40000, '五万': 50000
|
| 867 |
-
}
|
| 868 |
-
|
| 869 |
if not result.get("amount"):
|
| 870 |
-
|
| 871 |
-
|
| 872 |
-
|
| 873 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 874 |
break
|
| 875 |
|
| 876 |
return result
|
|
|
|
| 756 |
def _extract_budget_from_tokens(self, tokens: list) -> dict:
|
| 757 |
|
| 758 |
result = {}
|
| 759 |
+
text = "".join(tokens).lower().strip()
|
| 760 |
|
| 761 |
# --- 1. 统一提取金额和货币 ---
|
| 762 |
# 按优先级排列正则表达式,越精确的模式越靠前
|
|
|
|
| 788 |
]
|
| 789 |
|
| 790 |
amount_found = False
|
| 791 |
+
for p in patterns:
|
| 792 |
+
match = re.search(p['regex'], text)
|
| 793 |
if match:
|
| 794 |
# 检查是否是纯数字模式,是的话需要上下文
|
| 795 |
+
if p.get('context_needed', False):
|
| 796 |
budget_indicators = ['预算', '花费', '费用', '成本', '开销', '支出', 'budget', 'cost']
|
| 797 |
if not any(indicator in text for indicator in budget_indicators):
|
| 798 |
continue # 如果没有上下文,则跳过纯数字匹配
|
|
|
|
| 800 |
groups = match.groups()
|
| 801 |
|
| 802 |
# 提取金额
|
| 803 |
+
amount_group_index = p['groups']['amount'] - 1
|
| 804 |
+
amount = float(groups[amount_group_index])
|
| 805 |
|
| 806 |
# 确定乘数
|
| 807 |
multiplier = 1
|
| 808 |
multiplier_token = ''
|
| 809 |
+
if 'multiplier' in p['groups']:
|
| 810 |
+
multiplier_group_index = groups[p['groups']['multiplier']-1]
|
| 811 |
+
if multiplier_group_index < len(groups) and groups[multiplier_group_index]:
|
| 812 |
+
multiplier_token = groups[multiplier_group_index]
|
| 813 |
if '十' in multiplier_token:
|
| 814 |
multiplier = 10
|
| 815 |
elif '百' in multiplier_token or 'hundred' in multiplier_token:
|
|
|
|
| 824 |
|
| 825 |
# 确定货币
|
| 826 |
currency_token = ''
|
| 827 |
+
if p.get('currency'):
|
| 828 |
+
result['currency'] = p['currency']
|
| 829 |
+
elif 'currency_str' in p['groups']:
|
| 830 |
+
currency_group_index = p['groups']['currency_str'] - 1
|
| 831 |
+
currency_token = groups[currency_group_index]
|
| 832 |
+
currency_map = {'rmb': 'RMB', 'usd': 'USD', 'eur': 'EUR', 'gbp': 'GBP', 'chf': 'CHF', 'jpy': 'JPY'}
|
|
|
|
|
|
|
| 833 |
if currency_token in currency_map:
|
| 834 |
result['currency'] = currency_map[currency_token]
|
| 835 |
|
|
|
|
| 860 |
result["type"] = found_type
|
| 861 |
result["description"] = found_type_keyword # 使用找到的最匹配的关键词作为描述
|
| 862 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 863 |
if not result.get("amount"):
|
| 864 |
+
chinese_money_mapping = {
|
| 865 |
+
'一万': 10000, '两万': 20000, '三万': 30000, '四万': 40000, '五万': 50000,
|
| 866 |
+
'一千': 1000, '两千': 2000, '三千': 3000, '四千': 4000, '五千': 5000,
|
| 867 |
+
'六千': 6000, '七千': 7000, '八千': 8000, '九千': 9000,
|
| 868 |
+
'一百': 100, '二百': 200, '三百': 300, '四百': 400, '五百': 500,
|
| 869 |
+
'六百': 600, '七百': 700, '八百': 800, '九百': 900,
|
| 870 |
+
'十': 10, '二十': 20, '三十': 30, '四十': 40, '五十': 50,
|
| 871 |
+
'六十': 60, '七十': 70, '八十': 80, '九十': 90
|
| 872 |
+
}
|
| 873 |
+
|
| 874 |
+
sorted_keys = sorted(chinese_money_mapping.keys(), key=len, reverse=True)
|
| 875 |
+
for name in sorted_keys:
|
| 876 |
+
if name in text:
|
| 877 |
+
result['amount'] = chinese_money_mapping[name]
|
| 878 |
break
|
| 879 |
|
| 880 |
return result
|