"""Agent that suggests new knowledge-base terms from the current editor text.

Sends the existing knowledge-base term list plus the tail of the editor content
to an LLM (via ModelHandler) and parses the returned JSON array into rows for a
Gradio Dataframe component.
"""
import gradio as gr
import pandas as pd
import json
from model_handler import ModelHandler
from config import LING_1T

def _format_kb_for_prompt(df: pd.DataFrame) -> str:
    """Formats the knowledge base DataFrame into a simple list for the prompt."""
    if df is None or df.empty:
        return "无。"
    terms = [f"- {row['Term']}" for _, row in df.iterrows()]
    return "\n".join(terms)

def suggest_new_kb_terms_agent(kb_df: pd.DataFrame, editor_content: str):
    """
    Agent to extract new terms from the text to recommend for the knowledge base using a real LLM.
    """
    
    if editor_content is None or len(editor_content.strip()) < 50:
        print("[Agent] Editor content too short, skipping KB suggestion.")
        # Return empty data and keep components hidden
        return gr.update(value=[], visible=False), gr.update(visible=False)

    try:
        # 1. Prepare Prompts
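        # System prompt (in Chinese): instructs the model to act as an entity-extraction
        # bot that returns at most 5 new terms as a bare JSON array of {Term, Description}
        # objects, skipping anything already present in the existing knowledge base.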
        system_prompt = (
            "你是一个实体提取机器人。你的任务是从给定文本中识别出新的、重要的、值得记录的专有名词(如人名、地名、组织、物品)或核心概念,并为它们提供一句简洁的描述。\n"
            "你的回答必须是一个遵循以下规则的 JSON 数组:\n"
            "1. 数组中的每个元素都是一个对象。\n"
            "2. 每个对象必须包含两个键:`Term` (词条名) 和 `Description` (描述)。\n"
            "3. 不要提取已经存在于'现有知识库'中的词条。\n"
            "4. 最多返回 5 个最重要的词条。\n"
            "5. 不要返回除了这个 JSON 数组之外的任何其他文本、解释或代码块标记。"
        )
        
        kb_str = _format_kb_for_prompt(kb_df)
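        # User prompt (in Chinese): provides the existing KB term list and the last
        # ~4000 characters of the editor text, then asks for the JSON array of new terms.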
        user_prompt = (
            f"### 现有知识库\n{kb_str}\n\n"
            f"### 当前文本\n{editor_content[-4000:]}\n\n"
            "### 指令\n请根据'当前文本',分析并提取出新的知识库词条,并返回 JSON 数组。"
        )
        
        # 2. Call LLM
        model_handler = ModelHandler()
        response_generator = model_handler.generate_code(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            model_choice=LING_1T 
        )
        full_response = "".join(chunk for chunk in response_generator)
        
        # 3. Parse JSON and format for DataFrame
        print("[Agent] Raw LLM response for KB suggestions:", repr(full_response))

        # Strip a Markdown code fence if the model wrapped its JSON output in one,
        # handling both "```json" and bare "```" fences and a possibly missing
        # closing fence.
        full_response = full_response.strip()
        if full_response.startswith("```"):
            full_response = full_response.removeprefix("```json").removeprefix("```")
            full_response = full_response.removesuffix("```").strip()

        # Expected shape (per the system prompt): a JSON array of objects,
        # e.g. [{"Term": "...", "Description": "..."}, ...]
        suggested_terms = json.loads(full_response)
        
        # Convert list of dicts to list of lists for Gradio Dataframe
        df_data = [[item.get("Term", ""), item.get("Description", "")] for item in suggested_terms]
        
        print("【收到的完整上下文】")
        print("suggested_terms:", repr(suggested_terms))
        
        # Make components visible and return data
        return gr.update(value=df_data, visible=True), gr.update(visible=True)

    except json.JSONDecodeError:
        print(f"[Agent] Error: Failed to decode JSON from LLM response for KB: {full_response}")
        return gr.update(visible=False), gr.update(visible=False)
    except Exception as e:
        print(f"[Agent] Error suggesting new KB terms: {e}")
        return gr.update(visible=False), gr.update(visible=False)
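
# --- Usage sketch (illustrative; not part of the original module) ---
# A minimal example of how this agent could be wired into a Gradio Blocks UI.
# The component names (editor, kb_table, suggestions_df, accept_btn) and labels
# below are assumptions for illustration; the real app may lay out its UI
# differently and trigger the agent from a different event.
if __name__ == "__main__":
    with gr.Blocks() as demo:
        editor = gr.Textbox(label="Editor", lines=20)
        kb_table = gr.Dataframe(headers=["Term", "Description"], label="Knowledge Base")
        suggestions_df = gr.Dataframe(
            headers=["Term", "Description"], label="Suggested Terms", visible=False
        )
        accept_btn = gr.Button("Add to Knowledge Base", visible=False)
        suggest_btn = gr.Button("Suggest New Terms")

        # The agent returns two gr.update() objects: one for the suggestions table
        # and one for a second component (here, the accept button), so both are
        # listed as outputs in the same order.
        suggest_btn.click(
            suggest_new_kb_terms_agent,
            inputs=[kb_table, editor],
            outputs=[suggestions_df, accept_btn],
        )

    demo.launch()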