github-actions[bot] committed on
Commit
8ad3d05
·
1 Parent(s): 3f86ed0

Auto-sync from demo at Fri Nov 7 11:21:53 UTC 2025

Browse files
graphgen/models/extractor/schema_guided_extractor.py CHANGED
@@ -60,7 +60,9 @@ class SchemaGuidedExtractor(BaseExtractor):
60
  return prompt
61
 
62
  async def extract(self, chunk: dict) -> dict:
63
- text = chunk.get("text", "")
 
 
64
  prompt = self.build_prompt(text)
65
  response = await self.llm_client.generate_answer(prompt)
66
  try:
@@ -74,13 +76,20 @@ class SchemaGuidedExtractor(BaseExtractor):
74
  return {}
75
  main_keys_info = {key: extracted_info[key] for key in self.required_keys}
76
  logger.debug("Extracted info: %s", extracted_info)
77
- return {compute_dict_hash(main_keys_info, prefix="extract"): extracted_info}
 
 
 
 
 
 
78
  except json.JSONDecodeError:
79
  logger.error("Failed to parse extraction response: %s", response)
80
  return {}
81
 
 
82
  async def merge_extractions(
83
- self, extraction_list: List[Dict[str, dict]]
84
  ) -> Dict[str, dict]:
85
  """
86
  Merge multiple extraction results based on their hashes.
 
60
  return prompt
61
 
62
  async def extract(self, chunk: dict) -> dict:
63
+ _chunk_id = list(chunk.keys())[0]
64
+ text = chunk[_chunk_id].get("content", "")
65
+
66
  prompt = self.build_prompt(text)
67
  response = await self.llm_client.generate_answer(prompt)
68
  try:
 
76
  return {}
77
  main_keys_info = {key: extracted_info[key] for key in self.required_keys}
78
  logger.debug("Extracted info: %s", extracted_info)
79
+
80
+ # add chunk metadata
81
+ extracted_info["_chunk_id"] = _chunk_id
82
+
83
+ return {
84
+ compute_dict_hash(main_keys_info, prefix="extract-"): extracted_info
85
+ }
86
  except json.JSONDecodeError:
87
  logger.error("Failed to parse extraction response: %s", response)
88
  return {}
89
 
90
+ @staticmethod
91
  async def merge_extractions(
92
+ extraction_list: List[Dict[str, dict]]
93
  ) -> Dict[str, dict]:
94
  """
95
  Merge multiple extraction results based on their hashes.
graphgen/operators/split/split_chunks.py CHANGED
@@ -64,7 +64,7 @@ async def chunk_documents(
64
  compute_content_hash(txt, prefix="chunk-"): {
65
  "content": txt,
66
  "type": "text",
67
- "full_doc_id": doc_key,
68
  "length": len(tokenizer_instance.encode(txt))
69
  if tokenizer_instance
70
  else len(txt),
 
64
  compute_content_hash(txt, prefix="chunk-"): {
65
  "content": txt,
66
  "type": "text",
67
+ "_full_docs_id": doc_key,
68
  "length": len(tokenizer_instance.encode(txt))
69
  if tokenizer_instance
70
  else len(txt),