github-actions[bot] committed · 8ad3d05
1 Parent(s): 3f86ed0
Auto-sync from demo at Fri Nov 7 11:21:53 UTC 2025
graphgen/models/extractor/schema_guided_extractor.py
CHANGED
@@ -60,7 +60,9 @@ class SchemaGuidedExtractor(BaseExtractor):
         return prompt

     async def extract(self, chunk: dict) -> dict:
-
+        _chunk_id = list(chunk.keys())[0]
+        text = chunk[_chunk_id].get("content", "")
+
         prompt = self.build_prompt(text)
         response = await self.llm_client.generate_answer(prompt)
         try:
@@ -74,13 +76,20 @@ class SchemaGuidedExtractor(BaseExtractor):
                 return {}
             main_keys_info = {key: extracted_info[key] for key in self.required_keys}
             logger.debug("Extracted info: %s", extracted_info)
-
+
+            # add chunk metadata
+            extracted_info["_chunk_id"] = _chunk_id
+
+            return {
+                compute_dict_hash(main_keys_info, prefix="extract-"): extracted_info
+            }
         except json.JSONDecodeError:
             logger.error("Failed to parse extraction response: %s", response)
             return {}

+    @staticmethod
     async def merge_extractions(
-
+        extraction_list: List[Dict[str, dict]]
     ) -> Dict[str, dict]:
         """
         Merge multiple extraction results based on their hashes.
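Taken together, extract() now receives one entry of the chunk mapping produced by chunk_documents and returns its parsed result keyed by a hash of the required fields, tagged with the originating chunk id. The body of merge_extractions is not part of this diff; the sketch below is only a plausible reading of its docstring, with make_hash standing in for compute_dict_hash (both the helper name and the merge policy are assumptions, not the repository's code).

# Minimal sketch of the data shapes this commit implies (placeholder values).
import hashlib
import json
from typing import Dict, List


def make_hash(data: dict, prefix: str = "") -> str:
    # Stable hash over the dict's JSON form; stands in for compute_dict_hash.
    payload = json.dumps(data, sort_keys=True, ensure_ascii=False)
    return prefix + hashlib.md5(payload.encode("utf-8")).hexdigest()


# Input: one entry of the mapping built by chunk_documents (see the next file).
chunk = {
    "chunk-<hash>": {
        "content": "Alice founded Acme in 2001.",
        "type": "text",
        "_full_docs_id": "doc-<hash>",
    }
}

# Output of extract(): the parsed LLM answer keyed by a hash of its required keys,
# with the chunk it came from recorded as metadata.
extraction = {
    "extract-<hash>": {"name": "Alice", "org": "Acme", "_chunk_id": "chunk-<hash>"}
}


def merge_extractions(extraction_list: List[Dict[str, dict]]) -> Dict[str, dict]:
    # Hypothetical merge: results sharing the same required-key hash collapse
    # into one record, and every chunk id that produced them is collected.
    merged: Dict[str, dict] = {}
    for result in extraction_list:
        for key, info in result.items():
            if key not in merged:
                merged[key] = dict(info)
                merged[key]["_chunk_ids"] = [info.get("_chunk_id")]
            else:
                merged[key]["_chunk_ids"].append(info.get("_chunk_id"))
    return merged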
graphgen/operators/split/split_chunks.py
CHANGED
@@ -64,7 +64,7 @@ async def chunk_documents(
             compute_content_hash(txt, prefix="chunk-"): {
                 "content": txt,
                 "type": "text",
-                "
+                "_full_docs_id": doc_key,
                 "length": len(tokenizer_instance.encode(txt))
                 if tokenizer_instance
                 else len(txt),
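The added _full_docs_id field gives every chunk a back-reference to the document it was split from, pairing with the _chunk_id the extractor now records. An illustrative entry, with placeholder hash, id, and length values:

# Shape of one chunk entry after this change (illustrative values only).
chunk_entry = {
    "chunk-<hash>": {
        "content": "First slice of the source document ...",
        "type": "text",
        "_full_docs_id": "<doc_key of the full document>",  # new back-reference
        # token count when a tokenizer is configured, otherwise character count
        "length": 187,
    }
}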