github-actions[bot] committed · 8ad3d05
1 Parent(s): 3f86ed0
Auto-sync from demo at Fri Nov 7 11:21:53 UTC 2025
graphgen/models/extractor/schema_guided_extractor.py
CHANGED
@@ -60,7 +60,9 @@ class SchemaGuidedExtractor(BaseExtractor):
         return prompt

     async def extract(self, chunk: dict) -> dict:
-
+        _chunk_id = list(chunk.keys())[0]
+        text = chunk[_chunk_id].get("content", "")
+
         prompt = self.build_prompt(text)
         response = await self.llm_client.generate_answer(prompt)
         try:
@@ -74,13 +76,20 @@ class SchemaGuidedExtractor(BaseExtractor):
                 return {}
             main_keys_info = {key: extracted_info[key] for key in self.required_keys}
             logger.debug("Extracted info: %s", extracted_info)
-
+
+            # add chunk metadata
+            extracted_info["_chunk_id"] = _chunk_id
+
+            return {
+                compute_dict_hash(main_keys_info, prefix="extract-"): extracted_info
+            }
         except json.JSONDecodeError:
             logger.error("Failed to parse extraction response: %s", response)
             return {}

+    @staticmethod
     async def merge_extractions(
-
+        extraction_list: List[Dict[str, dict]]
     ) -> Dict[str, dict]:
         """
         Merge multiple extraction results based on their hashes.
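Taken together, extract() now receives one entry of the chunk mapping produced by chunk_documents and returns its parsed result keyed by a hash of the required fields, tagged with the originating chunk id. The body of merge_extractions is not part of this diff; the sketch below is only a plausible reading of its docstring, with make_hash standing in for compute_dict_hash (both the helper name and the merge policy are assumptions, not the repository's code).

# Minimal sketch of the data shapes this commit implies (placeholder values).
import hashlib
import json
from typing import Dict, List


def make_hash(data: dict, prefix: str = "") -> str:
    # Stable hash over the dict's JSON form; stands in for compute_dict_hash.
    payload = json.dumps(data, sort_keys=True, ensure_ascii=False)
    return prefix + hashlib.md5(payload.encode("utf-8")).hexdigest()


# Input: one entry of the mapping built by chunk_documents (see the next file).
chunk = {
    "chunk-<hash>": {
        "content": "Alice founded Acme in 2001.",
        "type": "text",
        "_full_docs_id": "doc-<hash>",
    }
}

# Output of extract(): the parsed LLM answer keyed by a hash of its required keys,
# with the chunk it came from recorded as metadata.
extraction = {
    "extract-<hash>": {"name": "Alice", "org": "Acme", "_chunk_id": "chunk-<hash>"}
}


def merge_extractions(extraction_list: List[Dict[str, dict]]) -> Dict[str, dict]:
    # Hypothetical merge: results sharing the same required-key hash collapse
    # into one record, and every chunk id that produced them is collected.
    merged: Dict[str, dict] = {}
    for result in extraction_list:
        for key, info in result.items():
            if key not in merged:
                merged[key] = dict(info)
                merged[key]["_chunk_ids"] = [info.get("_chunk_id")]
            else:
                merged[key]["_chunk_ids"].append(info.get("_chunk_id"))
    return merged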
graphgen/operators/split/split_chunks.py
CHANGED
@@ -64,7 +64,7 @@ async def chunk_documents(
             compute_content_hash(txt, prefix="chunk-"): {
                 "content": txt,
                 "type": "text",
-                "
+                "_full_docs_id": doc_key,
                 "length": len(tokenizer_instance.encode(txt))
                 if tokenizer_instance
                 else len(txt),
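The added _full_docs_id field gives every chunk a back-reference to the document it was split from, pairing with the _chunk_id the extractor now records. An illustrative entry, with placeholder hash, id, and length values:

# Shape of one chunk entry after this change (illustrative values only).
chunk_entry = {
    "chunk-<hash>": {
        "content": "First slice of the source document ...",
        "type": "text",
        "_full_docs_id": "<doc_key of the full document>",  # new back-reference
        # token count when a tokenizer is configured, otherwise character count
        "length": 187,
    }
}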