Spaces:
Running
Running
github-actions[bot]
commited on
Commit
·
2bc66d4
1
Parent(s):
dee1edd
Auto-sync from demo at Wed Nov 26 11:48:49 UTC 2025
Browse files
graphgen/graphgen.py
CHANGED
|
@@ -9,7 +9,6 @@ from graphgen.bases.datatypes import Chunk
|
|
| 9 |
from graphgen.models import (
|
| 10 |
JsonKVStorage,
|
| 11 |
JsonListStorage,
|
| 12 |
-
MetaJsonKVStorage,
|
| 13 |
NetworkXStorage,
|
| 14 |
OpenAIClient,
|
| 15 |
Tokenizer,
|
|
@@ -54,9 +53,6 @@ class GraphGen:
|
|
| 54 |
)
|
| 55 |
self.trainee_llm_client: BaseLLMWrapper = trainee_llm_client
|
| 56 |
|
| 57 |
-
self.meta_storage: MetaJsonKVStorage = MetaJsonKVStorage(
|
| 58 |
-
self.working_dir, namespace="_meta"
|
| 59 |
-
)
|
| 60 |
self.full_docs_storage: JsonKVStorage = JsonKVStorage(
|
| 61 |
self.working_dir, namespace="full_docs"
|
| 62 |
)
|
|
@@ -98,11 +94,7 @@ class GraphGen:
|
|
| 98 |
batch = {}
|
| 99 |
for doc in doc_stream:
|
| 100 |
doc_id = compute_mm_hash(doc, prefix="doc-")
|
| 101 |
-
|
| 102 |
batch[doc_id] = doc
|
| 103 |
-
if batch:
|
| 104 |
-
self.full_docs_storage.upsert(batch)
|
| 105 |
-
self.full_docs_storage.index_done_callback()
|
| 106 |
|
| 107 |
# TODO: configurable whether to use coreference resolution
|
| 108 |
|
|
@@ -120,7 +112,7 @@ class GraphGen:
|
|
| 120 |
chunk documents into smaller pieces from full_docs_storage if not already present
|
| 121 |
"""
|
| 122 |
|
| 123 |
-
new_docs = self.
|
| 124 |
if len(new_docs) == 0:
|
| 125 |
logger.warning("All documents are already in the storage")
|
| 126 |
return
|
|
@@ -143,16 +135,15 @@ class GraphGen:
|
|
| 143 |
|
| 144 |
self.chunks_storage.upsert(inserting_chunks)
|
| 145 |
self.chunks_storage.index_done_callback()
|
| 146 |
-
self.meta_storage.mark_done(self.full_docs_storage)
|
| 147 |
-
self.meta_storage.index_done_callback()
|
| 148 |
|
| 149 |
@async_to_sync_method
|
| 150 |
async def build_kg(self):
|
| 151 |
"""
|
| 152 |
build knowledge graph from text chunks
|
| 153 |
"""
|
| 154 |
-
# Step 1: get new chunks
|
| 155 |
-
inserting_chunks = self.
|
|
|
|
| 156 |
if len(inserting_chunks) == 0:
|
| 157 |
logger.warning("All chunks are already in the storage")
|
| 158 |
return
|
|
@@ -169,10 +160,8 @@ class GraphGen:
|
|
| 169 |
logger.warning("No entities or relations extracted from text chunks")
|
| 170 |
return
|
| 171 |
|
| 172 |
-
# Step 3:
|
| 173 |
self.graph_storage.index_done_callback()
|
| 174 |
-
self.meta_storage.mark_done(self.chunks_storage)
|
| 175 |
-
self.meta_storage.index_done_callback()
|
| 176 |
|
| 177 |
return _add_entities_and_relations
|
| 178 |
|
|
@@ -180,7 +169,7 @@ class GraphGen:
|
|
| 180 |
async def search(self, search_config: Dict):
|
| 181 |
logger.info("[Search] %s ...", ", ".join(search_config["data_sources"]))
|
| 182 |
|
| 183 |
-
seeds = self.
|
| 184 |
if len(seeds) == 0:
|
| 185 |
logger.warning("All documents are already been searched")
|
| 186 |
return
|
|
@@ -198,8 +187,6 @@ class GraphGen:
|
|
| 198 |
return
|
| 199 |
self.search_storage.upsert(search_results)
|
| 200 |
self.search_storage.index_done_callback()
|
| 201 |
-
self.meta_storage.mark_done(self.full_docs_storage)
|
| 202 |
-
self.meta_storage.index_done_callback()
|
| 203 |
|
| 204 |
@async_to_sync_method
|
| 205 |
async def quiz_and_judge(self, quiz_and_judge_config: Dict):
|
|
@@ -268,8 +255,6 @@ class GraphGen:
|
|
| 268 |
|
| 269 |
self.extract_storage.upsert(results)
|
| 270 |
self.extract_storage.index_done_callback()
|
| 271 |
-
self.meta_storage.mark_done(self.chunks_storage)
|
| 272 |
-
self.meta_storage.index_done_callback()
|
| 273 |
|
| 274 |
@async_to_sync_method
|
| 275 |
async def generate(self, generate_config: Dict):
|
|
|
|
| 9 |
from graphgen.models import (
|
| 10 |
JsonKVStorage,
|
| 11 |
JsonListStorage,
|
|
|
|
| 12 |
NetworkXStorage,
|
| 13 |
OpenAIClient,
|
| 14 |
Tokenizer,
|
|
|
|
| 53 |
)
|
| 54 |
self.trainee_llm_client: BaseLLMWrapper = trainee_llm_client
|
| 55 |
|
|
|
|
|
|
|
|
|
|
| 56 |
self.full_docs_storage: JsonKVStorage = JsonKVStorage(
|
| 57 |
self.working_dir, namespace="full_docs"
|
| 58 |
)
|
|
|
|
| 94 |
batch = {}
|
| 95 |
for doc in doc_stream:
|
| 96 |
doc_id = compute_mm_hash(doc, prefix="doc-")
|
|
|
|
| 97 |
batch[doc_id] = doc
|
|
|
|
|
|
|
|
|
|
| 98 |
|
| 99 |
# TODO: configurable whether to use coreference resolution
|
| 100 |
|
|
|
|
| 112 |
chunk documents into smaller pieces from full_docs_storage if not already present
|
| 113 |
"""
|
| 114 |
|
| 115 |
+
new_docs = self.full_docs_storage.get_all()
|
| 116 |
if len(new_docs) == 0:
|
| 117 |
logger.warning("All documents are already in the storage")
|
| 118 |
return
|
|
|
|
| 135 |
|
| 136 |
self.chunks_storage.upsert(inserting_chunks)
|
| 137 |
self.chunks_storage.index_done_callback()
|
|
|
|
|
|
|
| 138 |
|
| 139 |
@async_to_sync_method
|
| 140 |
async def build_kg(self):
|
| 141 |
"""
|
| 142 |
build knowledge graph from text chunks
|
| 143 |
"""
|
| 144 |
+
# Step 1: get new chunks
|
| 145 |
+
inserting_chunks = self.chunks_storage.get_all()
|
| 146 |
+
|
| 147 |
if len(inserting_chunks) == 0:
|
| 148 |
logger.warning("All chunks are already in the storage")
|
| 149 |
return
|
|
|
|
| 160 |
logger.warning("No entities or relations extracted from text chunks")
|
| 161 |
return
|
| 162 |
|
| 163 |
+
# Step 3: upsert new entities and relations to the graph storage
|
| 164 |
self.graph_storage.index_done_callback()
|
|
|
|
|
|
|
| 165 |
|
| 166 |
return _add_entities_and_relations
|
| 167 |
|
|
|
|
| 169 |
async def search(self, search_config: Dict):
|
| 170 |
logger.info("[Search] %s ...", ", ".join(search_config["data_sources"]))
|
| 171 |
|
| 172 |
+
seeds = self.full_docs_storage.get_all()
|
| 173 |
if len(seeds) == 0:
|
| 174 |
logger.warning("All documents are already been searched")
|
| 175 |
return
|
|
|
|
| 187 |
return
|
| 188 |
self.search_storage.upsert(search_results)
|
| 189 |
self.search_storage.index_done_callback()
|
|
|
|
|
|
|
| 190 |
|
| 191 |
@async_to_sync_method
|
| 192 |
async def quiz_and_judge(self, quiz_and_judge_config: Dict):
|
|
|
|
| 255 |
|
| 256 |
self.extract_storage.upsert(results)
|
| 257 |
self.extract_storage.index_done_callback()
|
|
|
|
|
|
|
| 258 |
|
| 259 |
@async_to_sync_method
|
| 260 |
async def generate(self, generate_config: Dict):
|
graphgen/models/__init__.py
CHANGED
|
@@ -31,5 +31,5 @@ from .searcher.kg.wiki_search import WikiSearch
|
|
| 31 |
from .searcher.web.bing_search import BingSearch
|
| 32 |
from .searcher.web.google_search import GoogleSearch
|
| 33 |
from .splitter import ChineseRecursiveTextSplitter, RecursiveCharacterSplitter
|
| 34 |
-
from .storage import JsonKVStorage, JsonListStorage,
|
| 35 |
from .tokenizer import Tokenizer
|
|
|
|
| 31 |
from .searcher.web.bing_search import BingSearch
|
| 32 |
from .searcher.web.google_search import GoogleSearch
|
| 33 |
from .splitter import ChineseRecursiveTextSplitter, RecursiveCharacterSplitter
|
| 34 |
+
from .storage import JsonKVStorage, JsonListStorage, NetworkXStorage
|
| 35 |
from .tokenizer import Tokenizer
|
graphgen/models/storage/__init__.py
CHANGED
|
@@ -1,2 +1,2 @@
|
|
| 1 |
-
from .json_storage import JsonKVStorage, JsonListStorage
|
| 2 |
from .networkx_storage import NetworkXStorage
|
|
|
|
| 1 |
+
from .json_storage import JsonKVStorage, JsonListStorage
|
| 2 |
from .networkx_storage import NetworkXStorage
|
graphgen/models/storage/json_storage.py
CHANGED
|
@@ -92,23 +92,3 @@ class JsonListStorage(BaseListStorage):
|
|
| 92 |
|
| 93 |
def drop(self):
|
| 94 |
self._data = []
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
@dataclass
|
| 98 |
-
class MetaJsonKVStorage(JsonKVStorage):
|
| 99 |
-
def __post_init__(self):
|
| 100 |
-
self._file_name = os.path.join(self.working_dir, f"{self.namespace}.json")
|
| 101 |
-
self._data = load_json(self._file_name) or {}
|
| 102 |
-
logger.info("Load KV %s with %d data", self.namespace, len(self._data))
|
| 103 |
-
|
| 104 |
-
def get_new_data(self, storage_instance: "JsonKVStorage") -> dict:
|
| 105 |
-
new_data = {}
|
| 106 |
-
for k, v in storage_instance.data.items():
|
| 107 |
-
if k not in self._data:
|
| 108 |
-
new_data[k] = v
|
| 109 |
-
return new_data
|
| 110 |
-
|
| 111 |
-
def mark_done(self, storage_instance: "JsonKVStorage"):
|
| 112 |
-
new_data = self.get_new_data(storage_instance)
|
| 113 |
-
if new_data:
|
| 114 |
-
self._data.update(new_data)
|
|
|
|
| 92 |
|
| 93 |
def drop(self):
|
| 94 |
self._data = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|