github-actions[bot] committed on
Commit
2bc66d4
·
1 Parent(s): dee1edd

Auto-sync from demo at Wed Nov 26 11:48:49 UTC 2025

Browse files
graphgen/graphgen.py CHANGED
@@ -9,7 +9,6 @@ from graphgen.bases.datatypes import Chunk
9
  from graphgen.models import (
10
  JsonKVStorage,
11
  JsonListStorage,
12
- MetaJsonKVStorage,
13
  NetworkXStorage,
14
  OpenAIClient,
15
  Tokenizer,
@@ -54,9 +53,6 @@ class GraphGen:
54
  )
55
  self.trainee_llm_client: BaseLLMWrapper = trainee_llm_client
56
 
57
- self.meta_storage: MetaJsonKVStorage = MetaJsonKVStorage(
58
- self.working_dir, namespace="_meta"
59
- )
60
  self.full_docs_storage: JsonKVStorage = JsonKVStorage(
61
  self.working_dir, namespace="full_docs"
62
  )
@@ -98,11 +94,7 @@ class GraphGen:
98
  batch = {}
99
  for doc in doc_stream:
100
  doc_id = compute_mm_hash(doc, prefix="doc-")
101
-
102
  batch[doc_id] = doc
103
- if batch:
104
- self.full_docs_storage.upsert(batch)
105
- self.full_docs_storage.index_done_callback()
106
 
107
  # TODO: configurable whether to use coreference resolution
108
 
@@ -120,7 +112,7 @@ class GraphGen:
120
  chunk documents into smaller pieces from full_docs_storage if not already present
121
  """
122
 
123
- new_docs = self.meta_storage.get_new_data(self.full_docs_storage)
124
  if len(new_docs) == 0:
125
  logger.warning("All documents are already in the storage")
126
  return
@@ -143,16 +135,15 @@ class GraphGen:
143
 
144
  self.chunks_storage.upsert(inserting_chunks)
145
  self.chunks_storage.index_done_callback()
146
- self.meta_storage.mark_done(self.full_docs_storage)
147
- self.meta_storage.index_done_callback()
148
 
149
  @async_to_sync_method
150
  async def build_kg(self):
151
  """
152
  build knowledge graph from text chunks
153
  """
154
- # Step 1: get new chunks according to meta and chunks storage
155
- inserting_chunks = self.meta_storage.get_new_data(self.chunks_storage)
 
156
  if len(inserting_chunks) == 0:
157
  logger.warning("All chunks are already in the storage")
158
  return
@@ -169,10 +160,8 @@ class GraphGen:
169
  logger.warning("No entities or relations extracted from text chunks")
170
  return
171
 
172
- # Step 3: mark meta
173
  self.graph_storage.index_done_callback()
174
- self.meta_storage.mark_done(self.chunks_storage)
175
- self.meta_storage.index_done_callback()
176
 
177
  return _add_entities_and_relations
178
 
@@ -180,7 +169,7 @@ class GraphGen:
180
  async def search(self, search_config: Dict):
181
  logger.info("[Search] %s ...", ", ".join(search_config["data_sources"]))
182
 
183
- seeds = self.meta_storage.get_new_data(self.full_docs_storage)
184
  if len(seeds) == 0:
185
  logger.warning("All documents are already been searched")
186
  return
@@ -198,8 +187,6 @@ class GraphGen:
198
  return
199
  self.search_storage.upsert(search_results)
200
  self.search_storage.index_done_callback()
201
- self.meta_storage.mark_done(self.full_docs_storage)
202
- self.meta_storage.index_done_callback()
203
 
204
  @async_to_sync_method
205
  async def quiz_and_judge(self, quiz_and_judge_config: Dict):
@@ -268,8 +255,6 @@ class GraphGen:
268
 
269
  self.extract_storage.upsert(results)
270
  self.extract_storage.index_done_callback()
271
- self.meta_storage.mark_done(self.chunks_storage)
272
- self.meta_storage.index_done_callback()
273
 
274
  @async_to_sync_method
275
  async def generate(self, generate_config: Dict):
 
9
  from graphgen.models import (
10
  JsonKVStorage,
11
  JsonListStorage,
 
12
  NetworkXStorage,
13
  OpenAIClient,
14
  Tokenizer,
 
53
  )
54
  self.trainee_llm_client: BaseLLMWrapper = trainee_llm_client
55
 
 
 
 
56
  self.full_docs_storage: JsonKVStorage = JsonKVStorage(
57
  self.working_dir, namespace="full_docs"
58
  )
 
94
  batch = {}
95
  for doc in doc_stream:
96
  doc_id = compute_mm_hash(doc, prefix="doc-")
 
97
  batch[doc_id] = doc
 
 
 
98
 
99
  # TODO: configurable whether to use coreference resolution
100
 
 
112
  chunk documents into smaller pieces from full_docs_storage if not already present
113
  """
114
 
115
+ new_docs = self.full_docs_storage.get_all()
116
  if len(new_docs) == 0:
117
  logger.warning("All documents are already in the storage")
118
  return
 
135
 
136
  self.chunks_storage.upsert(inserting_chunks)
137
  self.chunks_storage.index_done_callback()
 
 
138
 
139
  @async_to_sync_method
140
  async def build_kg(self):
141
  """
142
  build knowledge graph from text chunks
143
  """
144
+ # Step 1: get new chunks
145
+ inserting_chunks = self.chunks_storage.get_all()
146
+
147
  if len(inserting_chunks) == 0:
148
  logger.warning("All chunks are already in the storage")
149
  return
 
160
  logger.warning("No entities or relations extracted from text chunks")
161
  return
162
 
163
+ # Step 3: upsert new entities and relations to the graph storage
164
  self.graph_storage.index_done_callback()
 
 
165
 
166
  return _add_entities_and_relations
167
 
 
169
  async def search(self, search_config: Dict):
170
  logger.info("[Search] %s ...", ", ".join(search_config["data_sources"]))
171
 
172
+ seeds = self.full_docs_storage.get_all()
173
  if len(seeds) == 0:
174
  logger.warning("All documents are already been searched")
175
  return
 
187
  return
188
  self.search_storage.upsert(search_results)
189
  self.search_storage.index_done_callback()
 
 
190
 
191
  @async_to_sync_method
192
  async def quiz_and_judge(self, quiz_and_judge_config: Dict):
 
255
 
256
  self.extract_storage.upsert(results)
257
  self.extract_storage.index_done_callback()
 
 
258
 
259
  @async_to_sync_method
260
  async def generate(self, generate_config: Dict):
graphgen/models/__init__.py CHANGED
@@ -31,5 +31,5 @@ from .searcher.kg.wiki_search import WikiSearch
31
  from .searcher.web.bing_search import BingSearch
32
  from .searcher.web.google_search import GoogleSearch
33
  from .splitter import ChineseRecursiveTextSplitter, RecursiveCharacterSplitter
34
- from .storage import JsonKVStorage, JsonListStorage, MetaJsonKVStorage, NetworkXStorage
35
  from .tokenizer import Tokenizer
 
31
  from .searcher.web.bing_search import BingSearch
32
  from .searcher.web.google_search import GoogleSearch
33
  from .splitter import ChineseRecursiveTextSplitter, RecursiveCharacterSplitter
34
+ from .storage import JsonKVStorage, JsonListStorage, NetworkXStorage
35
  from .tokenizer import Tokenizer
graphgen/models/storage/__init__.py CHANGED
@@ -1,2 +1,2 @@
1
- from .json_storage import JsonKVStorage, JsonListStorage, MetaJsonKVStorage
2
  from .networkx_storage import NetworkXStorage
 
1
+ from .json_storage import JsonKVStorage, JsonListStorage
2
  from .networkx_storage import NetworkXStorage
graphgen/models/storage/json_storage.py CHANGED
@@ -92,23 +92,3 @@ class JsonListStorage(BaseListStorage):
92
 
93
  def drop(self):
94
  self._data = []
95
-
96
-
97
- @dataclass
98
- class MetaJsonKVStorage(JsonKVStorage):
99
- def __post_init__(self):
100
- self._file_name = os.path.join(self.working_dir, f"{self.namespace}.json")
101
- self._data = load_json(self._file_name) or {}
102
- logger.info("Load KV %s with %d data", self.namespace, len(self._data))
103
-
104
- def get_new_data(self, storage_instance: "JsonKVStorage") -> dict:
105
- new_data = {}
106
- for k, v in storage_instance.data.items():
107
- if k not in self._data:
108
- new_data[k] = v
109
- return new_data
110
-
111
- def mark_done(self, storage_instance: "JsonKVStorage"):
112
- new_data = self.get_new_data(storage_instance)
113
- if new_data:
114
- self._data.update(new_data)
 
92
 
93
  def drop(self):
94
  self._data = []