Spaces:

chenzihong
/

GraphGen

Running

App Files Files Community

github-actions[bot] committed on 17 days ago

Commit

2bc66d4

1 Parent(s): dee1edd

Auto-sync from demo at Wed Nov 26 11:48:49 UTC 2025

Browse files

Files changed (4) hide show

graphgen/graphgen.py +6 -21
graphgen/models/__init__.py +1 -1
graphgen/models/storage/__init__.py +1 -1
graphgen/models/storage/json_storage.py +0 -20

graphgen/graphgen.py CHANGED Viewed

@@ -9,7 +9,6 @@ from graphgen.bases.datatypes import Chunk
 from graphgen.models import (
     JsonKVStorage,
     JsonListStorage,
-    MetaJsonKVStorage,
     NetworkXStorage,
     OpenAIClient,
     Tokenizer,
@@ -54,9 +53,6 @@ class GraphGen:
         )
         self.trainee_llm_client: BaseLLMWrapper = trainee_llm_client
-        self.meta_storage: MetaJsonKVStorage = MetaJsonKVStorage(
-            self.working_dir, namespace="_meta"
-        )
         self.full_docs_storage: JsonKVStorage = JsonKVStorage(
             self.working_dir, namespace="full_docs"
         )
@@ -98,11 +94,7 @@ class GraphGen:
         batch = {}
         for doc in doc_stream:
             doc_id = compute_mm_hash(doc, prefix="doc-")
             batch[doc_id] = doc
-        if batch:
-            self.full_docs_storage.upsert(batch)
-            self.full_docs_storage.index_done_callback()
         # TODO: configurable whether to use coreference resolution
@@ -120,7 +112,7 @@ class GraphGen:
         chunk documents into smaller pieces from full_docs_storage if not already present
         """
-        new_docs = self.meta_storage.get_new_data(self.full_docs_storage)
         if len(new_docs) == 0:
             logger.warning("All documents are already in the storage")
             return
@@ -143,16 +135,15 @@ class GraphGen:
         self.chunks_storage.upsert(inserting_chunks)
         self.chunks_storage.index_done_callback()
-        self.meta_storage.mark_done(self.full_docs_storage)
-        self.meta_storage.index_done_callback()
     @async_to_sync_method
     async def build_kg(self):
         """
         build knowledge graph from text chunks
         """
-        # Step 1: get new chunks according to meta and chunks storage
-        inserting_chunks = self.meta_storage.get_new_data(self.chunks_storage)
         if len(inserting_chunks) == 0:
             logger.warning("All chunks are already in the storage")
             return
@@ -169,10 +160,8 @@ class GraphGen:
             logger.warning("No entities or relations extracted from text chunks")
             return
-        # Step 3: mark meta
         self.graph_storage.index_done_callback()
-        self.meta_storage.mark_done(self.chunks_storage)
-        self.meta_storage.index_done_callback()
         return _add_entities_and_relations
@@ -180,7 +169,7 @@ class GraphGen:
     async def search(self, search_config: Dict):
         logger.info("[Search] %s ...", ", ".join(search_config["data_sources"]))
-        seeds = self.meta_storage.get_new_data(self.full_docs_storage)
         if len(seeds) == 0:
             logger.warning("All documents are already been searched")
             return
@@ -198,8 +187,6 @@ class GraphGen:
             return
         self.search_storage.upsert(search_results)
         self.search_storage.index_done_callback()
-        self.meta_storage.mark_done(self.full_docs_storage)
-        self.meta_storage.index_done_callback()
     @async_to_sync_method
     async def quiz_and_judge(self, quiz_and_judge_config: Dict):
@@ -268,8 +255,6 @@ class GraphGen:
         self.extract_storage.upsert(results)
         self.extract_storage.index_done_callback()
-        self.meta_storage.mark_done(self.chunks_storage)
-        self.meta_storage.index_done_callback()
     @async_to_sync_method
     async def generate(self, generate_config: Dict):

 from graphgen.models import (
     JsonKVStorage,
     JsonListStorage,
     NetworkXStorage,
     OpenAIClient,
     Tokenizer,
         )
         self.trainee_llm_client: BaseLLMWrapper = trainee_llm_client
         self.full_docs_storage: JsonKVStorage = JsonKVStorage(
             self.working_dir, namespace="full_docs"
         )
         batch = {}
         for doc in doc_stream:
             doc_id = compute_mm_hash(doc, prefix="doc-")
             batch[doc_id] = doc
         # TODO: configurable whether to use coreference resolution
         chunk documents into smaller pieces from full_docs_storage if not already present
         """
+        new_docs = self.full_docs_storage.get_all()
         if len(new_docs) == 0:
             logger.warning("All documents are already in the storage")
             return
         self.chunks_storage.upsert(inserting_chunks)
         self.chunks_storage.index_done_callback()
     @async_to_sync_method
     async def build_kg(self):
         """
         build knowledge graph from text chunks
         """
+        # Step 1: get new chunks
+        inserting_chunks = self.chunks_storage.get_all()
         if len(inserting_chunks) == 0:
             logger.warning("All chunks are already in the storage")
             return
             logger.warning("No entities or relations extracted from text chunks")
             return
+        # Step 3: upsert new entities and relations to the graph storage
         self.graph_storage.index_done_callback()
         return _add_entities_and_relations
     async def search(self, search_config: Dict):
         logger.info("[Search] %s ...", ", ".join(search_config["data_sources"]))
+        seeds = self.full_docs_storage.get_all()
         if len(seeds) == 0:
             logger.warning("All documents are already been searched")
             return
             return
         self.search_storage.upsert(search_results)
         self.search_storage.index_done_callback()
     @async_to_sync_method
     async def quiz_and_judge(self, quiz_and_judge_config: Dict):
         self.extract_storage.upsert(results)
         self.extract_storage.index_done_callback()
     @async_to_sync_method
     async def generate(self, generate_config: Dict):

graphgen/models/__init__.py CHANGED Viewed

@@ -31,5 +31,5 @@ from .searcher.kg.wiki_search import WikiSearch
 from .searcher.web.bing_search import BingSearch
 from .searcher.web.google_search import GoogleSearch
 from .splitter import ChineseRecursiveTextSplitter, RecursiveCharacterSplitter
-from .storage import JsonKVStorage, JsonListStorage, MetaJsonKVStorage, NetworkXStorage
 from .tokenizer import Tokenizer

 from .searcher.web.bing_search import BingSearch
 from .searcher.web.google_search import GoogleSearch
 from .splitter import ChineseRecursiveTextSplitter, RecursiveCharacterSplitter
+from .storage import JsonKVStorage, JsonListStorage, NetworkXStorage
 from .tokenizer import Tokenizer

graphgen/models/storage/__init__.py CHANGED Viewed

	@@ -1,2 +1,2 @@
1	- from .json_storage import JsonKVStorage, JsonListStorage~~, MetaJsonKVStorage~~
2	from .networkx_storage import NetworkXStorage


1	+ from .json_storage import JsonKVStorage, JsonListStorage
2	from .networkx_storage import NetworkXStorage

graphgen/models/storage/json_storage.py CHANGED Viewed

@@ -92,23 +92,3 @@ class JsonListStorage(BaseListStorage):
     def drop(self):
         self._data = []
-@dataclass
-class MetaJsonKVStorage(JsonKVStorage):
-    def __post_init__(self):
-        self._file_name = os.path.join(self.working_dir, f"{self.namespace}.json")
-        self._data = load_json(self._file_name) or {}
-        logger.info("Load KV %s with %d data", self.namespace, len(self._data))
-    def get_new_data(self, storage_instance: "JsonKVStorage") -> dict:
-        new_data = {}
-        for k, v in storage_instance.data.items():
-            if k not in self._data:
-                new_data[k] = v
-        return new_data
-    def mark_done(self, storage_instance: "JsonKVStorage"):
-        new_data = self.get_new_data(storage_instance)
-        if new_data:
-            self._data.update(new_data)


92
93	    def drop(self):
94	        self._data = []