github-actions[bot] committed on
Commit
fa9fcab
·
1 Parent(s): 06c3276

Auto-sync from demo at Wed Dec 3 07:50:52 UTC 2025

Browse files
graphgen/models/__init__.py CHANGED
@@ -33,5 +33,5 @@ from .searcher.kg.wiki_search import WikiSearch
33
  from .searcher.web.bing_search import BingSearch
34
  from .searcher.web.google_search import GoogleSearch
35
  from .splitter import ChineseRecursiveTextSplitter, RecursiveCharacterSplitter
36
- from .storage import JsonKVStorage, JsonListStorage, NetworkXStorage
37
  from .tokenizer import Tokenizer
 
33
  from .searcher.web.bing_search import BingSearch
34
  from .searcher.web.google_search import GoogleSearch
35
  from .splitter import ChineseRecursiveTextSplitter, RecursiveCharacterSplitter
36
+ from .storage import JsonKVStorage, JsonListStorage, NetworkXStorage, RocksDBCache
37
  from .tokenizer import Tokenizer
graphgen/models/storage/__init__.py CHANGED
@@ -1,2 +1,3 @@
1
  from .json_storage import JsonKVStorage, JsonListStorage
2
  from .networkx_storage import NetworkXStorage
 
 
1
  from .json_storage import JsonKVStorage, JsonListStorage
2
  from .networkx_storage import NetworkXStorage
3
+ from .rocksdb_cache import RocksDBCache
graphgen/models/storage/rocksdb_cache.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from typing import Any, Iterator, Optional
3
+
4
+ # rocksdict provides Python bindings for RocksDB; pylint may not recognize its members
5
+ # pylint: disable=no-name-in-module
6
+ from rocksdict import Rdict
7
+
8
+
9
class RocksDBCache:
    """A persistent key-value cache backed by RocksDB (via the rocksdict binding).

    Supports dict-like get/set/delete, key iteration, and use as a context
    manager; the underlying database is closed on exit or garbage collection.
    """

    def __init__(self, cache_dir: str):
        # Rdict wants a plain string path; keep the Path object for reference.
        self.db_path = Path(cache_dir)
        self.db = Rdict(str(self.db_path))

    def get(self, key: str) -> Optional[Any]:
        """Return the value stored under ``key``, or ``None`` when absent."""
        return self.db.get(key)

    def set(self, key: str, value: Any):
        """Store ``value`` under ``key``, overwriting any existing entry."""
        self.db[key] = value

    def delete(self, key: str):
        """Remove ``key`` from the cache; a missing key is silently ignored.

        Deletion is idempotent, as is conventional for caches.
        """
        try:
            del self.db[key]
        except KeyError:
            pass

    def close(self):
        """Close the underlying database. Safe to call more than once."""
        db = getattr(self, "db", None)
        if db is not None:
            db.close()
            self.db = None

    def __del__(self):
        # Best-effort cleanup when the object is garbage collected.
        self.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def __iter__(self) -> Iterator[str]:
        """Iterate over all keys currently stored in the cache."""
        yield from self.db.keys()
graphgen/operators/read/parallel_file_scanner.py CHANGED
@@ -4,8 +4,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
4
  from pathlib import Path
5
  from typing import Any, Dict, List, Set, Union
6
 
7
- from diskcache import Cache
8
-
9
  from graphgen.utils import logger
10
 
11
 
@@ -13,7 +12,7 @@ class ParallelFileScanner:
13
  def __init__(
14
  self, cache_dir: str, allowed_suffix, rescan: bool = False, max_workers: int = 4
15
  ):
16
- self.cache = Cache(cache_dir)
17
  self.allowed_suffix = set(allowed_suffix) if allowed_suffix else None
18
  self.rescan = rescan
19
  self.max_workers = max_workers
 
4
  from pathlib import Path
5
  from typing import Any, Dict, List, Set, Union
6
 
7
+ from graphgen.models import RocksDBCache
 
8
  from graphgen.utils import logger
9
 
10
 
 
12
  def __init__(
13
  self, cache_dir: str, allowed_suffix, rescan: bool = False, max_workers: int = 4
14
  ):
15
+ self.cache = RocksDBCache(os.path.join(cache_dir, "file_paths_cache"))
16
  self.allowed_suffix = set(allowed_suffix) if allowed_suffix else None
17
  self.rescan = rescan
18
  self.max_workers = max_workers
requirements.txt CHANGED
@@ -20,13 +20,15 @@ requests
20
  fastapi
21
  trafilatura
22
  aiohttp
23
- diskcache
24
  socksio
25
 
26
  leidenalg
27
  igraph
28
  python-louvain
29
 
 
 
 
30
  # KG
31
  rdflib
32
 
 
20
  fastapi
21
  trafilatura
22
  aiohttp
 
23
  socksio
24
 
25
  leidenalg
26
  igraph
27
  python-louvain
28
 
29
+ # storage
30
+ rocksdict
31
+
32
  # KG
33
  rdflib
34