Commit fa9fcab
github-actions[bot] committed
Parent(s): 06c3276

Auto-sync from demo at Wed Dec 3 07:50:52 UTC 2025
graphgen/models/__init__.py
CHANGED
@@ -33,5 +33,5 @@ from .searcher.kg.wiki_search import WikiSearch
 from .searcher.web.bing_search import BingSearch
 from .searcher.web.google_search import GoogleSearch
 from .splitter import ChineseRecursiveTextSplitter, RecursiveCharacterSplitter
-from .storage import JsonKVStorage, JsonListStorage, NetworkXStorage
+from .storage import JsonKVStorage, JsonListStorage, NetworkXStorage, RocksDBCache
 from .tokenizer import Tokenizer
graphgen/models/storage/__init__.py
CHANGED
@@ -1,2 +1,3 @@
 from .json_storage import JsonKVStorage, JsonListStorage
 from .networkx_storage import NetworkXStorage
+from .rocksdb_cache import RocksDBCache
graphgen/models/storage/rocksdb_cache.py
ADDED
@@ -0,0 +1,43 @@
+from pathlib import Path
+from typing import Any, Iterator, Optional
+
+# rocksdict is a lightweight C wrapper around RocksDB for Python, pylint may not recognize it
+# pylint: disable=no-name-in-module
+from rocksdict import Rdict
+
+
+class RocksDBCache:
+    def __init__(self, cache_dir: str):
+        self.db_path = Path(cache_dir)
+        self.db = Rdict(str(self.db_path))
+
+    def get(self, key: str) -> Optional[Any]:
+        return self.db.get(key)
+
+    def set(self, key: str, value: Any):
+        self.db[key] = value
+
+    def delete(self, key: str):
+        try:
+            del self.db[key]
+        except KeyError:
+            # If the key does not exist, do nothing (deletion is idempotent for caches)
+            pass
+
+    def close(self):
+        if hasattr(self, "db") and self.db is not None:
+            self.db.close()
+            self.db = None
+
+    def __del__(self):
+        # Ensure the database is closed when the object is destroyed
+        self.close()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+
+    def __iter__(self) -> Iterator[str]:
+        return iter(self.db.keys())
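For reference, a minimal usage sketch of the new class as a context manager; the cache directory and keys below are illustrative only, not taken from the repository.

    from graphgen.models import RocksDBCache

    with RocksDBCache("/tmp/example_cache") as cache:
        cache.set("doc:1", {"status": "scanned"})
        print(cache.get("doc:1"))   # -> {'status': 'scanned'}
        cache.delete("doc:1")       # no error if the key is already gone
        for key in cache:           # __iter__ yields the stored keys
            print(key)

Because __exit__ calls close(), the underlying RocksDB handle is released as soon as the with-block ends.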
graphgen/operators/read/parallel_file_scanner.py
CHANGED
@@ -4,8 +4,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
 from typing import Any, Dict, List, Set, Union
 
-from
-
+from graphgen.models import RocksDBCache
 from graphgen.utils import logger
 
 
@@ -13,7 +12,7 @@ class ParallelFileScanner:
     def __init__(
         self, cache_dir: str, allowed_suffix, rescan: bool = False, max_workers: int = 4
    ):
-        self.cache =
+        self.cache = RocksDBCache(os.path.join(cache_dir, "file_paths_cache"))
         self.allowed_suffix = set(allowed_suffix) if allowed_suffix else None
         self.rescan = rescan
         self.max_workers = max_workers
requirements.txt
CHANGED
@@ -20,13 +20,15 @@ requests
 fastapi
 trafilatura
 aiohttp
-diskcache
 socksio
 
 leidenalg
 igraph
 python-louvain
 
+# storage
+rocksdict
+
 # KG
 rdflib
 
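In short, the commit swaps the diskcache dependency for rocksdict (Python bindings for RocksDB); reinstalling the requirements picks it up, e.g.:

    pip install -r requirements.txt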