Commit 6acd2be by github-actions[bot]
Parent: 8ad3d05

Auto-sync from demo at Fri Nov 7 12:11:52 UTC 2025
graphgen/graphgen.py
CHANGED
@@ -96,7 +96,7 @@ class GraphGen:
         """
         read files from input sources
         """
-        data = read_files(read_config)
+        data = read_files(**read_config, cache_dir=self.working_dir)
         if len(data) == 0:
             logger.warning("No data to process")
             return
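For orientation, a minimal sketch of how the updated call site can be exercised on its own. The keys of read_config mirror the new read_files parameters shown in the next file; the import path (inferred from the repo layout), example paths, and suffixes are illustrative assumptions, not values taken from this commit.

from graphgen.operators.read.read_files import read_files  # module path inferred from the repo layout

# Hypothetical read_config; its keys follow the new keyword parameters of read_files
read_config = {
    "input_file": "data/docs",        # a single file or a folder (illustrative path)
    "allowed_suffix": ["md", "csv"],  # optional filter; omit to accept every supported reader
}

# cache_dir is forwarded separately, as the updated GraphGen call site does with self.working_dir
data = read_files(**read_config, cache_dir="cache/")
if len(data) == 0:
    print("No data to process")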
graphgen/operators/read/read_files.py
CHANGED
@@ -1,5 +1,5 @@
 from pathlib import Path
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
 
 from graphgen.models import (
     CSVReader,
@@ -34,26 +34,49 @@ def _build_reader(suffix: str, cache_dir: str | None):
     return _MAPPING[suffix]()
 
 
-def read_files(
+def read_files(
+    input_file: str,
+    allowed_suffix: Optional[List[str]] = None,
+    cache_dir: Optional[str] = None,
+) -> list[dict]:
+    path = Path(input_file).expanduser()
     if not path.exists():
-        raise FileNotFoundError(f"input_path not found: {
+        raise FileNotFoundError(f"input_path not found: {input_file}")
 
+    if allowed_suffix is None:
+        support_suffix = set(_MAPPING.keys())
+    else:
+        support_suffix = {s.lower().lstrip(".") for s in allowed_suffix}
+
+    # single file
     if path.is_file():
-        suffix = path.suffix.lstrip(".")
+        suffix = path.suffix.lstrip(".").lower()
+        if suffix not in support_suffix:
+            logger.warning(
+                "Skip file %s (suffix '%s' not in allowed_suffix %s)",
+                path,
+                suffix,
+                support_suffix,
+            )
+            return []
         reader = _build_reader(suffix, cache_dir)
         return reader.read(str(path))
 
-
+    # folder
     files_to_read = [
         p for p in path.rglob("*") if p.suffix.lstrip(".").lower() in support_suffix
    ]
-    logger.info(
+    logger.info(
+        "Found %d eligible file(s) under folder %s (allowed_suffix=%s)",
+        len(files_to_read),
+        input_file,
+        support_suffix,
+    )
 
     all_docs: List[Dict[str, Any]] = []
     for p in files_to_read:
         try:
-            suffix = p.suffix.lstrip(".")
+            suffix = p.suffix.lstrip(".").lower()
             reader = _build_reader(suffix, cache_dir)
             all_docs.extend(reader.read(str(p)))
         except Exception as e:  # pylint: disable=broad-except
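As a rough usage sketch of the reworked reader (file and folder names, and the import path inferred from the repo layout, are illustrative assumptions): a single file is read directly and yields an empty list when its suffix is filtered out, while a folder is walked recursively for every file whose suffix is in the allowed set.

from graphgen.operators.read.read_files import read_files  # module path inferred from the repo layout

# Single file: returns its documents, or [] if the suffix is excluded by allowed_suffix
docs = read_files("notes/overview.md")

# Folder: recursively reads every file whose suffix is in the allowed set;
# leading dots and case in allowed_suffix are normalized by the function
corpus = read_files("corpus/", allowed_suffix=[".MD", "csv"], cache_dir=".graphgen_cache")
print(f"{len(docs)} + {len(corpus)} document(s) loaded")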