Spaces:
Running
Running
File size: 1,485 Bytes
90052d3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
from typing import Any, Dict, List
import rdflib
from rdflib import Literal
from rdflib.util import guess_format
from graphgen.bases.base_reader import BaseReader
class RDFReader(BaseReader):
"""
Reader for RDF files that extracts triples and represents them as dictionaries.
"""
def read(self, file_path: str) -> List[Dict[str, Any]]:
g = rdflib.Graph()
fmt = guess_format(file_path)
try:
g.parse(file_path, format=fmt)
except Exception as e:
raise ValueError(f"Cannot parse RDF file {file_path}: {e}") from e
docs: List[Dict[str, Any]] = []
text_col = self.text_column
for subj in set(g.subjects()):
literals = []
props = {}
for _, pred, obj in g.triples((subj, None, None)):
pred_str = str(pred)
if isinstance(obj, Literal):
literals.append(str(obj))
props.setdefault(pred_str, []).append(str(obj))
text = " ".join(literals).strip()
if not text:
raise ValueError(
f"Subject {subj} has no literal values; "
f"missing '{text_col}' for text column."
)
doc = {"id": str(subj), text_col: text, "properties": props}
docs.append(doc)
if not docs:
raise ValueError("RDF file contains no valid documents.")
return self.filter(docs)
|