File size: 1,485 Bytes
90052d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from typing import Any, Dict, List

import rdflib
from rdflib import Literal
from rdflib.util import guess_format

from graphgen.bases.base_reader import BaseReader


class RDFReader(BaseReader):
    """
    Reader for RDF files that turns every distinct subject into one document.

    Each document is a dictionary holding the subject identifier, the
    subject's literal values concatenated into a single text field, and all
    of its predicate/object pairs.
    """

    def read(self, file_path: str) -> List[Dict[str, Any]]:
        """
        Parse an RDF file and build one document per distinct subject.

        The serialization format is guessed from ``file_path`` with
        ``rdflib.util.guess_format`` and handed to ``Graph.parse``.

        Every document has three keys:

        * ``"id"``: the subject converted to a string.
        * ``self.text_column``: all literal object values of the subject,
          joined with single spaces and stripped.
        * ``"properties"``: a mapping from predicate string to the list of
          object strings for that predicate (literals and non-literals).

        Subjects are de-duplicated while keeping the order in which the graph
        yields them, so the document order does not depend on per-process
        hash randomization.

        :param file_path: Path of the RDF file to parse.
        :return: The document list after being passed through ``self.filter``.
        :raises ValueError: If the file cannot be parsed, if any subject has
            no non-empty literal text, or if the graph yields no subjects.
        """
        g = rdflib.Graph()
        fmt = guess_format(file_path)
        try:
            g.parse(file_path, format=fmt)
        except Exception as e:
            # Normalize every parser failure to ValueError, keeping the cause.
            raise ValueError(f"Cannot parse RDF file {file_path}: {e}") from e

        docs: List[Dict[str, Any]] = []
        text_col = self.text_column

        # g.subjects() yields one entry per triple; dict.fromkeys removes the
        # duplicates like set() would, but preserves first-seen order instead
        # of exposing hash-dependent set ordering to callers.
        for subj in dict.fromkeys(g.subjects()):
            literals: List[str] = []
            props: Dict[str, List[str]] = {}
            for _, pred, obj in g.triples((subj, None, None)):
                obj_str = str(obj)
                # Only literal objects contribute to the text field ...
                if isinstance(obj, Literal):
                    literals.append(obj_str)
                # ... while every object is recorded under its predicate.
                props.setdefault(str(pred), []).append(obj_str)

            text = " ".join(literals).strip()
            if not text:
                # A subject without usable literal text aborts the whole read.
                raise ValueError(
                    f"Subject {subj} has no literal values; "
                    f"missing '{text_col}' for text column."
                )

            docs.append({"id": str(subj), text_col: text, "properties": props})

        if not docs:
            raise ValueError("RDF file contains no valid documents.")

        return self.filter(docs)