File size: 1,097 Bytes
d2a63cc
 
 
 
 
 
 
e4316f1
52419fe
 
 
 
 
 
 
d2a63cc
 
 
 
 
 
52419fe
0b9d8c7
d2a63cc
 
 
0b9d8c7
d2a63cc
 
0b9d8c7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import json
from typing import Any, Dict, List

from graphgen.bases.base_reader import BaseReader
from graphgen.utils import logger


class JSONLReader(BaseReader):
    """
    Reader for JSONL files (one JSON object per line).

    Columns:
        - type: The type of the document (e.g., "text", "image", etc.)
        - if type is "text", the column named by ``self.text_column``
          must be present (presumably "content" by default; the attribute
          is provided by ``BaseReader`` -- confirm there).
    """

    def read(self, file_path: str) -> List[Dict[str, Any]]:
        """
        Parse a JSONL file into a list of document dicts.

        Blank lines are skipped. Lines that are not valid JSON are logged
        and skipped; validation failures on successfully parsed lines are
        raised to the caller.

        :param file_path: Path to a UTF-8 encoded JSONL file.
        :return: The result of ``self.filter`` applied to the parsed
            documents, in file order.
        :raises AssertionError: If a parsed line is not a JSON object or
            has no "type" key.
        :raises ValueError: If a document of type "text" lacks the
            ``self.text_column`` key.
        """
        docs = []
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                # Blank separator lines are not malformed records; do not
                # report them as decode errors.
                if not line.strip():
                    continue

                # Keep the try body minimal: only the parse is best-effort.
                try:
                    doc = json.loads(line)
                except json.JSONDecodeError as e:
                    logger.error("Error decoding JSON line: %s. Error: %s", line, e)
                    continue

                # Raise explicitly instead of using `assert`, which is
                # removed under `python -O`; AssertionError is kept so
                # existing callers observe the same exception type.
                # Non-object JSON values (numbers, strings, lists) cannot
                # carry a "type" key and are rejected the same way.
                if not isinstance(doc, dict) or "type" not in doc:
                    raise AssertionError(f"Missing 'type' in document: {doc}")

                if doc["type"] == "text" and self.text_column not in doc:
                    raise ValueError(
                        f"Missing '{self.text_column}' in document: {doc}"
                    )
                docs.append(doc)
        return self.filter(docs)