github-actions[bot] committed on
Commit
52419fe
·
1 Parent(s): 386d4ee

Auto-sync from demo at Thu Nov 6 09:42:04 UTC 2025

Browse files
graphgen/models/reader/csv_reader.py CHANGED
@@ -6,11 +6,19 @@ from graphgen.bases.base_reader import BaseReader
6
 
7
 
8
  class CSVReader(BaseReader):
 
 
 
 
 
 
 
9
  def read(self, file_path: str) -> List[Dict[str, Any]]:
10
 
11
  df = pd.read_csv(file_path)
12
  for _, row in df.iterrows():
13
- if "type" in row and row["type"] == "text" and self.text_column not in row:
 
14
  raise ValueError(
15
  f"Missing '{self.text_column}' in document: {row.to_dict()}"
16
  )
 
6
 
7
 
8
  class CSVReader(BaseReader):
9
+ """
10
+ Reader for CSV files.
11
+ Columns:
12
+ - type: The type of the document (e.g., "text", "image", etc.)
13
+ - if type is "text", "content" column must be present.
14
+ """
15
+
16
  def read(self, file_path: str) -> List[Dict[str, Any]]:
17
 
18
  df = pd.read_csv(file_path)
19
  for _, row in df.iterrows():
20
+ assert "type" in row, f"Missing 'type' column in document: {row.to_dict()}"
21
+ if row["type"] == "text" and self.text_column not in row:
22
  raise ValueError(
23
  f"Missing '{self.text_column}' in document: {row.to_dict()}"
24
  )
graphgen/models/reader/json_reader.py CHANGED
@@ -5,11 +5,19 @@ from graphgen.bases.base_reader import BaseReader
5
 
6
 
7
  class JSONReader(BaseReader):
 
 
 
 
 
 
 
8
  def read(self, file_path: str) -> List[Dict[str, Any]]:
9
  with open(file_path, "r", encoding="utf-8") as f:
10
  data = json.load(f)
11
  if isinstance(data, list):
12
  for doc in data:
 
13
  if doc.get("type") == "text" and self.text_column not in doc:
14
  raise ValueError(
15
  f"Missing '{self.text_column}' in document: {doc}"
 
5
 
6
 
7
  class JSONReader(BaseReader):
8
+ """
9
+ Reader for JSON files.
10
+ Columns:
11
+ - type: The type of the document (e.g., "text", "image", etc.)
12
+ - if type is "text", "content" column must be present.
13
+ """
14
+
15
  def read(self, file_path: str) -> List[Dict[str, Any]]:
16
  with open(file_path, "r", encoding="utf-8") as f:
17
  data = json.load(f)
18
  if isinstance(data, list):
19
  for doc in data:
20
+ assert "type" in doc, f"Missing 'type' in document: {doc}"
21
  if doc.get("type") == "text" and self.text_column not in doc:
22
  raise ValueError(
23
  f"Missing '{self.text_column}' in document: {doc}"
graphgen/models/reader/jsonl_reader.py CHANGED
@@ -6,12 +6,20 @@ from graphgen.utils import logger
6
 
7
 
8
  class JSONLReader(BaseReader):
 
 
 
 
 
 
 
9
  def read(self, file_path: str) -> List[Dict[str, Any]]:
10
  docs = []
11
  with open(file_path, "r", encoding="utf-8") as f:
12
  for line in f:
13
  try:
14
  doc = json.loads(line)
 
15
  if doc.get("type") == "text" and self.text_column not in doc:
16
  raise ValueError(
17
  f"Missing '{self.text_column}' in document: {doc}"
 
6
 
7
 
8
  class JSONLReader(BaseReader):
9
+ """
10
+ Reader for JSONL files.
11
+ Columns:
12
+ - type: The type of the document (e.g., "text", "image", etc.)
13
+ - if type is "text", "content" column must be present.
14
+ """
15
+
16
  def read(self, file_path: str) -> List[Dict[str, Any]]:
17
  docs = []
18
  with open(file_path, "r", encoding="utf-8") as f:
19
  for line in f:
20
  try:
21
  doc = json.loads(line)
22
+ assert "type" in doc, f"Missing 'type' in document: {doc}"
23
  if doc.get("type") == "text" and self.text_column not in doc:
24
  raise ValueError(
25
  f"Missing '{self.text_column}' in document: {doc}"
graphgen/models/reader/parquet_reader.py CHANGED
@@ -8,6 +8,9 @@ from graphgen.bases.base_reader import BaseReader
8
  class ParquetReader(BaseReader):
9
  """
10
  Read parquet files, requiring the schema to be restored to List[Dict[str, Any]].
 
 
 
11
  """
12
 
13
  def read(self, file_path: str) -> List[Dict[str, Any]]:
@@ -15,6 +18,7 @@ class ParquetReader(BaseReader):
15
  data: List[Dict[str, Any]] = df.to_dict(orient="records")
16
 
17
  for doc in data:
 
18
  if doc.get("type") == "text" and self.text_column not in doc:
19
  raise ValueError(f"Missing '{self.text_column}' in document: {doc}")
20
  return self.filter(data)
 
8
  class ParquetReader(BaseReader):
9
  """
10
  Read parquet files, requiring the schema to be restored to List[Dict[str, Any]].
11
+ Columns:
12
+ - type: The type of the document (e.g., "text", "image", etc.)
13
+ - if type is "text", "content" column must be present.
14
  """
15
 
16
  def read(self, file_path: str) -> List[Dict[str, Any]]:
 
18
  data: List[Dict[str, Any]] = df.to_dict(orient="records")
19
 
20
  for doc in data:
21
+ assert "type" in doc, f"Missing 'type' in document: {doc}"
22
  if doc.get("type") == "text" and self.text_column not in doc:
23
  raise ValueError(f"Missing '{self.text_column}' in document: {doc}")
24
  return self.filter(data)
graphgen/models/reader/pickle_reader.py CHANGED
@@ -7,6 +7,10 @@ from graphgen.bases.base_reader import BaseReader
7
  class PickleReader(BaseReader):
8
  """
9
  Read pickle files, requiring the top-level object to be List[Dict[str, Any]].
 
 
 
 
10
  """
11
 
12
  def read(self, file_path: str) -> List[Dict[str, Any]]:
@@ -19,6 +23,7 @@ class PickleReader(BaseReader):
19
  for doc in data:
20
  if not isinstance(doc, dict):
21
  raise ValueError("Every item in the list must be a dict.")
 
22
  if doc.get("type") == "text" and self.text_column not in doc:
23
  raise ValueError(f"Missing '{self.text_column}' in document: {doc}")
24
 
 
7
  class PickleReader(BaseReader):
8
  """
9
  Read pickle files, requiring the top-level object to be List[Dict[str, Any]].
10
+
11
+ Columns:
12
+ - type: The type of the document (e.g., "text", "image", etc.)
13
+ - if type is "text", "content" column must be present.
14
  """
15
 
16
  def read(self, file_path: str) -> List[Dict[str, Any]]:
 
23
  for doc in data:
24
  if not isinstance(doc, dict):
25
  raise ValueError("Every item in the list must be a dict.")
26
+ assert "type" in doc, f"Missing 'type' in document: {doc}"
27
  if doc.get("type") == "text" and self.text_column not in doc:
28
  raise ValueError(f"Missing '{self.text_column}' in document: {doc}")
29