Commit 31086ae · github-actions[bot] committed · 1 parent: 10ba08f
Auto-sync from demo at Tue Dec 16 08:21:05 UTC 2025

This view is limited to 50 files because it contains too many changes.
- app.py +134 -172
- graphgen/bases/__init__.py +3 -7
- graphgen/bases/base_llm_wrapper.py +0 -6
- graphgen/bases/base_operator.py +57 -0
- graphgen/bases/base_partitioner.py +22 -27
- graphgen/bases/base_reader.py +57 -41
- graphgen/bases/base_splitter.py +3 -3
- graphgen/bases/base_storage.py +6 -17
- graphgen/bases/datatypes.py +44 -0
- graphgen/{operators/init → common}/__init__.py +1 -0
- graphgen/{operators/init → common}/init_llm.py +125 -29
- graphgen/common/init_storage.py +262 -0
- graphgen/configs/aggregated_config.yaml +0 -41
- graphgen/configs/atomic_config.yaml +0 -31
- graphgen/configs/cot_config.yaml +0 -33
- graphgen/configs/multi_hop_config.yaml +0 -34
- graphgen/configs/schema_guided_extraction_config.yaml +0 -20
- graphgen/configs/search_dna_config.yaml +0 -17
- graphgen/configs/search_protein_config.yaml +0 -15
- graphgen/configs/search_rna_config.yaml +0 -14
- graphgen/configs/vqa_config.yaml +0 -32
- graphgen/engine.py +191 -106
- graphgen/graphgen.py +0 -295
- graphgen/models/__init__.py +7 -2
- graphgen/models/extractor/schema_guided_extractor.py +3 -5
- graphgen/models/generator/vqa_generator.py +2 -2
- graphgen/models/llm/local/sglang_wrapper.py +0 -12
- graphgen/models/llm/local/vllm_wrapper.py +35 -47
- graphgen/models/partitioner/anchor_bfs_partitioner.py +9 -14
- graphgen/models/partitioner/bfs_partitioner.py +4 -9
- graphgen/models/partitioner/dfs_partitioner.py +5 -9
- graphgen/models/partitioner/ece_partitioner.py +19 -24
- graphgen/models/partitioner/leiden_partitioner.py +5 -9
- graphgen/models/reader/__init__.py +0 -1
- graphgen/models/reader/csv_reader.py +14 -11
- graphgen/models/reader/json_reader.py +41 -14
- graphgen/models/reader/jsonl_reader.py +0 -30
- graphgen/models/reader/parquet_reader.py +16 -10
- graphgen/models/reader/pdf_reader.py +35 -20
- graphgen/models/reader/pickle_reader.py +64 -16
- graphgen/models/reader/rdf_reader.py +93 -13
- graphgen/models/reader/txt_reader.py +27 -5
- graphgen/models/splitter/character_splitter.py +1 -1
- graphgen/models/splitter/markdown_splitter.py +2 -2
- graphgen/models/splitter/recursive_character_splitter.py +2 -2
- graphgen/models/storage/__init__.py +5 -2
- graphgen/{configs → models/storage/graph}/__init__.py +0 -0
- graphgen/models/storage/graph/kuzu_storage.py +256 -0
- graphgen/models/storage/{networkx_storage.py → graph/networkx_storage.py} +17 -20
- graphgen/models/storage/kv/__init__.py +0 -0
app.py CHANGED

@@ -5,14 +5,12 @@ import tempfile
 from importlib.resources import files
 
 import gradio as gr
-import …
+import ray
 from dotenv import load_dotenv
 
-from graphgen.engine import …
-from graphgen.…
-from graphgen.…
-from graphgen.models.llm.limitter import RPM, TPM
-from graphgen.utils import set_logger
+from graphgen.engine import Engine
+from graphgen.operators import operators
+from graphgen.utils import CURRENT_LOGGER_VAR, set_logger
 from webui.base import WebuiParams
 from webui.i18n import Translate
 from webui.i18n import gettext as _

@@ -22,7 +20,6 @@ from webui.utils import cleanup_workspace, count_tokens, preview_file, setup_workspace
 root_dir = files("webui").parent
 sys.path.append(root_dir)
 
-
 load_dotenv()
 
 css = """

@@ -34,131 +31,136 @@ css = """
 """
 
 
-def …
-    # Set up working directory
-    log_file, working_dir = setup_workspace(os.path.join(root_dir, "cache"))
-    set_logger(log_file, if_stream=True)
-    os.environ.update({k: str(v) for k, v in env.items()})
-
-    tokenizer_instance = Tokenizer(config.get("tokenizer", "cl100k_base"))
-    synthesizer_llm_client = OpenAIClient(
-        model=env.get("SYNTHESIZER_MODEL", ""),
-        base_url=env.get("SYNTHESIZER_BASE_URL", ""),
-        api_key=env.get("SYNTHESIZER_API_KEY", ""),
-        request_limit=True,
-        rpm=RPM(env.get("RPM", 1000)),
-        tpm=TPM(env.get("TPM", 50000)),
-        tokenizer=tokenizer_instance,
-    )
-    trainee_llm_client = OpenAIClient(
-        model=env.get("TRAINEE_MODEL", ""),
-        base_url=env.get("TRAINEE_BASE_URL", ""),
-        api_key=env.get("TRAINEE_API_KEY", ""),
-        request_limit=True,
-        rpm=RPM(env.get("RPM", 1000)),
-        tpm=TPM(env.get("TPM", 50000)),
-        tokenizer=tokenizer_instance,
-    )
-
-    graph_gen = GraphGen(
-        working_dir=working_dir,
-        tokenizer_instance=tokenizer_instance,
-        synthesizer_llm_client=synthesizer_llm_client,
-        trainee_llm_client=trainee_llm_client,
-    )
-
-    return graph_gen
-
-
-# pylint: disable=too-many-statements
-def run_graphgen(params: WebuiParams, progress=gr.Progress()):
-    def sum_tokens(client):
-        return sum(u["total_tokens"] for u in client.token_usage)
-
+def _get_partition_params(params: WebuiParams):
     method = params.partition_method
     if method == "dfs":
-        …
+        return {
             "max_units_per_community": params.dfs_max_units,
         }
-    …
-    …
+    if method == "bfs":
+        return {
             "max_units_per_community": params.bfs_max_units,
         }
-    …
-    …
+    if method == "leiden":
+        return {
             "max_size": params.leiden_max_size,
             "use_lcc": params.leiden_use_lcc,
             "random_seed": params.leiden_random_seed,
         }
-    … [further deleted lines are truncated in this view]
+    # ece
+    return {
+        "max_units_per_community": params.ece_max_units,
+        "min_units_per_community": params.ece_min_units,
+        "max_tokens_per_community": params.ece_max_tokens,
+        "unit_sampling": params.ece_unit_sampling,
+    }
+
+
+# pylint: disable=too-many-statements
+def run_graphgen(params: WebuiParams, progress=gr.Progress()):
+    # 1. Setup Workspace
+    log_file, working_dir = setup_workspace(os.path.join(root_dir, "cache"))
+    driver_logger = set_logger(log_file, "GraphGen", if_stream=True)
+    CURRENT_LOGGER_VAR.set(driver_logger)
+
+    # 2. Setup Environment Variables for Ray Actors/LLM Init
+    # The refactored code relies on env vars in graphgen/common/init_llm.py
+    os.environ["SYNTHESIZER_BACKEND"] = "openai_api"  # Assuming OpenAI compatible API
+    os.environ["SYNTHESIZER_BASE_URL"] = params.synthesizer_url
+    os.environ["SYNTHESIZER_API_KEY"] = params.api_key
+    os.environ["SYNTHESIZER_MODEL"] = params.synthesizer_model
+    os.environ["RPM"] = str(params.rpm)
+    os.environ["TPM"] = str(params.tpm)
+    os.environ["TOKENIZER_MODEL"] = params.tokenizer
+
+    if params.if_trainee_model:
+        os.environ["TRAINEE_BACKEND"] = "openai_api"
+        os.environ["TRAINEE_BASE_URL"] = params.trainee_url
+        os.environ["TRAINEE_API_KEY"] = params.trainee_api_key
+        os.environ["TRAINEE_MODEL"] = params.trainee_model
 
+    # 3. Construct Pipeline Configuration (DAG)
+    nodes = [
         {
-            "…
-            "…
+            "id": "read",
+            "op_name": "read",
+            "type": "source",
+            "dependencies": [],
             "params": {
-                "…
+                "input_path": [params.upload_file],
             },
         },
         {
-            "…
-            "…
-            "…
+            "id": "chunk",
+            "op_name": "chunk",
+            "type": "map_batch",
+            "dependencies": ["read"],
+            "execution_params": {"replicas": 1},
             "params": {
                 "chunk_size": params.chunk_size,
                 "chunk_overlap": params.chunk_overlap,
             },
         },
         {
-            "…
-            "…
-            "…
+            "id": "build_kg",
+            "op_name": "build_kg",
+            "type": "map_batch",
+            "dependencies": ["chunk"],
+            "execution_params": {"replicas": 1, "batch_size": 128},
         },
     ]
 
+    last_node_id = "build_kg"
+
+    # Optional: Quiz and Judge
     if params.if_trainee_model:
-        …
-            {
-                "name": "quiz_and_judge",
-                "deps": ["build_kg"],
-                "op_key": "quiz_and_judge",
-                "params": {"quiz_samples": params.quiz_samples, "re_judge": True},
-            }
-        )
-        pipeline.append(
+        nodes.append(
             {
-                "…
-                "…
-                "…
+                "id": "quiz",
+                "op_name": "quiz",
+                "type": "aggregate",  # QuizService uses aggregate in config
+                "dependencies": ["build_kg"],
+                "execution_params": {"replicas": 1, "batch_size": 128},
                 "params": {
-                    "…
-                    "…
+                    "quiz_samples": params.quiz_samples,
+                    "concurrency_limit": 200,
                 },
             }
         )
-
-        …
+
+        nodes.append(
             {
-                "…
-                "…
-                "…
-                "…
-                "method_params": partition_params,
-                },
+                "id": "judge",
+                "op_name": "judge",
+                "type": "map_batch",
+                "dependencies": ["quiz"],
+                "execution_params": {"replicas": 1, "batch_size": 128},
             }
         )
-
+        last_node_id = "judge"
+
+    # Node: Partition
+    nodes.append(
+        {
+            "id": "partition",
+            "op_name": "partition",
+            "type": "aggregate",  # PartitionService uses aggregate
+            "dependencies": [last_node_id],
+            "params": {
+                "method": params.partition_method,
+                "method_params": _get_partition_params(params),
+            },
+        }
+    )
+
+    # Node: Generate
+    nodes.append(
         {
-            "…
-            "…
-            "…
+            "id": "generate",
+            "op_name": "generate",
+            "type": "map_batch",
+            "dependencies": ["partition"],
+            "execution_params": {"replicas": 1, "batch_size": 128},
             "params": {
                 "method": params.mode,
                 "data_format": params.data_format,

@@ -166,88 +168,50 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
         }
     )
 
-    config = {
-        "if_trainee_model": params.if_trainee_model,
-        "read": {"input_file": params.upload_file},
-        "pipeline": pipeline,
-    }
+    config = {"global_params": {"working_dir": working_dir}, "nodes": nodes}
 
-    … [deleted lines are truncated in this view]
-        "TRAINEE_MODEL": params.trainee_model,
-        "SYNTHESIZER_API_KEY": params.api_key,
-        "TRAINEE_API_KEY": params.trainee_api_key,
-        "RPM": params.rpm,
-        "TPM": params.tpm,
-    }
-
-    test_api_connection(
-        env["SYNTHESIZER_BASE_URL"],
-        env["SYNTHESIZER_API_KEY"],
-        env["SYNTHESIZER_MODEL"],
-    )
-    if config["if_trainee_model"]:
-        test_api_connection(
-            env["TRAINEE_BASE_URL"], env["TRAINEE_API_KEY"], env["TRAINEE_MODEL"]
-        )
+    try:
+        # 4. Initialize and Run Engine
+        # Initialize Ray if not already running (Engine handles this mostly, but good for safety)
+        if not ray.is_initialized():
+            ray.init(ignore_reinit_error=True, log_to_driver=True)
 
-    graph_gen.clear()
-    graph_gen.progress_bar = progress
+        engine = Engine(config, operators)
 
-    … [a block of deleted lines is truncated in this view]
-                [data_frame.iloc[0, 0], data_frame.iloc[0, 1], str(total_tokens)]
-            ]
-            new_df = pd.DataFrame(_update_data, columns=data_frame.columns)
-            data_frame = new_df
-
-        except Exception as e:
-            raise gr.Error(f"DataFrame operation error: {str(e)}")
-
-        return output_file, gr.DataFrame(
-            label="Token Stats",
-            headers=["Source Text Token Count", "Expected Token Usage", "Token Used"],
-            datatype="str",
-            interactive=False,
-            value=data_frame,
-            visible=True,
-            wrap=True,
-        )
+        # Start with an empty dataset to kick off the pipeline
+        ds = ray.data.from_items([])
+
+        # Execute pipeline
+        results = engine.execute(ds)
+
+        # 5. Process Output
+        # Extract the result from the 'generate' node
+        if "generate" in results:
+            result_ds = results["generate"]
+
+            # Create a temporary file to save the output
+            with tempfile.NamedTemporaryFile(
+                mode="w", suffix=".jsonl", delete=False, encoding="utf-8"
+            ) as tmpfile:
+                # Iterate over rows and write to file
+                for row in result_ds.iter_rows():
+                    json.dump(row, tmpfile, ensure_ascii=False)
+                    tmpfile.write("\n")
+                output_file = tmpfile.name
+        else:
+            raise gr.Error("Generation step failed to produce output.")
+
+        # Note: Dynamic token counting from distributed actors is not directly available
+        # via client properties in the new architecture. We return the estimated stats from input.
+
+        return output_file, params.token_counter
 
     except Exception as e:  # pylint: disable=broad-except
         raise gr.Error(f"Error occurred: {str(e)}")
 
     finally:
         # Clean up workspace
-        cleanup_workspace(
+        cleanup_workspace(working_dir)  # Optional: keep for debugging or enable
 
 
 with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo:

@@ -267,7 +231,6 @@ with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo:
             ("简体中文", "zh"),
         ],
         value="en",
-        # label=_("Language"),
         render=False,
         container=False,
         elem_classes=["center-row"],

@@ -295,7 +258,7 @@ with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo:
         os.path.join(root_dir, "webui", "translation.json"),
         lang_btn,
         placeholder_langs=["en", "zh"],
-        persistant=False,
+        persistant=False,
     ):
         lang_btn.render()

@@ -701,7 +664,6 @@ with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo:
         outputs=[output, token_counter],
     )
 
-
 if __name__ == "__main__":
     demo.queue(api_open=False, default_concurrency_limit=2)
     demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False)
graphgen/bases/__init__.py CHANGED

@@ -2,15 +2,11 @@ from .base_extractor import BaseExtractor
 from .base_generator import BaseGenerator
 from .base_kg_builder import BaseKGBuilder
 from .base_llm_wrapper import BaseLLMWrapper
+from .base_operator import BaseOperator
 from .base_partitioner import BasePartitioner
 from .base_reader import BaseReader
 from .base_searcher import BaseSearcher
 from .base_splitter import BaseSplitter
-from .base_storage import (
-    BaseGraphStorage,
-    BaseKVStorage,
-    BaseListStorage,
-    StorageNameSpace,
-)
+from .base_storage import BaseGraphStorage, BaseKVStorage, StorageNameSpace
 from .base_tokenizer import BaseTokenizer
-from .datatypes import Chunk, QAPair, Token
+from .datatypes import Chunk, Config, Node, QAPair, Token
graphgen/bases/base_llm_wrapper.py CHANGED

@@ -72,9 +72,3 @@ class BaseLLMWrapper(abc.ABC):
 
         filtered = filtered.strip()
         return filtered if filtered else text.strip()
-
-    def shutdown(self) -> None:
-        """Shutdown the LLM engine if applicable."""
-
-    def restart(self) -> None:
-        """Reinitialize the LLM engine if applicable."""
graphgen/bases/base_operator.py ADDED

@@ -0,0 +1,57 @@
+import inspect
+import os
+from abc import ABC, abstractmethod
+from typing import Iterable, Union
+
+import pandas as pd
+import ray
+
+from graphgen.utils import CURRENT_LOGGER_VAR, set_logger
+
+
+class BaseOperator(ABC):
+    def __init__(self, working_dir: str = "cache", op_name: str = None):
+        log_dir = os.path.join(working_dir, "logs")
+        self.op_name = op_name or self.__class__.__name__
+
+        try:
+            ctx = ray.get_runtime_context()
+            worker_id = ctx.get_actor_id() or ctx.get_worker_id()
+            worker_id_short = worker_id[-6:] if worker_id else "driver"
+        except Exception as e:
+            print(
+                "Warning: Could not get Ray worker ID, defaulting to 'local'. Exception:",
+                e,
+            )
+            worker_id_short = "local"
+
+        # e.g. cache/logs/ChunkService_a1b2c3.log
+        log_file = os.path.join(log_dir, f"{self.op_name}_{worker_id_short}.log")
+
+        self.logger = set_logger(
+            log_file=log_file, name=f"{self.op_name}.{worker_id_short}", force=True
+        )
+
+        self.logger.info(
+            "[%s] Operator initialized on Worker %s", self.op_name, worker_id_short
+        )
+
+    def __call__(
+        self, batch: pd.DataFrame
+    ) -> Union[pd.DataFrame, Iterable[pd.DataFrame]]:
+        logger_token = CURRENT_LOGGER_VAR.set(self.logger)
+        try:
+            result = self.process(batch)
+            if inspect.isgenerator(result):
+                yield from result
+            else:
+                yield result
+        finally:
+            CURRENT_LOGGER_VAR.reset(logger_token)
+
+    @abstractmethod
+    def process(self, batch):
+        raise NotImplementedError("Subclasses must implement the process method.")
+
+    def get_logger(self):
+        return self.logger
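For orientation, here is a minimal sketch of a concrete operator on top of this base class. The `UpperCaseService` name and the `content` column are hypothetical; only `BaseOperator`, its `process` contract, and the generator-style `__call__` come from the file above.

import pandas as pd

from graphgen.bases.base_operator import BaseOperator


class UpperCaseService(BaseOperator):  # hypothetical operator for illustration
    def process(self, batch: pd.DataFrame) -> pd.DataFrame:
        # self.logger is the per-worker logger set up by BaseOperator.__init__
        self.logger.info("[%s] processing %d rows", self.op_name, len(batch))
        batch["content"] = batch["content"].str.upper()
        return batch


# __call__ wraps process() in a generator, so consuming it yields batches:
op = UpperCaseService(working_dir="cache")
for out in op(pd.DataFrame({"content": ["hello"]})):
    print(out)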
graphgen/bases/base_partitioner.py CHANGED

@@ -7,7 +7,7 @@ from graphgen.bases.datatypes import Community
 
 class BasePartitioner(ABC):
     @abstractmethod
-    def …(
+    def partition(
         self,
         g: BaseGraphStorage,
         **kwargs: Any,

@@ -20,39 +20,34 @@ class BasePartitioner(ABC):
     """
 
     @staticmethod
-    def …(
-        …
-    ) -> list[
-        tuple[
-            list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]]
-        ]
-    ]:
+    def community2batch(
+        comm: Community, g: BaseGraphStorage
+    ) -> tuple[
+        list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]]
+    ]:
         """
         Convert communities to batches of nodes and edges.
-        :param …
+        :param comm: Community
         :param g: Graph storage instance
         :return: List of batches, each batch is a tuple of (nodes, edges)
         """
-        … [deleted lines are truncated in this view]
-            if edge_data:
-                edges_data.append((…
-                edge_data = g.get_edge(v, u)
-                if edge_data:
-                    edges_data.append((v, u, edge_data))
-            batches.append((nodes_data, edges_data))
-        return batches
+        nodes = comm.nodes
+        edges = comm.edges
+        nodes_data = []
+        for node in nodes:
+            node_data = g.get_node(node)
+            if node_data:
+                nodes_data.append((node, node_data))
+        edges_data = []
+        for u, v in edges:
+            edge_data = g.get_edge(u, v)
+            if edge_data:
+                edges_data.append((u, v, edge_data))
+            else:
+                edge_data = g.get_edge(v, u)
+                if edge_data:
+                    edges_data.append((v, u, edge_data))
+        return nodes_data, edges_data
 
     @staticmethod
     def _build_adjacency_list(
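A quick sketch of the call contract, assuming a Community can be built from just its nodes/edges fields; the ToyGraph class is a hypothetical in-memory stand-in for a BaseGraphStorage, only there to show the reversed-edge fallback.

from graphgen.bases import BasePartitioner
from graphgen.bases.datatypes import Community


class ToyGraph:  # hypothetical stand-in for a BaseGraphStorage
    _nodes = {"a": {"desc": "entity a"}, "b": {"desc": "entity b"}}
    _edges = {("a", "b"): {"relation": "linked"}}

    def get_node(self, node_id):
        return self._nodes.get(node_id)

    def get_edge(self, u, v):
        return self._edges.get((u, v))


# The edge is stored as (a, b); asking for (b, a) exercises the fallback branch.
comm = Community(nodes=["a", "b"], edges=[("b", "a")])
nodes_data, edges_data = BasePartitioner.community2batch(comm, ToyGraph())
print(nodes_data)  # [("a", {...}), ("b", {...})]
print(edges_data)  # [("a", "b", {"relation": "linked"})]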
graphgen/bases/base_reader.py CHANGED

@@ -1,8 +1,10 @@
 import os
 from abc import ABC, abstractmethod
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Union
 
+import pandas as pd
 import requests
+from ray.data import Dataset
 
 
 class BaseReader(ABC):

@@ -10,56 +12,70 @@ class BaseReader(ABC):
     Abstract base class for reading and processing data.
     """
 
-    def __init__(self, text_column: str = "content"):
+    def __init__(self, text_column: str = "content", modalities: list = None):
         self.text_column = text_column
+        self.modalities = modalities if modalities is not None else ["text"]
 
     @abstractmethod
-    def read(self, …
+    def read(self, input_path: Union[str, List[str]]) -> Dataset:
         """
         Read data from the specified file path.
 
-        :param …
-        :return: …
+        :param input_path: Path to the input file or list of file paths.
+        :return: Ray Dataset containing the read data.
         """
 
-    … [deleted helper lines are truncated in this view]
-        :return: True if the image exists, False otherwise.
-        """
-        if not path_or_url:
-            return False
-        if not path_or_url.startswith(("http://", "https://", "ftp://")):
-            path = path_or_url.replace("file://", "", 1)
-            path = os.path.abspath(path)
-            return os.path.isfile(path)
-        try:
-            resp = requests.head(path_or_url, allow_redirects=True, timeout=timeout)
-            return resp.status_code == 200
-        except requests.RequestException:
-            return False
+    def _should_keep_item(self, item: Dict[str, Any]) -> bool:
+        """
+        Determine whether to keep the given item based on the text column.
+
+        :param item: Dictionary representing a data entry.
+        :return: True if the item should be kept, False otherwise.
+        """
+        item_type = item.get("type")
+        assert item_type in [
+            "text",
+            "image",
+            "table",
+            "equation",
+            "protein",
+        ], f"Unsupported item type: {item_type}"
+        if item_type == "text":
+            content = item.get(self.text_column, "").strip()
+            return bool(content)
+        return True
 
+    def _validate_batch(self, batch: pd.DataFrame) -> pd.DataFrame:
+        """
+        Validate data format.
+        """
+        if "type" not in batch.columns:
+            raise ValueError(f"Missing 'type' column. Found: {list(batch.columns)}")
+
+        if "text" in batch["type"].values:
+            if self.text_column not in batch.columns:
+                raise ValueError(
+                    f"Missing '{self.text_column}' column for text documents"
+                )
+
+        return batch
+
+    @staticmethod
+    def _image_exists(path_or_url: str, timeout: int = 3) -> bool:
+        """
+        Check if an image exists at the given local path or URL.
+        :param path_or_url: Local file path or remote URL of the image.
+        :param timeout: Timeout for remote URL requests in seconds.
+        :return: True if the image exists, False otherwise.
+        """
+        if not path_or_url:
+            return False
+        if not path_or_url.startswith(("http://", "https://", "ftp://")):
+            path = path_or_url.replace("file://", "", 1)
+            path = os.path.abspath(path)
+            return os.path.isfile(path)
+        try:
+            resp = requests.head(path_or_url, allow_redirects=True, timeout=timeout)
+            return resp.status_code == 200
+        except requests.RequestException:
+            return False
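As a rough illustration of the new contract, a concrete reader might look like the sketch below. SimpleJsonlReader is hypothetical (the repo's actual readers live under graphgen/models/reader/); it only shows read() returning a Ray Dataset and _should_keep_item filtering rows, assuming each input row carries a "type" column.

from typing import List, Union

import ray
from ray.data import Dataset

from graphgen.bases.base_reader import BaseReader


class SimpleJsonlReader(BaseReader):  # hypothetical illustration
    def read(self, input_path: Union[str, List[str]]) -> Dataset:
        paths = [input_path] if isinstance(input_path, str) else input_path
        ds = ray.data.read_json(paths)  # ray.data.read_json also handles .jsonl
        # Drop rows the base class considers empty (e.g. blank text content).
        return ds.filter(self._should_keep_item)


reader = SimpleJsonlReader(text_column="content")
ds = reader.read("resources/input_examples/jsonl_demo.jsonl")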
graphgen/bases/base_splitter.py CHANGED

@@ -4,7 +4,7 @@ from abc import ABC, abstractmethod
 from typing import Callable, Iterable, List, Literal, Optional, Union
 
 from graphgen.bases.datatypes import Chunk
-from graphgen.utils import logger
+from graphgen.utils.log import logger
 
 
 class BaseSplitter(ABC):

@@ -33,7 +33,7 @@ class BaseSplitter(ABC):
         """
         Split the input text into smaller chunks.
 
-        :param text: The input text to be …
+        :param text: The input text to be chunk.
         :return: A list of text chunks.
         """

@@ -111,7 +111,7 @@ class BaseSplitter(ABC):
 def _split_text_with_regex(
     text: str, separator: str, keep_separator: Union[bool, Literal["start", "end"]]
 ) -> List[str]:
-    # Now that we have the separator, …
+    # Now that we have the separator, chunk the text
     if separator:
         if keep_separator:
             # The parentheses in the pattern keep the delimiters in the result.
graphgen/bases/base_storage.py CHANGED

@@ -16,23 +16,6 @@ class StorageNameSpace:
         """commit the storage operations after querying"""
 
 
-class BaseListStorage(Generic[T], StorageNameSpace):
-    def all_items(self) -> list[T]:
-        raise NotImplementedError
-
-    def get_by_index(self, index: int) -> Union[T, None]:
-        raise NotImplementedError
-
-    def append(self, data: T):
-        raise NotImplementedError
-
-    def upsert(self, data: list[T]):
-        raise NotImplementedError
-
-    def drop(self):
-        raise NotImplementedError
-
-
 class BaseKVStorage(Generic[T], StorageNameSpace):
     def all_keys(self) -> list[str]:
         raise NotImplementedError

@@ -58,6 +41,9 @@ class BaseKVStorage(Generic[T], StorageNameSpace):
     def drop(self):
         raise NotImplementedError
 
+    def reload(self):
+        raise NotImplementedError
+
 
 class BaseGraphStorage(StorageNameSpace):
     def has_node(self, node_id: str) -> bool:

@@ -105,3 +91,6 @@ class BaseGraphStorage(StorageNameSpace):
 
     def delete_node(self, node_id: str):
         raise NotImplementedError
+
+    def reload(self):
+        raise NotImplementedError
graphgen/bases/datatypes.py CHANGED

@@ -2,6 +2,8 @@ import math
 from dataclasses import dataclass, field
 from typing import List, Union
 
+from pydantic import BaseModel, Field, field_validator
+
 
 @dataclass
 class Chunk:

@@ -48,3 +50,45 @@ class Community:
     nodes: List[str] = field(default_factory=list)
     edges: List[tuple] = field(default_factory=list)
     metadata: dict = field(default_factory=dict)
+
+
+class Node(BaseModel):
+    id: str = Field(..., description="unique node id")
+    op_name: str = Field(..., description="operator name")
+    type: str = Field(
+        ..., description="task type, e.g., map, filter, flatmap, aggregate, map_batch"
+    )
+    params: dict = Field(default_factory=dict, description="operator parameters")
+    dependencies: List[str] = Field(
+        default_factory=list, description="list of dependent node ids"
+    )
+    execution_params: dict = Field(
+        default_factory=dict, description="execution parameters like replicas, batch_size"
+    )
+
+    @field_validator("type")
+    @classmethod
+    def validate_type(cls, v: str) -> str:
+        valid_types = {"map", "filter", "flatmap", "aggregate", "map_batch"}
+        if v not in valid_types:
+            raise ValueError(f"Invalid node type: {v}. Must be one of {valid_types}.")
+        return v
+
+
+class Config(BaseModel):
+    global_params: dict = Field(
+        default_factory=dict, description="global context for the computation graph"
+    )
+
+    nodes: List[Node] = Field(
+        ..., min_length=1, description="list of nodes in the computation graph"
+    )
+
+    @field_validator("nodes")
+    @classmethod
+    def validate_unique_ids(cls, v: List[Node]) -> List[Node]:
+        ids = [node.id for node in v]
+        if len(ids) != len(set(ids)):
+            duplicates = {id_ for id_ in ids if ids.count(id_) > 1}
+            raise ValueError(f"Duplicate node ids found: {duplicates}")
+        return v
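To make the schema concrete, here is a hedged sketch of building and validating a two-node pipeline config with these models; the node ids and params are illustrative.

from graphgen.bases.datatypes import Config, Node

cfg = Config(
    global_params={"working_dir": "cache"},
    nodes=[
        Node(id="chunk", op_name="chunk", type="map_batch",
             params={"chunk_size": 1024}, dependencies=[]),
        Node(id="build_kg", op_name="build_kg", type="map_batch",
             dependencies=["chunk"]),
    ],
)

# An unknown type (or a duplicate node id in Config.nodes) is rejected;
# pydantic's ValidationError subclasses ValueError:
try:
    Node(id="bad", op_name="x", type="reduce")
except ValueError as err:
    print(err)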
graphgen/{operators/init → common}/__init__.py RENAMED

@@ -1 +1,2 @@
 from .init_llm import init_llm
+from .init_storage import init_storage
graphgen/{operators/init → common}/init_llm.py RENAMED

@@ -1,56 +1,152 @@
 import os
 from typing import Any, Dict, Optional
 
+import ray
+
 from graphgen.bases import BaseLLMWrapper
+from graphgen.common.init_storage import get_actor_handle
 from graphgen.models import Tokenizer
 
 
-class …:
+class LLMServiceActor:
     """
-    A …
-    Supported backends include:
-    - http_api: HTTPClient
-    - openai_api: OpenAIClient
-    - ollama_api: OllamaClient
-    - huggingface: HuggingFaceWrapper
-    - sglang: SGLangWrapper
+    A Ray actor class to wrap LLM wrapper instances for distributed usage.
     """
 
-    … [deleted lines are truncated in this view]
-        tokenizer …
-            os.environ.get("TOKENIZER_MODEL", "cl100k_base"),
-        )
+    def __init__(self, backend: str, config: Dict[str, Any]):
+        self.backend = backend
+        tokenizer_model = os.environ.get("TOKENIZER_MODEL", "cl100k_base")
+        tokenizer = Tokenizer(model_name=tokenizer_model)
         config["tokenizer"] = tokenizer
+
         if backend == "http_api":
             from graphgen.models.llm.api.http_client import HTTPClient
 
-            …
+            self.llm_instance = HTTPClient(**config)
+        elif backend in ("openai_api", "azure_openai_api"):
             from graphgen.models.llm.api.openai_client import OpenAIClient
+
             # pass in concrete backend to the OpenAIClient so that internally we can distinguish
             # between OpenAI and Azure OpenAI
-            …
+            self.llm_instance = OpenAIClient(**config, backend=backend)
+        elif backend == "ollama_api":
             from graphgen.models.llm.api.ollama_client import OllamaClient
 
-            …
+            self.llm_instance = OllamaClient(**config)
+        elif backend == "huggingface":
             from graphgen.models.llm.local.hf_wrapper import HuggingFaceWrapper
 
-            …
+            self.llm_instance = HuggingFaceWrapper(**config)
+        elif backend == "sglang":
             from graphgen.models.llm.local.sglang_wrapper import SGLangWrapper
 
-            …
-            # …
-            # return VLLMWrapper(**config)
+            self.llm_instance = SGLangWrapper(**config)
+
+        elif backend == "vllm":
+            from graphgen.models.llm.local.vllm_wrapper import VLLMWrapper
+
+            self.llm_instance = VLLMWrapper(**config)
+        else:
+            raise NotImplementedError(f"Backend {backend} is not implemented yet.")
+
+    async def generate_answer(
+        self, text: str, history: Optional[list[str]] = None, **extra: Any
+    ) -> str:
+        return await self.llm_instance.generate_answer(text, history, **extra)
+
+    async def generate_topk_per_token(
+        self, text: str, history: Optional[list[str]] = None, **extra: Any
+    ) -> list:
+        return await self.llm_instance.generate_topk_per_token(text, history, **extra)
+
+    async def generate_inputs_prob(
+        self, text: str, history: Optional[list[str]] = None, **extra: Any
+    ) -> list:
+        return await self.llm_instance.generate_inputs_prob(text, history, **extra)
+
+    def ready(self) -> bool:
+        """A simple method to check if the actor is ready."""
+        return True
+
+
+class LLMServiceProxy(BaseLLMWrapper):
+    """
+    A proxy class to interact with the LLMServiceActor for distributed LLM operations.
+    """
+
+    def __init__(self, actor_name: str):
+        super().__init__()
+        self.actor_handle = get_actor_handle(actor_name)
+        self._create_local_tokenizer()
+
+    async def generate_answer(
+        self, text: str, history: Optional[list[str]] = None, **extra: Any
+    ) -> str:
+        object_ref = self.actor_handle.generate_answer.remote(text, history, **extra)
+        return await object_ref
+
+    async def generate_topk_per_token(
+        self, text: str, history: Optional[list[str]] = None, **extra: Any
+    ) -> list:
+        object_ref = self.actor_handle.generate_topk_per_token.remote(
+            text, history, **extra
+        )
+        return await object_ref
+
+    async def generate_inputs_prob(
+        self, text: str, history: Optional[list[str]] = None, **extra: Any
+    ) -> list:
+        object_ref = self.actor_handle.generate_inputs_prob.remote(
+            text, history, **extra
+        )
+        return await object_ref
+
+    def _create_local_tokenizer(self):
+        tokenizer_model = os.environ.get("TOKENIZER_MODEL", "cl100k_base")
+        self.tokenizer = Tokenizer(model_name=tokenizer_model)
+
+
+class LLMFactory:
+    """
+    A factory class to create LLM wrapper instances based on the specified backend.
+    Supported backends include:
+    - http_api: HTTPClient
+    - openai_api: OpenAIClient
+    - ollama_api: OllamaClient
+    - huggingface: HuggingFaceWrapper
+    - sglang: SGLangWrapper
+    """
+
+    @staticmethod
+    def create_llm(
+        model_type: str, backend: str, config: Dict[str, Any]
+    ) -> BaseLLMWrapper:
+        if not config:
+            raise ValueError(
+                f"No configuration provided for LLM {model_type} with backend {backend}."
+            )
+
+        actor_name = f"Actor_LLM_{model_type}"
+        try:
+            ray.get_actor(actor_name)
+        except ValueError:
+            print(f"Creating Ray actor for LLM {model_type} with backend {backend}.")
+            num_gpus = int(config.pop("num_gpus", 0))
+            actor = (
+                ray.remote(LLMServiceActor)
+                .options(
+                    name=actor_name,
+                    num_gpus=num_gpus,
+                    lifetime="detached",
+                    get_if_exists=True,
+                )
+                .remote(backend, config)
+            )
 
+            # wait for actor to be ready
+            ray.get(actor.ready.remote())
 
+        return LLMServiceProxy(actor_name)
 
 
 def _load_env_group(prefix: str) -> Dict[str, Any]:

@@ -77,5 +173,5 @@ def init_llm(model_type: str) -> Optional[BaseLLMWrapper]:
     if not config:
         return None
     backend = config.pop("backend")
-    llm_wrapper = LLMFactory.…
+    llm_wrapper = LLMFactory.create_llm(model_type, backend, config)
    return llm_wrapper
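A hedged sketch of driving this factory end to end: the endpoint, key, and model values are placeholders, and the lowercase "synthesizer" argument is an inference from how app.py sets SYNTHESIZER_* variables, since _load_env_group's exact prefix handling is not shown in this view.

import os

import ray

from graphgen.common import init_llm

ray.init(ignore_reinit_error=True)

# Placeholder credentials; init_llm reads them from the environment.
os.environ["SYNTHESIZER_BACKEND"] = "openai_api"
os.environ["SYNTHESIZER_BASE_URL"] = "https://api.openai.com/v1"
os.environ["SYNTHESIZER_API_KEY"] = "sk-..."
os.environ["SYNTHESIZER_MODEL"] = "gpt-4o-mini"

# The first call creates a detached Actor_LLM_* actor; later calls (even from
# other Ray workers) find the named actor and just return a new proxy to it.
llm = init_llm("synthesizer")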
graphgen/common/init_storage.py
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any, Dict, Union
|
| 2 |
+
|
| 3 |
+
import ray
|
| 4 |
+
|
| 5 |
+
from graphgen.bases.base_storage import BaseGraphStorage, BaseKVStorage
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class KVStorageActor:
|
| 9 |
+
def __init__(self, backend: str, working_dir: str, namespace: str):
|
| 10 |
+
if backend == "json_kv":
|
| 11 |
+
from graphgen.models import JsonKVStorage
|
| 12 |
+
|
| 13 |
+
self.kv = JsonKVStorage(working_dir, namespace)
|
| 14 |
+
elif backend == "rocksdb":
|
| 15 |
+
from graphgen.models import RocksDBKVStorage
|
| 16 |
+
|
| 17 |
+
self.kv = RocksDBKVStorage(working_dir, namespace)
|
| 18 |
+
else:
|
| 19 |
+
raise ValueError(f"Unknown KV backend: {backend}")
|
| 20 |
+
|
| 21 |
+
def data(self) -> Dict[str, Dict]:
|
| 22 |
+
return self.kv.data
|
| 23 |
+
|
| 24 |
+
def all_keys(self) -> list[str]:
|
| 25 |
+
return self.kv.all_keys()
|
| 26 |
+
|
| 27 |
+
def index_done_callback(self):
|
| 28 |
+
return self.kv.index_done_callback()
|
| 29 |
+
|
| 30 |
+
def get_by_id(self, id: str) -> Dict:
|
| 31 |
+
return self.kv.get_by_id(id)
|
| 32 |
+
|
| 33 |
+
def get_by_ids(self, ids: list[str], fields=None) -> list:
|
| 34 |
+
return self.kv.get_by_ids(ids, fields)
|
| 35 |
+
|
| 36 |
+
def get_all(self) -> Dict[str, Dict]:
|
| 37 |
+
return self.kv.get_all()
|
| 38 |
+
|
| 39 |
+
def filter_keys(self, data: list[str]) -> set[str]:
|
| 40 |
+
return self.kv.filter_keys(data)
|
| 41 |
+
|
| 42 |
+
def upsert(self, data: dict) -> dict:
|
| 43 |
+
return self.kv.upsert(data)
|
| 44 |
+
|
| 45 |
+
def drop(self):
|
| 46 |
+
return self.kv.drop()
|
| 47 |
+
|
| 48 |
+
def reload(self):
|
| 49 |
+
return self.kv.reload()
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class GraphStorageActor:
|
| 53 |
+
def __init__(self, backend: str, working_dir: str, namespace: str):
|
| 54 |
+
if backend == "networkx":
|
| 55 |
+
from graphgen.models import NetworkXStorage
|
| 56 |
+
|
| 57 |
+
self.graph = NetworkXStorage(working_dir, namespace)
|
| 58 |
+
elif backend == "kuzu":
|
| 59 |
+
from graphgen.models import KuzuStorage
|
| 60 |
+
|
| 61 |
+
self.graph = KuzuStorage(working_dir, namespace)
|
| 62 |
+
else:
|
| 63 |
+
raise ValueError(f"Unknown Graph backend: {backend}")
|
| 64 |
+
|
| 65 |
+
def index_done_callback(self):
|
| 66 |
+
return self.graph.index_done_callback()
|
| 67 |
+
|
| 68 |
+
def has_node(self, node_id: str) -> bool:
|
| 69 |
+
return self.graph.has_node(node_id)
|
| 70 |
+
|
| 71 |
+
def has_edge(self, source_node_id: str, target_node_id: str):
|
| 72 |
+
return self.graph.has_edge(source_node_id, target_node_id)
|
| 73 |
+
|
| 74 |
+
def node_degree(self, node_id: str) -> int:
|
| 75 |
+
return self.graph.node_degree(node_id)
|
| 76 |
+
|
| 77 |
+
def edge_degree(self, src_id: str, tgt_id: str) -> int:
|
| 78 |
+
return self.graph.edge_degree(src_id, tgt_id)
|
| 79 |
+
|
| 80 |
+
def get_node(self, node_id: str) -> Any:
|
| 81 |
+
return self.graph.get_node(node_id)
|
| 82 |
+
|
| 83 |
+
def update_node(self, node_id: str, node_data: dict[str, str]):
|
| 84 |
+
return self.graph.update_node(node_id, node_data)
|
| 85 |
+
|
| 86 |
+
def get_all_nodes(self) -> Any:
|
| 87 |
+
return self.graph.get_all_nodes()
|
| 88 |
+
|
| 89 |
+
def get_edge(self, source_node_id: str, target_node_id: str):
|
| 90 |
+
return self.graph.get_edge(source_node_id, target_node_id)
|
| 91 |
+
|
| 92 |
+
def update_edge(
|
| 93 |
+
self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]
|
| 94 |
+
):
|
| 95 |
+
return self.graph.update_edge(source_node_id, target_node_id, edge_data)
|
| 96 |
+
|
| 97 |
+
def get_all_edges(self) -> Any:
|
| 98 |
+
return self.graph.get_all_edges()
|
| 99 |
+
|
| 100 |
+
def get_node_edges(self, source_node_id: str) -> Any:
|
| 101 |
+
return self.graph.get_node_edges(source_node_id)
|
| 102 |
+
|
| 103 |
+
def upsert_node(self, node_id: str, node_data: dict[str, str]):
|
| 104 |
+
return self.graph.upsert_node(node_id, node_data)
|
| 105 |
+
|
| 106 |
+
def upsert_edge(
|
| 107 |
+
self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]
|
| 108 |
+
):
|
| 109 |
+
return self.graph.upsert_edge(source_node_id, target_node_id, edge_data)
|
| 110 |
+
|
| 111 |
+
def delete_node(self, node_id: str):
|
| 112 |
+
return self.graph.delete_node(node_id)
|
| 113 |
+
|
| 114 |
+
def reload(self):
|
| 115 |
+
return self.graph.reload()
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def get_actor_handle(name: str):
|
| 119 |
+
try:
|
| 120 |
+
return ray.get_actor(name)
|
| 121 |
+
except ValueError as exc:
|
| 122 |
+
raise RuntimeError(
|
| 123 |
+
f"Actor {name} not found. Make sure it is created before accessing."
|
| 124 |
+
) from exc
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
class RemoteKVStorageProxy(BaseKVStorage):
|
| 128 |
+
def __init__(self, namespace: str):
|
| 129 |
+
super().__init__()
|
| 130 |
+
self.namespace = namespace
|
| 131 |
+
self.actor_name = f"Actor_KV_{namespace}"
|
| 132 |
+
self.actor = get_actor_handle(self.actor_name)
|
| 133 |
+
|
| 134 |
+
def data(self) -> Dict[str, Any]:
|
| 135 |
+
return ray.get(self.actor.data.remote())
|
| 136 |
+
|
| 137 |
+
def all_keys(self) -> list[str]:
|
| 138 |
+
return ray.get(self.actor.all_keys.remote())
|
| 139 |
+
|
| 140 |
+
def index_done_callback(self):
|
| 141 |
+
return ray.get(self.actor.index_done_callback.remote())
|
| 142 |
+
|
| 143 |
+
def get_by_id(self, id: str) -> Union[Any, None]:
|
| 144 |
+
+        return ray.get(self.actor.get_by_id.remote(id))
+
+    def get_by_ids(self, ids: list[str], fields=None) -> list[Any]:
+        return ray.get(self.actor.get_by_ids.remote(ids, fields))
+
+    def get_all(self) -> Dict[str, Any]:
+        return ray.get(self.actor.get_all.remote())
+
+    def filter_keys(self, data: list[str]) -> set[str]:
+        return ray.get(self.actor.filter_keys.remote(data))
+
+    def upsert(self, data: Dict[str, Any]):
+        return ray.get(self.actor.upsert.remote(data))
+
+    def drop(self):
+        return ray.get(self.actor.drop.remote())
+
+    def reload(self):
+        return ray.get(self.actor.reload.remote())
+
+
+class RemoteGraphStorageProxy(BaseGraphStorage):
+    def __init__(self, namespace: str):
+        super().__init__()
+        self.namespace = namespace
+        self.actor_name = f"Actor_Graph_{namespace}"
+        self.actor = get_actor_handle(self.actor_name)
+
+    def index_done_callback(self):
+        return ray.get(self.actor.index_done_callback.remote())
+
+    def has_node(self, node_id: str) -> bool:
+        return ray.get(self.actor.has_node.remote(node_id))
+
+    def has_edge(self, source_node_id: str, target_node_id: str):
+        return ray.get(self.actor.has_edge.remote(source_node_id, target_node_id))
+
+    def node_degree(self, node_id: str) -> int:
+        return ray.get(self.actor.node_degree.remote(node_id))
+
+    def edge_degree(self, src_id: str, tgt_id: str) -> int:
+        return ray.get(self.actor.edge_degree.remote(src_id, tgt_id))
+
+    def get_node(self, node_id: str) -> Any:
+        return ray.get(self.actor.get_node.remote(node_id))
+
+    def update_node(self, node_id: str, node_data: dict[str, str]):
+        return ray.get(self.actor.update_node.remote(node_id, node_data))
+
+    def get_all_nodes(self) -> Any:
+        return ray.get(self.actor.get_all_nodes.remote())
+
+    def get_edge(self, source_node_id: str, target_node_id: str):
+        return ray.get(self.actor.get_edge.remote(source_node_id, target_node_id))
+
+    def update_edge(
+        self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]
+    ):
+        return ray.get(
+            self.actor.update_edge.remote(source_node_id, target_node_id, edge_data)
+        )
+
+    def get_all_edges(self) -> Any:
+        return ray.get(self.actor.get_all_edges.remote())
+
+    def get_node_edges(self, source_node_id: str) -> Any:
+        return ray.get(self.actor.get_node_edges.remote(source_node_id))
+
+    def upsert_node(self, node_id: str, node_data: dict[str, str]):
+        return ray.get(self.actor.upsert_node.remote(node_id, node_data))
+
+    def upsert_edge(
+        self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]
+    ):
+        return ray.get(
+            self.actor.upsert_edge.remote(source_node_id, target_node_id, edge_data)
+        )
+
+    def delete_node(self, node_id: str):
+        return ray.get(self.actor.delete_node.remote(node_id))
+
+    def reload(self):
+        return ray.get(self.actor.reload.remote())
+
+
+class StorageFactory:
+    """
+    Factory class to create storage instances based on backend.
+    """
+
+    @staticmethod
+    def create_storage(backend: str, working_dir: str, namespace: str):
+        if backend in ["json_kv", "rocksdb"]:
+            actor_name = f"Actor_KV_{namespace}"
+            try:
+                ray.get_actor(actor_name)
+            except ValueError:
+                ray.remote(KVStorageActor).options(
+                    name=actor_name,
+                    lifetime="detached",
+                    get_if_exists=True,
+                ).remote(backend, working_dir, namespace)
+            return RemoteKVStorageProxy(namespace)
+        if backend in ["networkx", "kuzu"]:
+            actor_name = f"Actor_Graph_{namespace}"
+            try:
+                ray.get_actor(actor_name)
+            except ValueError:
+                ray.remote(GraphStorageActor).options(
+                    name=actor_name,
+                    lifetime="detached",
+                    get_if_exists=True,
+                ).remote(backend, working_dir, namespace)
+            return RemoteGraphStorageProxy(namespace)
+        raise ValueError(f"Unknown storage backend: {backend}")
+
+
+def init_storage(backend: str, working_dir: str, namespace: str):
+    return StorageFactory.create_storage(backend, working_dir, namespace)
graphgen/configs/aggregated_config.yaml
DELETED
@@ -1,41 +0,0 @@
-pipeline:
-  - name: read_step  # step name is unique in the pipeline, and can be referenced by other steps
-    op_key: read
-    params:
-      input_file: resources/input_examples/jsonl_demo.jsonl  # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-
-  - name: chunk_step
-    op_key: chunk
-    deps: [read_step]  # chunk_step depends on read_step
-    params:
-      chunk_size: 1024  # chunk size for text splitting
-      chunk_overlap: 100  # chunk overlap for text splitting
-
-  - name: build_kg_step
-    op_key: build_kg
-    deps: [chunk_step]  # build_kg_step depends on chunk_step
-
-  - name: quiz_and_judge_step
-    op_key: quiz_and_judge
-    deps: [build_kg_step]  # quiz_and_judge depends on build_kg_step
-    params:
-      quiz_samples: 2  # number of quiz samples to generate
-      re_judge: false  # whether to re-judge the existing quiz samples
-
-  - name: partition_step
-    op_key: partition
-    deps: [quiz_and_judge_step]  # partition_step depends on quiz_and_judge_step
-    params:
-      method: ece  # ece is a custom partition method based on comprehension loss
-      method_params:
-        max_units_per_community: 20  # max nodes and edges per community
-        min_units_per_community: 5  # min nodes and edges per community
-        max_tokens_per_community: 10240  # max tokens per community
-        unit_sampling: max_loss  # unit sampling strategy, support: random, max_loss, min_loss
-
-  - name: generate_step
-    op_key: generate
-    deps: [partition_step]  # generate_step depends on partition_step
-    params:
-      method: aggregated  # atomic, aggregated, multi_hop, cot, vqa
-      data_format: ChatML  # Alpaca, Sharegpt, ChatML
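All of the deleted configs in this commit share the same legacy `pipeline` schema: a list of named steps, each binding an operator via `op_key` and its upstream steps via `deps`. For reference, a minimal loader/validator for that schema, sketched here; the function name `load_pipeline` is illustrative, and the dependency check mirrors the `_validate` helper removed from graphgen/engine.py below.

# Sketch only: loads a legacy pipeline YAML and checks that every dep exists.
import yaml


def load_pipeline(path: str) -> list[dict]:
    with open(path, encoding="utf-8") as f:
        stages = yaml.safe_load(f)["pipeline"]
    names = {s["name"] for s in stages}
    for s in stages:
        for dep in s.get("deps", []):
            if dep not in names:
                raise ValueError(f"Stage {s['name']} has unknown dependency: {dep}")
    return stages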
graphgen/configs/atomic_config.yaml
DELETED
@@ -1,31 +0,0 @@
-pipeline:
-  - name: read_step
-    op_key: read
-    params:
-      input_file: resources/input_examples/json_demo.json  # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
-
-  - name: chunk_step
-    op_key: chunk
-    deps: [read_step]  # chunk_step depends on read_step
-    params:
-      chunk_size: 1024  # chunk size for text splitting
-      chunk_overlap: 100  # chunk overlap for text splitting
-
-  - name: build_kg_step
-    op_key: build_kg
-    deps: [chunk_step]  # build_kg depends on chunk_step
-
-  - name: partition_step
-    op_key: partition
-    deps: [build_kg]  # partition_step depends on build_kg
-    params:
-      method: dfs  # partition method, support: dfs, bfs, ece, leiden
-      method_params:
-        max_units_per_community: 1  # atomic partition, one node or edge per community
-
-  - name: generate_step
-    op_key: generate
-    deps: [partition_step]  # generate_step depends on partition_step
-    params:
-      method: atomic  # atomic, aggregated, multi_hop, cot, vqa
-      data_format: Alpaca  # Alpaca, Sharegpt, ChatML
graphgen/configs/cot_config.yaml
DELETED
@@ -1,33 +0,0 @@
-pipeline:
-  - name: read_step
-    op_key: read
-    params:
-      input_file: resources/input_examples/txt_demo.txt  # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-
-  - name: chunk_step
-    op_key: chunk
-    deps: [read_step]  # chunk_step depends on read_step
-    params:
-      chunk_size: 1024  # chunk size for text splitting
-      chunk_overlap: 100  # chunk overlap for text splitting
-
-  - name: build_kg_step
-    op_key: build_kg
-    deps: [chunk_step]  # build_kg depends on chunk_step
-
-  - name: partition_step
-    op_key: partition
-    deps: [build_kg_step]  # partition_step depends on build_kg
-    params:
-      method: leiden  # leiden is a partitioner detection algorithm
-      method_params:
-        max_size: 20  # Maximum size of communities
-        use_lcc: false  # whether to use the largest connected component
-        random_seed: 42  # random seed for partitioning
-
-  - name: generate_step
-    op_key: generate
-    deps: [partition_step]  # generate_step depends on partition_step
-    params:
-      method: cot  # atomic, aggregated, multi_hop, cot, vqa
-      data_format: Sharegpt  # Alpaca, Sharegpt, ChatML
graphgen/configs/multi_hop_config.yaml
DELETED
@@ -1,34 +0,0 @@
-pipeline:
-  - name: read_step
-    op_key: read
-    params:
-      input_file: resources/input_examples/csv_demo.csv  # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-
-  - name: chunk_step
-    op_key: chunk
-    deps: [read_step]  # chunk_step depends on read_step
-    params:
-      chunk_size: 1024  # chunk size for text splitting
-      chunk_overlap: 100  # chunk overlap for text splitting
-
-  - name: build_kg_step
-    op_key: build_kg
-    deps: [chunk_step]  # build_kg_step depends on chunk_step
-
-  - name: partition_step
-    op_key: partition
-    deps: [build_kg_step]  # partition_step depends on build_kg_step
-    params:
-      method: ece  # ece is a custom partition method based on comprehension loss
-      method_params:
-        max_units_per_community: 3  # max nodes and edges per community, for multi-hop, we recommend setting it to 3
-        min_units_per_community: 3  # min nodes and edges per community, for multi-hop, we recommend setting it to 3
-        max_tokens_per_community: 10240  # max tokens per community
-        unit_sampling: random  # unit sampling strategy, support: random, max_loss, min_loss
-
-  - name: generate_step
-    op_key: generate
-    deps: [partition_step]  # generate_step depends on partition_step
-    params:
-      method: multi_hop  # atomic, aggregated, multi_hop, cot, vqa
-      data_format: ChatML  # Alpaca, Sharegpt, ChatML
graphgen/configs/schema_guided_extraction_config.yaml
DELETED
@@ -1,20 +0,0 @@
-pipeline:
-  - name: read_step
-    op_key: read
-    params:
-      input_file: resources/input_examples/extract_demo.txt  # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-
-  - name: chunk_step
-    op_key: chunk
-    deps: [read_step]  # chunk_step depends on read_step
-    params:
-      chunk_size: 20480
-      chunk_overlap: 2000
-      separators: []
-
-  - name: extract_step
-    op_key: extract
-    deps: [chunk_step]  # extract_step depends on chunk_step
-    params:
-      method: schema_guided  # extraction method, support: schema_guided
-      schema_file: graphgen/templates/extraction/schemas/legal_contract.json  # schema file path for schema_guided method
graphgen/configs/search_dna_config.yaml
DELETED
@@ -1,17 +0,0 @@
-pipeline:
-  - name: read_step
-    op_key: read
-    params:
-      input_file: resources/input_examples/search_dna_demo.jsonl  # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-
-  - name: search_step
-    op_key: search
-    deps: [read_step]  # search_step depends on read_step
-    params:
-      data_sources: [ncbi]  # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral
-      ncbi_params:
-        email: test@example.com  # NCBI requires an email address
-        tool: GraphGen  # tool name for NCBI API
-        use_local_blast: true  # whether to use local blast for DNA search
-        local_blast_db: refseq_release/refseq_release  # path to local BLAST database (without .nhr extension)
-
graphgen/configs/search_protein_config.yaml
DELETED
@@ -1,15 +0,0 @@
-pipeline:
-  - name: read_step
-    op_key: read
-    params:
-      input_file: resources/input_examples/search_protein_demo.jsonl  # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-
-  - name: search_step
-    op_key: search
-    deps: [read_step]  # search_step depends on read_step
-    params:
-      data_sources: [uniprot]  # data source for searcher, support: wikipedia, google, uniprot
-      uniprot_params:
-        use_local_blast: true  # whether to use local blast for uniprot search
-        local_blast_db: /your_path/2024_01/uniprot_sprot  # format: /path/to/${RELEASE}/uniprot_sprot
-        # options: uniprot_sprot (recommended, high quality), uniprot_trembl, or uniprot_${RELEASE} (merged database)
graphgen/configs/search_rna_config.yaml
DELETED
@@ -1,14 +0,0 @@
-pipeline:
-  - name: read_step
-    op_key: read
-    params:
-      input_file: resources/input_examples/search_rna_demo.jsonl  # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-
-  - name: search_step
-    op_key: search
-    deps: [read_step]  # search_step depends on read_step
-    params:
-      data_sources: [rnacentral]  # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral
-      rnacentral_params:
-        use_local_blast: true  # whether to use local blast for RNA search
-        local_blast_db: rnacentral_ensembl_gencode_YYYYMMDD/ensembl_gencode_YYYYMMDD  # path to local BLAST database (without .nhr extension)
graphgen/configs/vqa_config.yaml
DELETED
@@ -1,32 +0,0 @@
-pipeline:
-  - name: read_step
-    op_key: read
-    params:
-      input_file: resources/input_examples/vqa_demo.json  # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-
-  - name: chunk_step
-    op_key: chunk
-    deps: [read_step]  # chunk_step depends on read_step
-    params:
-      chunk_size: 1024  # chunk size for text splitting
-      chunk_overlap: 100  # chunk overlap for text splitting
-
-  - name: build_kg_step
-    op_key: build_kg
-    deps: [chunk_step]  # build_kg depends on chunk_step
-
-  - name: partition_step
-    op_key: partition
-    deps: [build_kg_step]  # partition_step depends on build_kg_step
-    params:
-      method: anchor_bfs  # partition method
-      method_params:
-        anchor_type: image  # node type to select anchor nodes
-        max_units_per_community: 10  # atomic partition, one node or edge per community
-
-  - name: generate_step
-    op_key: generate
-    deps: [partition_step]  # generate_step depends on partition_step
-    params:
-      method: vqa  # atomic, aggregated, multi_hop, cot, vqa
-      data_format: ChatML  # Alpaca, Sharegpt, ChatML
graphgen/engine.py
CHANGED
@@ -1,125 +1,210 @@
-import threading
-from typing import Any, Callable, List
-
-
-class Context(dict):
-    _lock = threading.Lock()
-
-    def set(self, k, v):
-        with self._lock:
-            self[k] = v
-
-    def get(self, k, default=None):
-        with self._lock:
-            return super().get(k, default)
-
-
-class OpNode:
-    def __init__(
-        self,
-    ):
-
-        raise ValueError(
-            "Please check your configuration."
-        )
-
-
-def _validate(ops: List[OpNode]):
-    name_set = set()
-    for op in ops:
-        if op.name in name_set:
-            raise ValueError(f"Duplicate operation name: {op.name}")
-        name_set.add(op.name)
-    for op in ops:
-        for dep in op.deps:
-            if dep not in name_set:
-                raise ValueError(
-                    f"Operation {op.name} has unknown dependency: {dep}"
-                )
-
-    for stage in config["pipeline"]:
-        name = stage["name"]
-        method_name = stage.get("op_key")
-        method = getattr(graph_gen, method_name)
-        deps = stage.get("deps", [])
-        ops.append(op_node)
-    return ops
+import inspect
+import logging
+from collections import defaultdict, deque
+from functools import wraps
+from typing import Any, Callable, Dict, List, Set
+
+import ray
+import ray.data
+
+from graphgen.bases import Config, Node
+from graphgen.utils import logger
+
+
+class Engine:
+    def __init__(
+        self, config: Dict[str, Any], functions: Dict[str, Callable], **ray_init_kwargs
+    ):
+        self.config = Config(**config)
+        self.global_params = self.config.global_params
+        self.functions = functions
+        self.datasets: Dict[str, ray.data.Dataset] = {}
+
+        if not ray.is_initialized():
+            context = ray.init(
+                ignore_reinit_error=True,
+                logging_level=logging.ERROR,
+                log_to_driver=True,
+                **ray_init_kwargs,
+            )
+            logger.info("Ray Dashboard URL: %s", context.dashboard_url)
+
+    @staticmethod
+    def _topo_sort(nodes: List[Node]) -> List[Node]:
+        id_to_node: Dict[str, Node] = {}
+        for n in nodes:
+            id_to_node[n.id] = n
+
+        indeg: Dict[str, int] = {nid: 0 for nid in id_to_node}
+        adj: Dict[str, List[str]] = defaultdict(list)
+
+        for n in nodes:
+            nid = n.id
+            deps: List[str] = n.dependencies
+            uniq_deps: Set[str] = set(deps)
+            for d in uniq_deps:
+                if d not in id_to_node:
+                    raise ValueError(
+                        f"The dependency node id {d} of node {nid} is not defined in the configuration."
+                    )
+                indeg[nid] += 1
+                adj[d].append(nid)
+
+        zero_deg: deque = deque(
+            [id_to_node[nid] for nid, deg in indeg.items() if deg == 0]
+        )
+        sorted_nodes: List[Node] = []
+
+        while zero_deg:
+            cur = zero_deg.popleft()
+            sorted_nodes.append(cur)
+            cur_id = cur.id
+            for nb_id in adj.get(cur_id, []):
+                indeg[nb_id] -= 1
+                if indeg[nb_id] == 0:
+                    zero_deg.append(id_to_node[nb_id])
+
+        if len(sorted_nodes) != len(nodes):
+            remaining = [nid for nid, deg in indeg.items() if deg > 0]
+            raise ValueError(
+                f"The configuration contains cycles, unable to execute. Remaining nodes with indegree > 0: {remaining}"
+            )
+
+        return sorted_nodes
+
+    def _get_input_dataset(
+        self, node: Node, initial_ds: ray.data.Dataset
+    ) -> ray.data.Dataset:
+        deps = node.dependencies
+
+        if not deps:
+            return initial_ds
+
+        if len(deps) == 1:
+            return self.datasets[deps[0]]
+
+        main_ds = self.datasets[deps[0]]
+        other_dss = [self.datasets[d] for d in deps[1:]]
+        return main_ds.union(*other_dss)
+
+    def _execute_node(self, node: Node, initial_ds: ray.data.Dataset):
+        def _filter_kwargs(
+            func_or_class: Callable,
+            global_params: Dict[str, Any],
+            func_params: Dict[str, Any],
+        ) -> Dict[str, Any]:
+            """
+            1. global_params: only when specified in function signature, will be passed
+            2. func_params: pass specified params first, then **kwargs if exists
+            """
+            try:
+                sig = inspect.signature(func_or_class)
+            except ValueError:
+                return {}
+
+            params = sig.parameters
+            final_kwargs = {}
+
+            has_var_keywords = any(
+                p.kind == inspect.Parameter.VAR_KEYWORD for p in params.values()
+            )
+            valid_keys = set(params.keys())
+            for k, v in global_params.items():
+                if k in valid_keys:
+                    final_kwargs[k] = v
+
+            for k, v in func_params.items():
+                if k in valid_keys or has_var_keywords:
+                    final_kwargs[k] = v
+            return final_kwargs
+
+        if node.op_name not in self.functions:
+            raise ValueError(f"Operator {node.op_name} not found for node {node.id}")
+
+        op_handler = self.functions[node.op_name]
+        node_params = _filter_kwargs(op_handler, self.global_params, node.params or {})
+
+        if node.type == "source":
+            self.datasets[node.id] = op_handler(**node_params)
+            return
+
+        input_ds = self._get_input_dataset(node, initial_ds)
+
+        if inspect.isclass(op_handler):
+            execution_params = node.execution_params or {}
+            replicas = execution_params.get("replicas", 1)
+            batch_size = (
+                int(execution_params.get("batch_size"))
+                if "batch_size" in execution_params
+                else "default"
+            )
+            compute_resources = execution_params.get("compute_resources", {})
+
+            if node.type == "aggregate":
+                self.datasets[node.id] = input_ds.repartition(1).map_batches(
+                    op_handler,
+                    compute=ray.data.ActorPoolStrategy(min_size=1, max_size=1),
+                    batch_size=None,  # aggregate processes the whole dataset at once
+                    num_gpus=compute_resources.get("num_gpus", 0)
+                    if compute_resources
+                    else 0,
+                    fn_constructor_kwargs=node_params,
+                    batch_format="pandas",
+                )
+            else:
+                # others like map, filter, flatmap, map_batch let actors process data inside batches
+                self.datasets[node.id] = input_ds.map_batches(
+                    op_handler,
+                    compute=ray.data.ActorPoolStrategy(min_size=1, max_size=replicas),
+                    batch_size=batch_size,
+                    num_gpus=compute_resources.get("num_gpus", 0)
+                    if compute_resources
+                    else 0,
+                    fn_constructor_kwargs=node_params,
+                    batch_format="pandas",
+                )
+
+        else:
+
+            @wraps(op_handler)
+            def func_wrapper(row_or_batch: Dict[str, Any]) -> Dict[str, Any]:
+                return op_handler(row_or_batch, **node_params)
+
+            if node.type == "map":
+                self.datasets[node.id] = input_ds.map(func_wrapper)
+            elif node.type == "filter":
+                self.datasets[node.id] = input_ds.filter(func_wrapper)
+            elif node.type == "flatmap":
+                self.datasets[node.id] = input_ds.flat_map(func_wrapper)
+            elif node.type == "aggregate":
+                self.datasets[node.id] = input_ds.repartition(1).map_batches(
+                    func_wrapper, batch_format="default"
+                )
+            elif node.type == "map_batch":
+                self.datasets[node.id] = input_ds.map_batches(func_wrapper)
+            else:
+                raise ValueError(
+                    f"Unsupported node type {node.type} for node {node.id}"
+                )
+
+    @staticmethod
+    def _find_leaf_nodes(nodes: List[Node]) -> Set[str]:
+        all_ids = {n.id for n in nodes}
+        deps_set = set()
+        for n in nodes:
+            deps_set.update(n.dependencies)
+        return all_ids - deps_set
+
+    def execute(self, initial_ds: ray.data.Dataset) -> Dict[str, ray.data.Dataset]:
+        sorted_nodes = self._topo_sort(self.config.nodes)
+
+        for node in sorted_nodes:
+            self._execute_node(node, initial_ds)
+
+        leaf_nodes = self._find_leaf_nodes(sorted_nodes)
+
+        @ray.remote
+        def _fetch_result(ds: ray.data.Dataset) -> List[Any]:
+            return ds.take_all()
+
+        return {node_id: self.datasets[node_id] for node_id in leaf_nodes}
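A sketch of driving the new Ray-based Engine end to end. The `upper` operator and the node dict are invented for illustration, and it assumes `Config` accepts plain dicts for its `nodes` entries; the field names (`id`, `op_name`, `type`, `dependencies`, `params`) are those the code above reads.

# Sketch only: one "map" node applied to an in-memory dataset.
import ray.data

from graphgen.engine import Engine


def upper(row, **_):  # toy operator, not part of the repo
    row["text"] = row["text"].upper()
    return row


config = {
    "global_params": {},
    "nodes": [
        {"id": "upper_step", "op_name": "upper", "type": "map",
         "dependencies": [], "params": {}},
    ],
}

engine = Engine(config=config, functions={"upper": upper})
ds = ray.data.from_items([{"text": "hello"}, {"text": "world"}])
leaves = engine.execute(ds)  # leaf node ids -> lazy Ray datasets
print(leaves["upper_step"].take_all())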
graphgen/graphgen.py
DELETED
@@ -1,295 +0,0 @@
-import os
-import time
-from typing import Dict
-
-import gradio as gr
-
-from graphgen.bases import BaseLLMWrapper
-from graphgen.bases.datatypes import Chunk
-from graphgen.models import (
-    JsonKVStorage,
-    JsonListStorage,
-    NetworkXStorage,
-    OpenAIClient,
-    Tokenizer,
-)
-from graphgen.operators import (
-    build_kg,
-    chunk_documents,
-    extract_info,
-    generate_qas,
-    init_llm,
-    judge_statement,
-    partition_kg,
-    quiz,
-    read_files,
-    search_all,
-)
-from graphgen.utils import async_to_sync_method, compute_mm_hash, logger
-
-sys_path = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
-
-
-class GraphGen:
-    def __init__(
-        self,
-        unique_id: int = int(time.time()),
-        working_dir: str = os.path.join(sys_path, "cache"),
-        tokenizer_instance: Tokenizer = None,
-        synthesizer_llm_client: OpenAIClient = None,
-        trainee_llm_client: OpenAIClient = None,
-        progress_bar: gr.Progress = None,
-    ):
-        self.unique_id: int = unique_id
-        self.working_dir: str = working_dir
-
-        # llm
-        self.tokenizer_instance: Tokenizer = tokenizer_instance or Tokenizer(
-            model_name=os.getenv("TOKENIZER_MODEL", "cl100k_base")
-        )
-
-        self.synthesizer_llm_client: BaseLLMWrapper = (
-            synthesizer_llm_client or init_llm("synthesizer")
-        )
-        self.trainee_llm_client: BaseLLMWrapper = trainee_llm_client
-
-        self.full_docs_storage: JsonKVStorage = JsonKVStorage(
-            self.working_dir, namespace="full_docs"
-        )
-        self.chunks_storage: JsonKVStorage = JsonKVStorage(
-            self.working_dir, namespace="chunks"
-        )
-        self.graph_storage: NetworkXStorage = NetworkXStorage(
-            self.working_dir, namespace="graph"
-        )
-        self.rephrase_storage: JsonKVStorage = JsonKVStorage(
-            self.working_dir, namespace="rephrase"
-        )
-        self.partition_storage: JsonListStorage = JsonListStorage(
-            self.working_dir, namespace="partition"
-        )
-        self.search_storage: JsonKVStorage = JsonKVStorage(
-            os.path.join(self.working_dir, "data", "graphgen", f"{self.unique_id}"),
-            namespace="search",
-        )
-        self.qa_storage: JsonListStorage = JsonListStorage(
-            os.path.join(self.working_dir, "data", "graphgen", f"{self.unique_id}"),
-            namespace="qa",
-        )
-        self.extract_storage: JsonKVStorage = JsonKVStorage(
-            os.path.join(self.working_dir, "data", "graphgen", f"{self.unique_id}"),
-            namespace="extraction",
-        )
-
-        # webui
-        self.progress_bar: gr.Progress = progress_bar
-
-    @async_to_sync_method
-    async def read(self, read_config: Dict):
-        """
-        read files from input sources
-        """
-        doc_stream = read_files(**read_config, cache_dir=self.working_dir)
-
-        batch = {}
-        for doc in doc_stream:
-            doc_id = compute_mm_hash(doc, prefix="doc-")
-            batch[doc_id] = doc
-
-        # TODO: configurable whether to use coreference resolution
-
-        _add_doc_keys = self.full_docs_storage.filter_keys(list(batch.keys()))
-        new_docs = {k: v for k, v in batch.items() if k in _add_doc_keys}
-        if len(new_docs) == 0:
-            logger.warning("All documents are already in the storage")
-            return
-        self.full_docs_storage.upsert(new_docs)
-        self.full_docs_storage.index_done_callback()
-
-    @async_to_sync_method
-    async def chunk(self, chunk_config: Dict):
-        """
-        chunk documents into smaller pieces from full_docs_storage if not already present
-        """
-        new_docs = self.full_docs_storage.get_all()
-        if len(new_docs) == 0:
-            logger.warning("All documents are already in the storage")
-            return
-
-        inserting_chunks = await chunk_documents(
-            new_docs,
-            self.tokenizer_instance,
-            self.progress_bar,
-            **chunk_config,
-        )
-
-        _add_chunk_keys = self.chunks_storage.filter_keys(list(inserting_chunks.keys()))
-        inserting_chunks = {
-            k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
-        }
-
-        if len(inserting_chunks) == 0:
-            logger.warning("All chunks are already in the storage")
-            return
-
-        self.chunks_storage.upsert(inserting_chunks)
-        self.chunks_storage.index_done_callback()
-
-    @async_to_sync_method
-    async def build_kg(self):
-        """
-        build knowledge graph from text chunks
-        """
-        # Step 1: get new chunks
-        inserting_chunks = self.chunks_storage.get_all()
-
-        if len(inserting_chunks) == 0:
-            logger.warning("All chunks are already in the storage")
-            return
-
-        logger.info("[New Chunks] inserting %d chunks", len(inserting_chunks))
-        # Step 2: build knowledge graph from new chunks
-        _add_entities_and_relations = await build_kg(
-            llm_client=self.synthesizer_llm_client,
-            kg_instance=self.graph_storage,
-            chunks=[Chunk.from_dict(k, v) for k, v in inserting_chunks.items()],
-            progress_bar=self.progress_bar,
-        )
-        if not _add_entities_and_relations:
-            logger.warning("No entities or relations extracted from text chunks")
-            return
-
-        # Step 3: upsert new entities and relations to the graph storage
-        self.graph_storage.index_done_callback()
-
-        return _add_entities_and_relations
-
-    @async_to_sync_method
-    async def search(self, search_config: Dict):
-        logger.info("[Search] %s ...", ", ".join(search_config["data_sources"]))
-
-        seeds = self.full_docs_storage.get_all()
-        if len(seeds) == 0:
-            logger.warning("All documents are already been searched")
-            return
-        search_results = await search_all(
-            seed_data=seeds,
-            search_config=search_config,
-        )
-
-        _add_search_keys = self.search_storage.filter_keys(list(search_results.keys()))
-        search_results = {
-            k: v for k, v in search_results.items() if k in _add_search_keys
-        }
-        if len(search_results) == 0:
-            logger.warning("All search results are already in the storage")
-            return
-        self.search_storage.upsert(search_results)
-        self.search_storage.index_done_callback()
-
-    @async_to_sync_method
-    async def quiz_and_judge(self, quiz_and_judge_config: Dict):
-        logger.warning(
-            "Quiz and Judge operation needs trainee LLM client."
-            " Make sure to provide one."
-        )
-        max_samples = quiz_and_judge_config["quiz_samples"]
-        await quiz(
-            self.synthesizer_llm_client,
-            self.graph_storage,
-            self.rephrase_storage,
-            max_samples,
-            progress_bar=self.progress_bar,
-        )
-
-        # TODO: assert trainee_llm_client is valid before judge
-        if not self.trainee_llm_client:
-            # TODO: shutdown existing synthesizer_llm_client properly
-            logger.info("No trainee LLM client provided, initializing a new one.")
-            self.synthesizer_llm_client.shutdown()
-            self.trainee_llm_client = init_llm("trainee")
-
-        re_judge = quiz_and_judge_config["re_judge"]
-        _update_relations = await judge_statement(
-            self.trainee_llm_client,
-            self.graph_storage,
-            self.rephrase_storage,
-            re_judge,
-            progress_bar=self.progress_bar,
-        )
-
-        self.rephrase_storage.index_done_callback()
-        _update_relations.index_done_callback()
-
-        logger.info("Shutting down trainee LLM client.")
-        self.trainee_llm_client.shutdown()
-        self.trainee_llm_client = None
-        logger.info("Restarting synthesizer LLM client.")
-        self.synthesizer_llm_client.restart()
-
-    @async_to_sync_method
-    async def partition(self, partition_config: Dict):
-        batches = await partition_kg(
-            self.graph_storage,
-            self.chunks_storage,
-            self.tokenizer_instance,
-            partition_config,
-        )
-        self.partition_storage.upsert(batches)
-        return batches
-
-    @async_to_sync_method
-    async def extract(self, extract_config: Dict):
-        logger.info("Extracting information from given chunks...")
-
-        results = await extract_info(
-            self.synthesizer_llm_client,
-            self.chunks_storage,
-            extract_config,
-            progress_bar=self.progress_bar,
-        )
-        if not results:
-            logger.warning("No information extracted")
-            return
-
-        self.extract_storage.upsert(results)
-        self.extract_storage.index_done_callback()
-
-    @async_to_sync_method
-    async def generate(self, generate_config: Dict):
-
-        batches = self.partition_storage.data
-        if not batches:
-            logger.warning("No partitions found for QA generation")
-            return
-
-        # Step 2: generate QA pairs
-        results = await generate_qas(
-            self.synthesizer_llm_client,
-            batches,
-            generate_config,
-            progress_bar=self.progress_bar,
-        )
-
-        if not results:
-            logger.warning("No QA pairs generated")
-            return
-
-        # Step 3: store the generated QA pairs
-        self.qa_storage.upsert(results)
-        self.qa_storage.index_done_callback()
-
-    @async_to_sync_method
-    async def clear(self):
-        self.full_docs_storage.drop()
-        self.chunks_storage.drop()
-        self.search_storage.drop()
-        self.graph_storage.clear()
-        self.rephrase_storage.drop()
-        self.qa_storage.drop()
-
-        logger.info("All caches are cleared")
-
-    # TODO: add data filtering step here in the future
-    # graph_gen.filter(filter_config=config["filter"])
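For reference, the deleted facade above was driven stage by stage from synchronous code (the async methods are sync-wrapped by async_to_sync_method). A sketch of that pre-commit call pattern, with config dicts abbreviated to the keys the deleted pipeline configs used:

# Sketch only: the legacy GraphGen workflow removed in this commit.
from graphgen.graphgen import GraphGen

gg = GraphGen(working_dir="./cache")
gg.read({"input_file": "resources/input_examples/json_demo.json"})
gg.chunk({"chunk_size": 1024, "chunk_overlap": 100})
gg.build_kg()
gg.partition({"method": "dfs", "method_params": {"max_units_per_community": 1}})
gg.generate({"method": "atomic", "data_format": "Alpaca"})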
graphgen/models/__init__.py
CHANGED
@@ -18,7 +18,6 @@ from .partitioner import (
 )
 from .reader import (
     CSVReader,
-    JSONLReader,
     JSONReader,
     ParquetReader,
     PDFReader,
@@ -33,5 +32,11 @@ from .searcher.kg.wiki_search import WikiSearch
 from .searcher.web.bing_search import BingSearch
 from .searcher.web.google_search import GoogleSearch
 from .splitter import ChineseRecursiveTextSplitter, RecursiveCharacterSplitter
-from .storage import
+from .storage import (
+    JsonKVStorage,
+    KuzuStorage,
+    NetworkXStorage,
+    RocksDBCache,
+    RocksDBKVStorage,
+)
 from .tokenizer import Tokenizer
graphgen/models/extractor/schema_guided_extractor.py
CHANGED
@@ -60,8 +60,8 @@ class SchemaGuidedExtractor(BaseExtractor):
         return prompt
 
     async def extract(self, chunk: dict) -> dict:
-        _chunk_id =
-        text = chunk
+        _chunk_id = chunk.get("_chunk_id", "")
+        text = chunk.get("content", "")
 
         prompt = self.build_prompt(text)
         response = await self.llm_client.generate_answer(prompt)
@@ -88,9 +88,7 @@ class SchemaGuidedExtractor(BaseExtractor):
             return {}
 
     @staticmethod
-    def merge_extractions(
-        extraction_list: List[Dict[str, dict]]
-    ) -> Dict[str, dict]:
+    def merge_extractions(extraction_list: List[Dict[str, dict]]) -> Dict[str, dict]:
        """
         Merge multiple extraction results based on their hashes.
         :param extraction_list: List of extraction results, each is a dict with hash as key and record as value.
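A usage sketch for the static merger above. The hash keys and record values are invented, and last-writer-wins, dict-style merge semantics are an assumption, since the method body lies outside this hunk.

# Sketch only: merging per-chunk extraction results keyed by content hash.
from graphgen.models.extractor.schema_guided_extractor import SchemaGuidedExtractor

batch_results = [
    {"hash-a": {"party": "Acme Corp"}},   # result from chunk 1 (values invented)
    {"hash-b": {"party": "Globex LLC"}},  # result from chunk 2
]
merged = SchemaGuidedExtractor.merge_extractions(batch_results)
print(sorted(merged))  # ["hash-a", "hash-b"]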
graphgen/models/generator/vqa_generator.py
CHANGED
@@ -77,8 +77,8 @@ class VQAGenerator(BaseGenerator):
         nodes, _ = batch
         for node in nodes:
             node_data = node[1]
-            if "
-                img_path = node_data["
+            if "image_data" in node_data and node_data["image_data"]:
+                img_path = node_data["image_data"]["img_path"]
                 for qa in qa_pairs.values():
                     qa["img_path"] = img_path
                 result.update(qa_pairs)
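The node payload shape implied by this fix, sketched with invented values:

# Sketch only: an image-anchored node tuple as the generator now expects it.
node = (
    "img_node_1",  # node id; node_data = node[1]
    {"image_data": {"img_path": "resources/images/demo.png"}},
)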
graphgen/models/llm/local/sglang_wrapper.py
CHANGED
@@ -138,15 +138,3 @@ class SGLangWrapper(BaseLLMWrapper):
         raise NotImplementedError(
             "SGLangWrapper does not support per-token logprobs yet."
         )
-
-    def shutdown(self) -> None:
-        """Gracefully shutdown the SGLang engine."""
-        if hasattr(self, "engine"):
-            self.engine.shutdown()
-
-    def restart(self) -> None:
-        """Restart the SGLang engine."""
-        self.shutdown()
-        self.engine = self.engine.__class__(
-            model_path=self.model_path, tp_size=self.tp_size
-        )
graphgen/models/llm/local/vllm_wrapper.py
CHANGED
@@ -1,3 +1,5 @@
+import math
+import uuid
 from typing import Any, List, Optional
 
 from graphgen.bases.base_llm_wrapper import BaseLLMWrapper
@@ -6,7 +8,7 @@ from graphgen.bases.datatypes import Token
 
 class VLLMWrapper(BaseLLMWrapper):
     """
-    Async inference backend based on vLLM
+    Async inference backend based on vLLM.
     """
 
     def __init__(
@@ -20,12 +22,11 @@ class VLLMWrapper(BaseLLMWrapper):
         **kwargs: Any,
     ):
         super().__init__(temperature=temperature, top_p=top_p, **kwargs)
-
         try:
             from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams
         except ImportError as exc:
             raise ImportError(
-                "VLLMWrapper requires vllm. Install it with:
+                "VLLMWrapper requires vllm. Install it with: uv pip install vllm"
             ) from exc
 
         self.SamplingParams = SamplingParams
@@ -35,9 +36,9 @@ class VLLMWrapper(BaseLLMWrapper):
             tensor_parallel_size=tensor_parallel_size,
             gpu_memory_utilization=gpu_memory_utilization,
             trust_remote_code=kwargs.get("trust_remote_code", True),
+            disable_log_stats=False,
         )
         self.engine = AsyncLLMEngine.from_engine_args(engine_args)
-
         self.temperature = temperature
         self.top_p = top_p
         self.topk = topk
@@ -60,6 +61,7 @@ class VLLMWrapper(BaseLLMWrapper):
         self, text: str, history: Optional[List[str]] = None, **extra: Any
     ) -> str:
         full_prompt = self._build_inputs(text, history)
+        request_id = f"graphgen_req_{uuid.uuid4()}"
 
         sp = self.SamplingParams(
             temperature=self.temperature if self.temperature > 0 else 1.0,
@@ -67,71 +69,57 @@ class VLLMWrapper(BaseLLMWrapper):
             max_tokens=extra.get("max_new_tokens", 512),
         )
 
+        result_generator = self.engine.generate(full_prompt, sp, request_id=request_id)
+
+        final_output = None
+        async for request_output in result_generator:
+            final_output = request_output
+
+        if not final_output or not final_output.outputs:
+            return ""
+
+        return final_output.outputs[0].text
 
     async def generate_topk_per_token(
         self, text: str, history: Optional[List[str]] = None, **extra: Any
     ) -> List[Token]:
         full_prompt = self._build_inputs(text, history)
 
+        request_id = f"graphgen_topk_{uuid.uuid4()}"
+
         sp = self.SamplingParams(
             temperature=0,
             max_tokens=1,
             logprobs=self.topk,
         )
 
+        result_generator = self.engine.generate(full_prompt, sp, request_id=request_id)
+
+        final_output = None
+        async for request_output in result_generator:
+            final_output = request_output
+
+        if (
+            not final_output
+            or not final_output.outputs
+            or not final_output.outputs[0].logprobs
+        ):
+            return []
+
+        top_logprobs = final_output.outputs[0].logprobs[0]
 
         tokens = []
         for _, logprob_obj in top_logprobs.items():
             tok_str = logprob_obj.decoded_token
-            prob = float(logprob_obj.logprob
+            prob = float(math.exp(logprob_obj.logprob))
             tokens.append(Token(tok_str, prob))
+
         tokens.sort(key=lambda x: -x.prob)
         return tokens
 
     async def generate_inputs_prob(
        self, text: str, history: Optional[List[str]] = None, **extra: Any
    ) -> List[Token]:
-        full_prompt = self._build_inputs(text, history)
-
-        # vLLM has no ready-made "mask one token, then compute its prob" API,
-        # so we take the most direct route: feed the whole prompt in at once,
-        # enable prompt_logprobs so vLLM returns a logprob for every position
-        # of the *input* part, then pick out each token's probability.
-        sp = self.SamplingParams(
-            temperature=0,
-            max_tokens=0,  # generate no new tokens
-            prompt_logprobs=1,  # top-1 is enough
-        )
-
-        results = []
-        async for req_output in self.engine.generate(
-            full_prompt, sp, request_id="graphgen_prob"
-        ):
-            results = req_output.outputs
-
-        # prompt_logprobs is a list whose length equals the number of prompt
-        # tokens; each element is a dict {token_id: logprob_obj} or None
-        # (the first position is None)
-        prompt_logprobs = results[-1].prompt_logprobs
-
-        tokens = []
-        for _, logprob_dict in enumerate(prompt_logprobs):
-            if logprob_dict is None:
-                continue
-            # each dict holds exactly one kv pair here, because of top-1
-            _, logprob_obj = next(iter(logprob_dict.items()))
-            tok_str = logprob_obj.decoded_token
-            prob = float(logprob_obj.logprob.exp())
-            tokens.append(Token(tok_str, prob))
-        return tokens
+        raise NotImplementedError(
+            "VLLMWrapper does not support per-token logprobs yet."
+        )
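A sketch of calling the wrapper from synchronous code. The model name is a placeholder, constructor arguments outside this hunk are assumed, and a GPU with the model weights is required; the string-returning method is assumed to be named generate_answer, matching the call sites elsewhere in the repo (e.g. the extractor above).

# Sketch only: driving the async vLLM wrapper with asyncio.
import asyncio

from graphgen.models.llm.local.vllm_wrapper import VLLMWrapper


async def main() -> None:
    llm = VLLMWrapper(model="Qwen/Qwen2.5-7B-Instruct")  # constructor args assumed
    print(await llm.generate_answer("What is a knowledge graph?"))

    top = await llm.generate_topk_per_token("Paris is the capital of")
    print(top)  # list[Token], sorted by descending probability


asyncio.run(main())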
graphgen/models/partitioner/anchor_bfs_partitioner.py
CHANGED
@@ -1,6 +1,6 @@
 import random
 from collections import deque
-from typing import Any, List, Literal, Set, Tuple
+from typing import Any, Iterable, List, Literal, Set, Tuple
 
 from graphgen.bases import BaseGraphStorage
 from graphgen.bases.datatypes import Community
@@ -30,24 +30,23 @@ class AnchorBFSPartitioner(BFSPartitioner):
         self.anchor_type = anchor_type
         self.anchor_ids = anchor_ids
 
-    async def partition(
+    def partition(
         self,
         g: BaseGraphStorage,
         max_units_per_community: int = 1,
         **kwargs: Any,
-    ) -> List[Community]:
+    ) -> Iterable[Community]:
         nodes = g.get_all_nodes()  # List[tuple[id, meta]]
         edges = g.get_all_edges()  # List[tuple[u, v, meta]]
 
         adj, _ = self._build_adjacency_list(nodes, edges)
 
-        anchors: Set[str] = await self._pick_anchor_ids(nodes)
+        anchors: Set[str] = self._pick_anchor_ids(nodes)
         if not anchors:
-            return
+            return  # if no anchors, return nothing
 
         used_n: set[str] = set()
         used_e: set[frozenset[str]] = set()
-        communities: List[Community] = []
 
         seeds = list(anchors)
         random.shuffle(seeds)
@@ -55,17 +54,13 @@ class AnchorBFSPartitioner(BFSPartitioner):
         for seed_node in seeds:
             if seed_node in used_n:
                 continue
-            comm_n, comm_e = await self._grow_community(
+            comm_n, comm_e = self._grow_community(
                 seed_node, adj, max_units_per_community, used_n, used_e
             )
             if comm_n or comm_e:
-                communities.append(
-                    Community(id=len(communities), nodes=comm_n, edges=comm_e)
-                )
+                yield Community(id=seed_node, nodes=comm_n, edges=comm_e)
 
-    async def _pick_anchor_ids(
+    def _pick_anchor_ids(
         self,
         nodes: List[tuple[str, dict]],
     ) -> Set[str]:
@@ -80,7 +75,7 @@ class AnchorBFSPartitioner(BFSPartitioner):
         return anchor_ids
 
     @staticmethod
-    async def _grow_community(
+    def _grow_community(
         seed: str,
         adj: dict[str, List[str]],
         max_units: int,
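With partition now a generator, callers iterate communities lazily instead of receiving a fully materialized list. A consumption sketch; constructor arguments for both classes are assumed, and the anchor_type value follows the deleted vqa config above.

# Sketch only: communities stream out one at a time, so large graphs are
# no longer accumulated in memory before generation starts.
from graphgen.models import NetworkXStorage
from graphgen.models.partitioner.anchor_bfs_partitioner import AnchorBFSPartitioner

g = NetworkXStorage("./cache", namespace="graph")
partitioner = AnchorBFSPartitioner(anchor_type="image")

for community in partitioner.partition(g, max_units_per_community=10):
    print(community.id, len(community.nodes), len(community.edges))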
graphgen/models/partitioner/bfs_partitioner.py
CHANGED
@@ -1,6 +1,6 @@
 import random
 from collections import deque
-from typing import Any, List
+from typing import Any, Iterable, List
 
 from graphgen.bases import BaseGraphStorage, BasePartitioner
 from graphgen.bases.datatypes import Community
@@ -17,12 +17,12 @@ class BFSPartitioner(BasePartitioner):
     (A unit is a node or an edge.)
     """
 
-    async def partition(
+    def partition(
         self,
         g: BaseGraphStorage,
         max_units_per_community: int = 1,
         **kwargs: Any,
-    ) -> List[Community]:
+    ) -> Iterable[Community]:
         nodes = g.get_all_nodes()
         edges = g.get_all_edges()
 
@@ -30,7 +30,6 @@ class BFSPartitioner(BasePartitioner):
 
         used_n: set[str] = set()
         used_e: set[frozenset[str]] = set()
-        communities: List[Community] = []
 
         units = [(NODE_UNIT, n[0]) for n in nodes] + [
             (EDGE_UNIT, frozenset((u, v))) for u, v, _ in edges
@@ -74,8 +73,4 @@ class BFSPartitioner(BasePartitioner):
             queue.append((NODE_UNIT, n))
 
         if comm_n or comm_e:
-            communities.append(
-                Community(id=len(communities), nodes=comm_n, edges=comm_e)
-            )
-
-        return communities
+            yield Community(id=seed, nodes=comm_n, edges=comm_e)
graphgen/models/partitioner/dfs_partitioner.py
CHANGED
@@ -1,5 +1,6 @@
 import random
-from typing import Any, List
+from collections.abc import Iterable
+from typing import Any
 
 from graphgen.bases import BaseGraphStorage, BasePartitioner
 from graphgen.bases.datatypes import Community
@@ -16,12 +17,12 @@ class DFSPartitioner(BasePartitioner):
     (In GraphGen, a unit is defined as a node or an edge.)
     """
 
-    async def partition(
+    def partition(
         self,
         g: BaseGraphStorage,
         max_units_per_community: int = 1,
         **kwargs: Any,
-    ) -> List[Community]:
+    ) -> Iterable[Community]:
         nodes = g.get_all_nodes()
         edges = g.get_all_edges()
 
@@ -29,7 +30,6 @@ class DFSPartitioner(BasePartitioner):
 
         used_n: set[str] = set()
         used_e: set[frozenset[str]] = set()
-        communities: List[Community] = []
 
         units = [(NODE_UNIT, n[0]) for n in nodes] + [
             (EDGE_UNIT, frozenset((u, v))) for u, v, _ in edges
@@ -71,8 +71,4 @@ class DFSPartitioner(BasePartitioner):
             stack.append((NODE_UNIT, n))
 
         if comm_n or comm_e:
-            communities.append(
-                Community(id=len(communities), nodes=comm_n, edges=comm_e)
-            )
-
-        return communities
+            yield Community(id=seed, nodes=comm_n, edges=comm_e)
graphgen/models/partitioner/ece_partitioner.py
CHANGED

@@ -1,8 +1,8 @@
-import asyncio
 import random
-from typing import Any, Dict, List, Optional, Set, Tuple
+from collections import deque
+from typing import Any, Dict, Iterable, List, Optional, Set, Tuple

-from tqdm.asyncio import tqdm
+from tqdm import tqdm

 from graphgen.bases import BaseGraphStorage
 from graphgen.bases.datatypes import Community
@@ -51,7 +51,7 @@ class ECEPartitioner(BFSPartitioner):
             raise ValueError(f"Invalid edge sampling: {edge_sampling}")
         return units

-    async def partition(
+    def partition(
         self,
         g: BaseGraphStorage,
         max_units_per_community: int = 10,
@@ -59,7 +59,7 @@ class ECEPartitioner(BFSPartitioner):
         max_tokens_per_community: int = 10240,
         unit_sampling: str = "random",
         **kwargs: Any,
-    ) -> List[Community]:
+    ) -> Iterable[Community]:
         nodes: List[Tuple[str, dict]] = g.get_all_nodes()
         edges: List[Tuple[str, str, dict]] = g.get_all_edges()

@@ -73,21 +73,18 @@ class ECEPartitioner(BFSPartitioner):

         used_n: Set[str] = set()
         used_e: Set[frozenset[str]] = set()
-        communities: List = []

         all_units = self._sort_units(all_units, unit_sampling)

-        async def _grow_community(
-            seed_unit: Tuple[str, Any, dict]
-        ) -> Optional[Community]:
+        def _grow_community(seed_unit: Tuple[str, Any, dict]) -> Optional[Community]:
             nonlocal used_n, used_e

             community_nodes: Dict[str, dict] = {}
             community_edges: Dict[frozenset[str], dict] = {}
-            queue = asyncio.Queue()
+            queue = deque()
             token_sum = 0

-            async def _add_unit(u):
+            def _add_unit(u):
                 nonlocal token_sum
                 t, i, d = u
                 if t == NODE_UNIT:  # node
@@ -103,11 +100,11 @@ class ECEPartitioner(BFSPartitioner):
                     token_sum += d.get("length", 0)
                 return True

-            await _add_unit(seed_unit)
-            await queue.put(seed_unit)
+            _add_unit(seed_unit)
+            queue.append(seed_unit)

             # BFS
-            while not queue.empty():
+            while queue:
                 if (
                     len(community_nodes) + len(community_edges)
                     >= max_units_per_community
@@ -115,7 +112,7 @@ class ECEPartitioner(BFSPartitioner):
                 ):
                     break

-                cur_type, cur_id, _ = await queue.get()
+                cur_type, cur_id, _ = queue.popleft()

                 neighbors: List[Tuple[str, Any, dict]] = []
                 if cur_type == NODE_UNIT:
@@ -136,26 +133,24 @@ class ECEPartitioner(BFSPartitioner):
                         or token_sum >= max_tokens_per_community
                     ):
                         break
-                    if await _add_unit(nb):
-                        await queue.put(nb)
+                    if _add_unit(nb):
+                        queue.append(nb)

             if len(community_nodes) + len(community_edges) < min_units_per_community:
                 return None

             return Community(
-                id=len(communities),
+                id=seed_unit[1],
                 nodes=list(community_nodes.keys()),
                 edges=[(u, v) for (u, v), _ in community_edges.items()],
             )

-        async for unit in tqdm(all_units, desc="ECE partition"):
+        for unit in tqdm(all_units, desc="ECE partition"):
             utype, uid, _ = unit
             if (utype == NODE_UNIT and uid in used_n) or (
                 utype == EDGE_UNIT and uid in used_e
             ):
                 continue
-            comm = await _grow_community(unit)
-            if comm is not None:
-                communities.append(comm)
-
-        return communities
+            comm = _grow_community(unit)
+            if comm:
+                yield comm
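
ECE growth is budgeted twice: a community stops absorbing neighbors once it holds max_units_per_community units or once the summed "length" fields reach max_tokens_per_community, and undersized results are discarded. A hedged usage sketch; the min_units_per_community keyword is an assumption inferred from the size check above, and g is any BaseGraphStorage such as the NetworkXStorage shown earlier:

from graphgen.models.partitioner.ece_partitioner import ECEPartitioner

for community in ECEPartitioner().partition(
    g,
    max_units_per_community=10,
    min_units_per_community=3,      # assumed parameter name
    max_tokens_per_community=10240,
    unit_sampling="random",
):
    print(community.id, len(community.nodes) + len(community.edges))
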
graphgen/models/partitioner/leiden_partitioner.py
CHANGED

@@ -13,7 +13,7 @@ class LeidenPartitioner(BasePartitioner):
     Leiden partitioner that partitions the graph into communities using the Leiden algorithm.
     """

-    async def partition(
+    def partition(
         self,
         g: BaseGraphStorage,
         max_size: int = 20,
@@ -37,12 +37,10 @@ class LeidenPartitioner(BasePartitioner):
         nodes = g.get_all_nodes()  # List[Tuple[str, dict]]
         edges = g.get_all_edges()  # List[Tuple[str, str, dict]]

-        node2cid: Dict[str, int] = await self._run_leiden(
-            nodes, edges, use_lcc, random_seed
-        )
+        node2cid: Dict[str, int] = self._run_leiden(nodes, edges, use_lcc, random_seed)

         if max_size is not None and max_size > 0:
-            node2cid = await self._split_communities(node2cid, max_size)
+            node2cid = self._split_communities(node2cid, max_size)

         cid2nodes: Dict[int, List[str]] = defaultdict(list)
         for n, cid in node2cid.items():
@@ -58,7 +56,7 @@ class LeidenPartitioner(BasePartitioner):
         return communities

     @staticmethod
-    async def _run_leiden(
+    def _run_leiden(
         nodes: List[Tuple[str, dict]],
         edges: List[Tuple[str, str, dict]],
         use_lcc: bool = False,
@@ -92,9 +90,7 @@ class LeidenPartitioner(BasePartitioner):
         return node2cid

     @staticmethod
-    async def _split_communities(
-        node2cid: Dict[str, int], max_size: int
-    ) -> Dict[str, int]:
+    def _split_communities(node2cid: Dict[str, int], max_size: int) -> Dict[str, int]:
         """
         Split communities larger than max_size into smaller sub-communities.
         """
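
The body of _split_communities is not shown in this view; below is a minimal, self-contained sketch of the idea its docstring names: chunk any community over max_size into consecutive sub-communities with fresh ids. Names and chunking order are assumptions, not the committed implementation.

from collections import defaultdict
from typing import Dict, List

def split_communities(node2cid: Dict[str, int], max_size: int) -> Dict[str, int]:
    # Group node ids by their current community id.
    cid2nodes: Dict[int, List[str]] = defaultdict(list)
    for node, cid in node2cid.items():
        cid2nodes[cid].append(node)

    new_map: Dict[str, int] = {}
    next_cid = 0
    for _, members in sorted(cid2nodes.items()):
        # Chunk oversized communities into blocks of at most max_size nodes;
        # communities already within the limit get exactly one block.
        for start in range(0, len(members), max_size):
            for node in members[start : start + max_size]:
                new_map[node] = next_cid
            next_cid += 1
    return new_map
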
graphgen/models/reader/__init__.py
CHANGED

@@ -1,6 +1,5 @@
 from .csv_reader import CSVReader
 from .json_reader import JSONReader
-from .jsonl_reader import JSONLReader
 from .parquet_reader import ParquetReader
 from .pdf_reader import PDFReader
 from .pickle_reader import PickleReader
graphgen/models/reader/csv_reader.py
CHANGED

@@ -1,6 +1,7 @@
-from typing import Any, Dict, List
+from typing import List, Union

-import pandas as pd
+import ray
+from ray.data import Dataset

 from graphgen.bases.base_reader import BaseReader

@@ -13,13 +14,15 @@ class CSVReader(BaseReader):
     - if type is "text", "content" column must be present.
     """

-    def read(self, file_path: str) -> List[Dict[str, Any]]:
-        ...
+    def read(self, input_path: Union[str, List[str]]) -> Dataset:
+        """
+        Read CSV files and return Ray Dataset.
+
+        :param input_path: Path to CSV file or list of CSV files.
+        :return: Ray Dataset containing validated and filtered data.
+        """
+
+        ds = ray.data.read_csv(input_path)
+        ds = ds.map_batches(self._validate_batch, batch_format="pandas")
+        ds = ds.filter(self._should_keep_item)
+        return ds
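
A quick end-to-end sketch of the CSV shape this reader expects; the file name is illustrative and the no-argument reader construction is an assumption about BaseReader defaults:

import pandas as pd
from graphgen.models.reader import CSVReader

# A "type" column is required; text rows must also carry "content".
pd.DataFrame(
    [
        {"type": "text", "content": "GraphGen builds QA data from graphs."},
        {"type": "text", "content": "Ray Data shards the reading work."},
    ]
).to_csv("docs.csv", index=False)

ds = CSVReader().read("docs.csv")
print(ds.take(2))  # rows that passed _validate_batch and _should_keep_item
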
graphgen/models/reader/json_reader.py
CHANGED

@@ -1,26 +1,53 @@
 import json
-from typing import Any, Dict, List
+from typing import List, Union
+
+import ray
+import ray.data

 from graphgen.bases.base_reader import BaseReader


 class JSONReader(BaseReader):
     """
-    Reader for JSON files.
+    Reader for JSON and JSONL files.
     Columns:
     - type: The type of the document (e.g., "text", "image", etc.)
     - if type is "text", "content" column must be present.
     """

-    def read(self, file_path: str) -> List[Dict[str, Any]]:
-        ...
+    def read(self, input_path: Union[str, List[str]]) -> ray.data.Dataset:
+        """
+        Read JSON file and return Ray Dataset.
+        :param input_path: Path to JSON/JSONL file or list of JSON/JSONL files.
+        :return: Ray Dataset containing validated and filtered data.
+        """
+        if self.modalities and len(self.modalities) >= 2:
+            ds: ray.data.Dataset = ray.data.from_items([])
+            for file in input_path if isinstance(input_path, list) else [input_path]:
+                data = []
+                if file.endswith(".jsonl"):
+                    with open(file, "r", encoding="utf-8") as f:
+                        for line in f:
+                            item = json.loads(line)
+                            data.append(item)
+                else:
+                    with open(file, "r", encoding="utf-8") as f:
+                        data = json.load(f)
+                data = self._unify_schema(data)
+                file_ds: ray.data.Dataset = ray.data.from_items(data)
+                ds = ds.union(file_ds)  # type: ignore
+        else:
+            ds = ray.data.read_json(input_path)
+        ds = ds.map_batches(self._validate_batch, batch_format="pandas")
+        ds = ds.filter(self._should_keep_item)
+        return ds
+
+    @staticmethod
+    def _unify_schema(data):
+        """
+        Unify schema for JSON data.
+        """
+        for item in data:
+            if "content" in item and isinstance(item["content"], dict):
+                item["content"] = json.dumps(item["content"])
+        return data
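
The deleted JSONLReader below is subsumed by this reader: in the single-modality path a .jsonl file goes straight through ray.data.read_json. A hedged round-trip sketch; the file name is illustrative and the default reader construction is assumed:

import json
from graphgen.models.reader import JSONReader

rows = [
    {"type": "text", "content": "Kuzu is an embedded graph database."},
    {"type": "text", "content": "NetworkX keeps the graph in memory."},
]
with open("docs.jsonl", "w", encoding="utf-8") as f:
    for row in rows:
        f.write(json.dumps(row) + "\n")

ds = JSONReader().read("docs.jsonl")
print(ds.count())
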
graphgen/models/reader/jsonl_reader.py
DELETED

@@ -1,30 +0,0 @@
-import json
-from typing import Any, Dict, List
-
-from graphgen.bases.base_reader import BaseReader
-from graphgen.utils import logger
-
-
-class JSONLReader(BaseReader):
-    """
-    Reader for JSONL files.
-    Columns:
-    - type: The type of the document (e.g., "text", "image", etc.)
-    - if type is "text", "content" column must be present.
-    """
-
-    def read(self, file_path: str) -> List[Dict[str, Any]]:
-        docs = []
-        with open(file_path, "r", encoding="utf-8") as f:
-            for line in f:
-                try:
-                    doc = json.loads(line)
-                    assert "type" in doc, f"Missing 'type' in document: {doc}"
-                    if doc.get("type") == "text" and self.text_column not in doc:
-                        raise ValueError(
-                            f"Missing '{self.text_column}' in document: {doc}"
-                        )
-                    docs.append(doc)
-                except json.JSONDecodeError as e:
-                    logger.error("Error decoding JSON line: %s. Error: %s", line, e)
-        return self.filter(docs)
graphgen/models/reader/parquet_reader.py
CHANGED

@@ -1,6 +1,7 @@
-from typing import Any, Dict, List
+from typing import List, Union

-import pandas as pd
+import ray
+from ray.data import Dataset

 from graphgen.bases.base_reader import BaseReader

@@ -13,12 +14,17 @@ class ParquetReader(BaseReader):
     - if type is "text", "content" column must be present.
     """

-    def read(self, file_path: str) -> List[Dict[str, Any]]:
-        ...
+    def read(self, input_path: Union[str, List[str]]) -> Dataset:
+        """
+        Read Parquet files using Ray Data.
+
+        :param input_path: Path to Parquet file or list of Parquet files.
+        :return: Ray Dataset containing validated documents.
+        """
+        if not ray.is_initialized():
+            ray.init()
+
+        ds = ray.data.read_parquet(input_path)
+        ds = ds.map_batches(self._validate_batch, batch_format="pandas")
+        ds = ds.filter(self._should_keep_item)
+        return ds
graphgen/models/reader/pdf_reader.py
CHANGED

@@ -5,6 +5,9 @@ import tempfile
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union

+import ray
+from ray.data import Dataset
+
 from graphgen.bases.base_reader import BaseReader
 from graphgen.models.reader.txt_reader import TXTReader
 from graphgen.utils import logger, pick_device
@@ -62,19 +65,31 @@ class PDFReader(BaseReader):
         self.parser = MinerUParser()
         self.txt_reader = TXTReader()

-    def read(
-        self,
-        file_path: str,
-        **override,
-    ) -> List[Dict[str, Any]]:
-        ...
-        return self.filter(mineru_result)
+    def read(
+        self,
+        input_path: Union[str, List[str]],
+        **override,
+    ) -> Dataset:
+
+        # Ensure input_path is a list
+        if isinstance(input_path, str):
+            input_path = [input_path]
+
+        paths_ds = ray.data.from_items(input_path)
+
+        def process_pdf(row: Dict[str, Any]) -> List[Dict[str, Any]]:
+            try:
+                pdf_path = row["item"]
+                kwargs = {**self._default_kwargs, **override}
+                return self._call_mineru(Path(pdf_path), kwargs)
+            except Exception as e:
+                logger.error("Failed to process %s: %s", row, e)
+                return []

+        docs_ds = paths_ds.flat_map(process_pdf)
+        docs_ds = docs_ds.filter(self._should_keep_item)
+
+        return docs_ds

     def _call_mineru(
         self, pdf_path: Path, kwargs: Dict[str, Any]
@@ -161,18 +176,18 @@ class MinerUParser:

         base = os.path.dirname(json_file)
         results = []
-        for item in data:
+        for it in data:
             for key in ("img_path", "table_img_path", "equation_img_path"):
-                rel_path = item.get(key)
+                rel_path = it.get(key)
                 if rel_path:
-                    item[key] = str(Path(base).joinpath(rel_path).resolve())
-            if item["type"] == "text":
-                item["content"] = item["text"]
-                del item["text"]
+                    it[key] = str(Path(base).joinpath(rel_path).resolve())
+            if it["type"] == "text":
+                it["content"] = it["text"]
+                del it["text"]
             for key in ("page_idx", "bbox", "text_level"):
-                if item.get(key) is not None:
-                    del item[key]
-            results.append(item)
+                if it.get(key) is not None:
+                    del it[key]
+            results.append(it)
         return results

     @staticmethod
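
A hedged usage sketch; the PDF paths are illustrative, the default PDFReader construction is assumed, and MinerU plus its model weights must be installed for the parse step to succeed:

from graphgen.models.reader import PDFReader

reader = PDFReader()
ds = reader.read(["paper1.pdf", "paper2.pdf"])
# Each PDF becomes typed blocks ("text", "image", ...); text blocks carry "content".
print(ds.take(3))
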
graphgen/models/reader/pickle_reader.py
CHANGED

@@ -1,30 +1,78 @@
 import pickle
-from typing import Any, Dict, List
+from typing import List, Union
+
+import pandas as pd
+import ray
+from ray.data import Dataset

 from graphgen.bases.base_reader import BaseReader
+from graphgen.utils import logger


 class PickleReader(BaseReader):
     """
-    Read pickle files, requiring the ...
-
-    Columns:
+    Read pickle files, requiring the schema to be restored to List[Dict[str, Any]].
+    Each pickle file should contain a list of dictionaries with at least:
     - type: The type of the document (e.g., "text", "image", etc.)
     - if type is "text", "content" column must be present.
+
+    Note: Uses ray.data.read_binary_files as ray.data.read_pickle is not available.
+    For Ray >= 2.5, consider using read_pickle if available in your version.
     """

-    def read(self, file_path: str) -> List[Dict[str, Any]]:
-        ...
-        raise ValueError("Every item in the list must be a dict.")
-        assert "type" in doc, f"Missing 'type' in document: {doc}"
-        if doc.get("type") == "text" and self.text_column not in doc:
-            raise ValueError(f"Missing '{self.text_column}' in document: {doc}")
-        ...
-        return self.filter(docs)
+    def read(
+        self,
+        input_path: Union[str, List[str]],
+    ) -> Dataset:
+        """
+        Read Pickle files using Ray Data.
+
+        :param input_path: Path to pickle file or list of pickle files.
+        :return: Ray Dataset containing validated documents.
+        """
+        if not ray.is_initialized():
+            ray.init()
+
+        # Use read_binary_files as a reliable alternative to read_pickle
+        ds = ray.data.read_binary_files(input_path, include_paths=True)
+
+        # Deserialize pickle files and flatten into individual records
+        def deserialize_batch(batch: pd.DataFrame) -> pd.DataFrame:
+            all_records = []
+            for _, row in batch.iterrows():
+                try:
+                    # Load pickle data from bytes
+                    data = pickle.loads(row["bytes"])
+
+                    # Validate structure
+                    if not isinstance(data, list):
+                        logger.error(
+                            "Pickle file %s must contain a list, got %s",
+                            row["path"],
+                            type(data),
+                        )
+                        continue
+
+                    if not all(isinstance(item, dict) for item in data):
+                        logger.error(
+                            "Pickle file %s must contain a list of dictionaries",
+                            row["path"],
+                        )
+                        continue
+
+                    # Flatten: each dict in the list becomes a separate row
+                    all_records.extend(data)
+                except Exception as e:
+                    logger.error(
+                        "Failed to deserialize pickle file %s: %s", row["path"], str(e)
+                    )
+                    continue

+            return pd.DataFrame(all_records)

+        # Apply deserialization and flattening
+        ds = ds.map_batches(deserialize_batch, batch_format="pandas")

+        # Validate the schema
+        ds = ds.map_batches(self._validate_batch, batch_format="pandas")

+        # Filter valid items
+        ds = ds.filter(self._should_keep_item)
+        return ds
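
A round-trip sketch showing the pickle layout this reader expects; the file name is illustrative and the default reader construction is assumed:

import pickle
from graphgen.models.reader import PickleReader

docs = [
    {"type": "text", "content": "alpha"},
    {"type": "text", "content": "beta"},
]
with open("docs.pkl", "wb") as f:
    pickle.dump(docs, f)  # must be a list of dicts, each with a "type" key

ds = PickleReader().read("docs.pkl")
print(ds.take_all())
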
graphgen/models/reader/rdf_reader.py
CHANGED

@@ -1,48 +1,128 @@
-from typing import Any, Dict, List
+from pathlib import Path
+from typing import Any, Dict, List, Union

+import ray
 import rdflib
+from ray.data import Dataset
 from rdflib import Literal
 from rdflib.util import guess_format

 from graphgen.bases.base_reader import BaseReader
+from graphgen.utils import logger


 class RDFReader(BaseReader):
     """
     Reader for RDF files that extracts triples and represents them as dictionaries.
+
+    Uses Ray Data for distributed processing of multiple RDF files.
     """

-    def read(self, file_path: str) -> List[Dict[str, Any]]:
-        g = rdflib.Graph()
-        fmt = guess_format(file_path)
-        try:
-            g.parse(file_path, format=fmt)
-        except Exception as e:
-            raise ValueError(f"Cannot parse RDF file {file_path}: {e}") from e
-
-        docs: List[Dict[str, Any]] = []
-        text_col = self.text_column
-
-        for subj in set(g.subjects()):
-            literals = []
-            props = {}
-            for _, pred, obj in g.triples((subj, None, None)):
-                pred_str = str(pred)
-                if isinstance(obj, Literal):
-                    literals.append(str(obj))
-                ...
-
-            text = " ".join(literals).strip()
-            if not text:
-                ...
-
-            ...
-            docs.append(doc)
-
-        if not docs:
-            ...
-
-        return self.filter(docs)
+    def __init__(self, *, text_column: str = "content", **kwargs):
+        """
+        Initialize RDFReader.
+
+        :param text_column: The column name for text content (default: "content").
+        """
+        super().__init__(**kwargs)
+        self.text_column = text_column
+
+    def read(
+        self,
+        input_path: Union[str, List[str]],
+    ) -> Dataset:
+        """
+        Read RDF file(s) using Ray Data.
+
+        :param input_path: Path to RDF file or list of RDF files.
+        :return: Ray Dataset containing extracted documents.
+        """
+        if not ray.is_initialized():
+            ray.init()
+
+        # Ensure input_path is a list to prevent Ray from splitting string into characters
+        if isinstance(input_path, str):
+            input_path = [input_path]
+
+        # Create dataset from file paths
+        paths_ds = ray.data.from_items(input_path)
+
+        def process_rdf(row: Dict[str, Any]) -> List[Dict[str, Any]]:
+            """Process a single RDF file and return list of documents."""
+            try:
+                file_path = row["item"]
+                return self._parse_rdf_file(Path(file_path))
+            except Exception as e:
+                logger.error(
+                    "Failed to process RDF file %s: %s", row.get("item", "unknown"), e
+                )
+                return []
+
+        # Process files in parallel and flatten results
+        docs_ds = paths_ds.flat_map(process_rdf)
+
+        # Filter valid documents
+        docs_ds = docs_ds.filter(self._should_keep_item)
+
+        return docs_ds
+
+    def _parse_rdf_file(self, file_path: Path) -> List[Dict[str, Any]]:
+        """
+        Parse a single RDF file and extract documents.
+
+        :param file_path: Path to RDF file.
+        :return: List of document dictionaries.
+        """
+        if not file_path.is_file():
+            raise FileNotFoundError(f"RDF file not found: {file_path}")
+
+        g = rdflib.Graph()
+        fmt = guess_format(str(file_path))
+
+        try:
+            g.parse(str(file_path), format=fmt)
+        except Exception as e:
+            raise ValueError(f"Cannot parse RDF file {file_path}: {e}") from e
+
+        docs: List[Dict[str, Any]] = []
+
+        # Process each unique subject in the RDF graph
+        for subj in set(g.subjects()):
+            literals = []
+            props = {}
+
+            # Extract all triples for this subject
+            for _, pred, obj in g.triples((subj, None, None)):
+                pred_str = str(pred)
+                obj_str = str(obj)
+
+                # Collect literal values as text content
+                if isinstance(obj, Literal):
+                    literals.append(obj_str)
+
+                # Store all properties (including non-literals)
+                props.setdefault(pred_str, []).append(obj_str)
+
+            # Join all literal values as the text content
+            text = " ".join(literals).strip()
+            if not text:
+                logger.warning(
+                    "Subject %s in %s has no literal values; document will have empty '%s' field.",
+                    subj,
+                    file_path,
+                    self.text_column,
+                )
+
+            # Create document dictionary
+            doc = {
+                "id": str(subj),
+                self.text_column: text,
+                "properties": props,
+                "source_file": str(file_path),
+            }
+            docs.append(doc)
+
+        if not docs:
+            logger.warning("RDF file %s contains no valid documents.", file_path)
+
+        return docs
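
A small Turtle file makes the subject-to-document mapping concrete; the prefix and triples are illustrative:

from graphgen.models.reader.rdf_reader import RDFReader

turtle = """
@prefix ex: <http://example.org/> .
ex:alice ex:name "Alice" ;
         ex:knows ex:bob .
"""
with open("sample.ttl", "w", encoding="utf-8") as f:
    f.write(turtle)

ds = RDFReader().read("sample.ttl")
# One document per subject: here only ex:alice, whose literal objects
# ("Alice") become its content; ex:bob appears only as an object.
print(ds.take_all())
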
graphgen/models/reader/txt_reader.py
CHANGED

@@ -1,10 +1,32 @@
-from typing import Any, Dict, List
+from typing import List, Union
+
+import ray
+from ray.data import Dataset

 from graphgen.bases.base_reader import BaseReader


 class TXTReader(BaseReader):
-    def read(self, file_path: str) -> List[Dict[str, Any]]:
-        ...
+    def read(
+        self,
+        input_path: Union[str, List[str]],
+    ) -> Dataset:
+        """
+        Read text files from the specified input path.
+        :param input_path: Path to the input text file or list of text files.
+        :return: Ray Dataset containing the read text data.
+        """
+        docs_ds = ray.data.read_binary_files(
+            input_path,
+            include_paths=False,
+        )
+
+        docs_ds = docs_ds.map(
+            lambda row: {
+                "type": "text",
+                self.text_column: row["bytes"].decode("utf-8"),
+            }
+        )
+
+        docs_ds = docs_ds.filter(self._should_keep_item)
+        return docs_ds
graphgen/models/splitter/character_splitter.py
CHANGED

@@ -17,7 +17,7 @@ class CharacterSplitter(BaseSplitter):

     def split_text(self, text: str) -> List[str]:
         """Split incoming text and return chunks."""
-        # First we naively split the large input into a bunch of smaller ones.
+        # First we naively chunk the large input into a bunch of smaller ones.
         separator = (
             self._separator if self._is_separator_regex else re.escape(self._separator)
         )
graphgen/models/splitter/markdown_splitter.py
CHANGED

@@ -6,12 +6,12 @@ from graphgen.models.splitter.recursive_character_splitter import (


 class MarkdownTextRefSplitter(RecursiveCharacterSplitter):
-    """Attempts to split the text along Markdown-formatted headings."""
+    """Attempts to chunk the text along Markdown-formatted headings."""

     def __init__(self, **kwargs: Any) -> None:
         """Initialize a MarkdownTextRefSplitter."""
         separators = [
-            # First, try to split along Markdown headings (starting with level 2)
+            # First, try to chunk along Markdown headings (starting with level 2)
             "\n#{1,6} ",
             # Note the alternative syntax for headings (below) is not handled here
             # Heading level 2
graphgen/models/splitter/recursive_character_splitter.py
CHANGED

@@ -7,7 +7,7 @@ from graphgen.bases.base_splitter import BaseSplitter
 class RecursiveCharacterSplitter(BaseSplitter):
     """Splitting text by recursively look at characters.

-    Recursively tries to split by different characters to find one that works.
+    Recursively tries to chunk by different characters to find one that works.
     """

     def __init__(
@@ -88,7 +88,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterSplitter):
     def _split_text_with_regex_from_end(
         self, text: str, separator: str, keep_separator: bool
     ) -> List[str]:
-        # Now that we have the separator, split the text
+        # Now that we have the separator, chunk the text
         if separator:
             if keep_separator:
                 # The parentheses in the pattern keep the delimiters in the result.
graphgen/models/storage/__init__.py
CHANGED

@@ -1,3 +1,6 @@
-from .json_storage import JsonKVStorage
-from .networkx_storage import NetworkXStorage
+from graphgen.models.storage.graph.kuzu_storage import KuzuStorage
+from graphgen.models.storage.graph.networkx_storage import NetworkXStorage
+from graphgen.models.storage.kv.json_storage import JsonKVStorage
+from graphgen.models.storage.kv.rocksdb_storage import RocksDBKVStorage
+
 from .rocksdb_cache import RocksDBCache
graphgen/{configs → models/storage/graph}/__init__.py
RENAMED
File without changes
graphgen/models/storage/graph/kuzu_storage.py
ADDED

@@ -0,0 +1,256 @@
+import json
+import os
+import shutil
+from dataclasses import dataclass
+from typing import Any
+
+try:
+    import kuzu
+except ImportError:
+    kuzu = None
+
+from graphgen.bases.base_storage import BaseGraphStorage
+
+
+@dataclass
+class KuzuStorage(BaseGraphStorage):
+    """
+    Graph storage implementation based on KuzuDB.
+    Since KuzuDB is a structured graph database and GraphGen uses dynamic dictionaries for properties,
+    we map the data to a generic schema:
+    - Node Table 'Entity': {id: STRING, data: STRING (JSON)}
+    - Rel Table 'Relation': {FROM Entity TO Entity, data: STRING (JSON)}
+    """
+
+    working_dir: str = None
+    namespace: str = None
+    _db: Any = None
+    _conn: Any = None
+
+    def __post_init__(self):
+        if kuzu is None:
+            raise ImportError(
+                "KuzuDB is not installed. Please install it via `pip install kuzu`."
+            )
+
+        self.db_path = os.path.join(self.working_dir, f"{self.namespace}_kuzu")
+        self._init_db()
+
+    def _init_db(self):
+        # KuzuDB automatically creates the directory
+        self._db = kuzu.Database(self.db_path)
+        self._conn = kuzu.Connection(self._db)
+        self._init_schema()
+        print(f"KuzuDB initialized at {self.db_path}")
+
+    def _init_schema(self):
+        """Initialize the generic Node and Edge tables if they don't exist."""
+        # Check and create Node table
+        try:
+            # We use a generic table name "Entity" to store all nodes
+            self._conn.execute(
+                "CREATE NODE TABLE Entity(id STRING, data STRING, PRIMARY KEY(id))"
+            )
+            print("Created KuzuDB Node Table 'Entity'")
+        except RuntimeError as e:
+            # Usually throws if table exists, verify safely or ignore
+            print("Node Table 'Entity' already exists or error:", e)
+
+        # Check and create Edge table
+        try:
+            # We use a generic table name "Relation" to store all edges
+            self._conn.execute(
+                "CREATE REL TABLE Relation(FROM Entity TO Entity, data STRING)"
+            )
+            print("Created KuzuDB Rel Table 'Relation'")
+        except RuntimeError as e:
+            print("Rel Table 'Relation' already exists or error:", e)
+
+    def index_done_callback(self):
+        """KuzuDB is ACID, changes are immediate, but we can verify generic persistence here."""
+
+    def has_node(self, node_id: str) -> bool:
+        result = self._conn.execute(
+            "MATCH (a:Entity {id: $id}) RETURN count(a)", {"id": node_id}
+        )
+        count = result.get_next()[0]
+        return count > 0
+
+    def has_edge(self, source_node_id: str, target_node_id: str):
+        result = self._conn.execute(
+            "MATCH (a:Entity {id: $src})-[e:Relation]->(b:Entity {id: $dst}) RETURN count(e)",
+            {"src": source_node_id, "dst": target_node_id},
+        )
+        count = result.get_next()[0]
+        return count > 0
+
+    def node_degree(self, node_id: str) -> int:
+        # Calculate total degree (incoming + outgoing)
+        query = """
+            MATCH (a:Entity {id: $id})-[e:Relation]-(b:Entity)
+            RETURN count(e)
+        """
+        result = self._conn.execute(query, {"id": node_id})
+        if result.has_next():
+            return result.get_next()[0]
+        return 0
+
+    def edge_degree(self, src_id: str, tgt_id: str) -> int:
+        # In this context, usually checks existence or multiplicity.
+        # Kuzu supports multi-edges, so we count them.
+        query = """
+            MATCH (a:Entity {id: $src})-[e:Relation]->(b:Entity {id: $dst})
+            RETURN count(e)
+        """
+        result = self._conn.execute(query, {"src": src_id, "dst": tgt_id})
+        if result.has_next():
+            return result.get_next()[0]
+        return 0
+
+    def get_node(self, node_id: str) -> Any:
+        result = self._conn.execute(
+            "MATCH (a:Entity {id: $id}) RETURN a.data", {"id": node_id}
+        )
+        if result.has_next():
+            data_str = result.get_next()[0]
+            return json.loads(data_str) if data_str else {}
+        return None
+
+    def update_node(self, node_id: str, node_data: dict[str, str]):
+        current_data = self.get_node(node_id)
+        if current_data is None:
+            print(f"Node {node_id} not found for update.")
+            return
+
+        # Merge existing data with new data
+        current_data.update(node_data)
+        json_data = json.dumps(current_data, ensure_ascii=False)
+
+        self._conn.execute(
+            "MATCH (a:Entity {id: $id}) SET a.data = $data",
+            {"id": node_id, "data": json_data},
+        )
+
+    def get_all_nodes(self) -> Any:
+        """Returns List[Tuple[id, data_dict]]"""
+        result = self._conn.execute("MATCH (a:Entity) RETURN a.id, a.data")
+        nodes = []
+        while result.has_next():
+            row = result.get_next()
+            nodes.append((row[0], json.loads(row[1])))
+        return nodes
+
+    def get_edge(self, source_node_id: str, target_node_id: str):
+        # Warning: If multiple edges exist, this returns the first one found
+        query = """
+            MATCH (a:Entity {id: $src})-[e:Relation]->(b:Entity {id: $dst})
+            RETURN e.data
+        """
+        result = self._conn.execute(
+            query, {"src": source_node_id, "dst": target_node_id}
+        )
+        if result.has_next():
+            data_str = result.get_next()[0]
+            return json.loads(data_str) if data_str else {}
+        return None
+
+    def update_edge(
+        self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]
+    ):
+        current_data = self.get_edge(source_node_id, target_node_id)
+        if current_data is None:
+            print(f"Edge {source_node_id}->{target_node_id} not found for update.")
+            return
+
+        current_data.update(edge_data)
+        json_data = json.dumps(current_data, ensure_ascii=False)
+
+        query = """
+            MATCH (a:Entity {id: $src})-[e:Relation]->(b:Entity {id: $dst})
+            SET e.data = $data
+        """
+        self._conn.execute(
+            query, {"src": source_node_id, "dst": target_node_id, "data": json_data}
+        )
+
+    def get_all_edges(self) -> Any:
+        """Returns List[Tuple[src, dst, data_dict]]"""
+        query = "MATCH (a:Entity)-[e:Relation]->(b:Entity) RETURN a.id, b.id, e.data"
+        result = self._conn.execute(query)
+        edges = []
+        while result.has_next():
+            row = result.get_next()
+            edges.append((row[0], row[1], json.loads(row[2])))
+        return edges
+
+    def get_node_edges(self, source_node_id: str) -> Any:
+        """Returns generic edges connected to this node (outgoing)"""
+        query = """
+            MATCH (a:Entity {id: $src})-[e:Relation]->(b:Entity)
+            RETURN a.id, b.id, e.data
+        """
+        result = self._conn.execute(query, {"src": source_node_id})
+        edges = []
+        while result.has_next():
+            row = result.get_next()
+            edges.append((row[0], row[1], json.loads(row[2])))
+        return edges
+
+    def upsert_node(self, node_id: str, node_data: dict[str, str]):
+        """
+        Insert or Update node.
+        Kuzu supports MERGE clause (similar to Neo4j) to handle upserts.
+        """
+        json_data = json.dumps(node_data, ensure_ascii=False)
+        query = """
+            MERGE (a:Entity {id: $id})
+            ON MATCH SET a.data = $data
+            ON CREATE SET a.data = $data
+        """
+        self._conn.execute(query, {"id": node_id, "data": json_data})
+
+    def upsert_edge(
+        self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]
+    ):
+        """
+        Insert or Update edge.
+        Note: We explicitly ensure nodes exist before merging the edge to avoid errors,
+        although GraphGen generally creates nodes before edges.
+        """
+        # Ensure source node exists
+        if not self.has_node(source_node_id):
+            self.upsert_node(source_node_id, {})
+        # Ensure target node exists
+        if not self.has_node(target_node_id):
+            self.upsert_node(target_node_id, {})
+
+        json_data = json.dumps(edge_data, ensure_ascii=False)
+        query = """
+            MATCH (a:Entity {id: $src}), (b:Entity {id: $dst})
+            MERGE (a)-[e:Relation]->(b)
+            ON MATCH SET e.data = $data
+            ON CREATE SET e.data = $data
+        """
+        self._conn.execute(
+            query, {"src": source_node_id, "dst": target_node_id, "data": json_data}
+        )
+
+    def delete_node(self, node_id: str):
+        # DETACH DELETE removes the node and all connected edges
+        query = "MATCH (a:Entity {id: $id}) DETACH DELETE a"
+        self._conn.execute(query, {"id": node_id})
+        print(f"Node {node_id} deleted from KuzuDB.")
+
+    def clear(self):
+        """Clear all data but keep schema (or drop tables)."""
+        self._conn.execute("MATCH (n) DETACH DELETE n")
+        print(f"Graph {self.namespace} cleared.")
+
+    def reload(self):
+        """For databases that need reloading, KuzuDB auto-manages this."""
+
+    def drop(self):
+        """Completely remove the database folder."""
+        if self.db_path and os.path.exists(self.db_path):
+            shutil.rmtree(self.db_path)
+        print(f"Dropped KuzuDB at {self.db_path}")
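
A hedged usage sketch for the new backend; the working_dir and namespace values are illustrative, and `pip install kuzu` is required:

from graphgen.models.storage import KuzuStorage

store = KuzuStorage(working_dir="cache", namespace="demo")
store.upsert_node("graphgen", {"entity_type": "project"})
store.upsert_node("kuzu", {"entity_type": "database"})
store.upsert_edge("graphgen", "kuzu", {"relation": "uses"})

print(store.get_node("graphgen"))        # {'entity_type': 'project'}
print(store.get_node_edges("graphgen"))  # [('graphgen', 'kuzu', {'relation': 'uses'})]

store.drop()  # removes the on-disk database folder
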
graphgen/models/storage/{networkx_storage.py → graph/networkx_storage.py}
RENAMED

@@ -6,7 +6,6 @@ from typing import Any, Optional, Union, cast
 import networkx as nx

 from graphgen.bases.base_storage import BaseGraphStorage
-from graphgen.utils import logger


 @dataclass
@@ -19,11 +18,6 @@ class NetworkXStorage(BaseGraphStorage):

     @staticmethod
     def write_nx_graph(graph: nx.Graph, file_name):
-        logger.info(
-            "Writing graph with %d nodes, %d edges",
-            graph.number_of_nodes(),
-            graph.number_of_edges(),
-        )
         nx.write_graphml(graph, file_name)

     @staticmethod
@@ -82,12 +76,11 @@ class NetworkXStorage(BaseGraphStorage):
             self.working_dir, f"{self.namespace}.graphml"
         )
         preloaded_graph = NetworkXStorage.load_nx_graph(self._graphml_xml_file)
-        if preloaded_graph is not None:
-            logger.info(
-                "Loaded graph from %s with %d nodes, %d edges",
-                self._graphml_xml_file,
-                preloaded_graph.number_of_nodes(),
-                preloaded_graph.number_of_edges(),
+        if preloaded_graph:
+            print(
+                f"Loaded graph from {self._graphml_xml_file} with "
+                f"{preloaded_graph.number_of_nodes()} nodes, "
+                f"{preloaded_graph.number_of_edges()} edges"
             )
         self._graph = preloaded_graph or nx.Graph()

@@ -133,7 +126,7 @@ class NetworkXStorage(BaseGraphStorage):
         if self._graph.has_node(node_id):
             self._graph.nodes[node_id].update(node_data)
         else:
-            logger.warning("Node %s not found in the graph for update.", node_id)
+            print(f"Node {node_id} not found in the graph for update.")

     def upsert_edge(
         self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]
@@ -146,10 +139,8 @@ class NetworkXStorage(BaseGraphStorage):
         if self._graph.has_edge(source_node_id, target_node_id):
             self._graph.edges[(source_node_id, target_node_id)].update(edge_data)
         else:
-            logger.warning(
-                "Edge %s -> %s not found in the graph for update.",
-                source_node_id,
-                target_node_id,
+            print(
+                f"Edge {source_node_id} -> {target_node_id} not found in the graph for update."
             )

     def delete_node(self, node_id: str):
@@ -160,13 +151,19 @@ class NetworkXStorage(BaseGraphStorage):
         """
         if self._graph.has_node(node_id):
             self._graph.remove_node(node_id)
+            print(f"Node {node_id} deleted from the graph.")
         else:
-            logger.warning("Node %s not found in the graph for deletion.", node_id)
+            print(f"Node {node_id} not found in the graph for deletion.")

     def clear(self):
         """
         Clear the graph by removing all nodes and edges.
         """
         self._graph.clear()
-        logger.info("Graph %s cleared.", self.namespace)
+        print(f"Graph {self.namespace} cleared.")
+
+    def reload(self):
+        """
+        Reload the graph from the GraphML file.
+        """
+        self.__post_init__()
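
The new reload() simply re-runs __post_init__, re-reading <namespace>.graphml from disk, so unsaved in-memory changes are discarded. A short sketch with illustrative paths:

from graphgen.models.storage import NetworkXStorage

store = NetworkXStorage(working_dir="cache", namespace="demo")
store.upsert_node("a", {"entity_type": "example"})
store.reload()  # back to whatever demo.graphml last persisted
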
graphgen/models/storage/kv/__init__.py
ADDED
File without changes