github-actions[bot] committed
Commit 799ac7c · 1 Parent(s): 37f0321

Auto-sync from demo at Wed Oct 15 06:28:02 UTC 2025

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. app.py +1 -1
  2. graphgen/bases/__init__.py +2 -0
  3. graphgen/bases/base_generator.py +84 -0
  4. graphgen/bases/base_partitioner.py +76 -0
  5. graphgen/bases/base_storage.py +2 -2
  6. graphgen/bases/datatypes.py +8 -0
  7. graphgen/configs/aggregated_config.yaml +4 -8
  8. graphgen/configs/atomic_config.yaml +2 -9
  9. graphgen/configs/cot_config.yaml +3 -3
  10. graphgen/configs/multi_hop_config.yaml +4 -8
  11. graphgen/graphgen.py +14 -52
  12. graphgen/models/__init__.py +15 -8
  13. graphgen/models/community/__init__.py +0 -0
  14. graphgen/models/community/community_detector.py +0 -95
  15. graphgen/models/evaluate/__init__.py +0 -0
  16. graphgen/models/evaluator/__init__.py +4 -0
  17. graphgen/models/{evaluate → evaluator}/base_evaluator.py +0 -0
  18. graphgen/models/{evaluate → evaluator}/length_evaluator.py +1 -1
  19. graphgen/models/{evaluate → evaluator}/mtld_evaluator.py +1 -1
  20. graphgen/models/{evaluate → evaluator}/reward_evaluator.py +0 -0
  21. graphgen/models/{evaluate → evaluator}/uni_evaluator.py +0 -0
  22. graphgen/models/generator/__init__.py +4 -0
  23. graphgen/models/generator/aggregated_generator.py +127 -0
  24. graphgen/models/generator/atomic_generator.py +52 -0
  25. graphgen/models/generator/cot_generator.py +122 -0
  26. graphgen/models/generator/multi_hop_generator.py +55 -0
  27. graphgen/models/kg_builder/__init__.py +1 -0
  28. graphgen/models/llm/limitter.py +27 -29
  29. graphgen/models/llm/openai_client.py +4 -2
  30. graphgen/models/partitioner/__init__.py +4 -0
  31. graphgen/models/partitioner/bfs_partitioner.py +83 -0
  32. graphgen/models/partitioner/dfs_partitioner.py +80 -0
  33. graphgen/models/partitioner/ece_partitioner.py +163 -0
  34. graphgen/models/partitioner/leiden_partitioner.py +120 -0
  35. graphgen/models/storage/__init__.py +2 -0
  36. graphgen/models/storage/networkx_storage.py +4 -4
  37. graphgen/operators/__init__.py +4 -9
  38. graphgen/operators/build_kg/__init__.py +1 -0
  39. graphgen/operators/build_kg/split_kg.py +0 -382
  40. graphgen/operators/generate/__init__.py +1 -0
  41. graphgen/operators/generate/generate_cot.py +0 -117
  42. graphgen/operators/generate/generate_qas.py +58 -0
  43. graphgen/operators/partition/__init__.py +1 -0
  44. graphgen/operators/partition/partition_kg.py +48 -0
  45. graphgen/operators/partition/pre_tokenize.py +47 -0
  46. graphgen/operators/search/__init__.py +1 -0
  47. graphgen/operators/traverse_graph.py +0 -540
  48. graphgen/templates/__init__.py +6 -3
  49. graphgen/templates/community/__init__.py +0 -2
  50. graphgen/templates/community/cot_generation.py +0 -87
app.py CHANGED
@@ -468,7 +468,7 @@ with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo:
                     label="TPM",
                     minimum=5000,
                     maximum=5000000,
-                    value=50000,
+                    value=100000,
                     step=1000,
                     interactive=True,
                     visible=True,
graphgen/bases/__init__.py CHANGED
@@ -1,5 +1,7 @@
+from .base_generator import BaseGenerator
 from .base_kg_builder import BaseKGBuilder
 from .base_llm_client import BaseLLMClient
+from .base_partitioner import BasePartitioner
 from .base_reader import BaseReader
 from .base_splitter import BaseSplitter
 from .base_storage import (
graphgen/bases/base_generator.py ADDED
@@ -0,0 +1,84 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Any
+
+from graphgen.bases.base_llm_client import BaseLLMClient
+
+
+@dataclass
+class BaseGenerator(ABC):
+    """
+    Generate QAs based on given prompts.
+    """
+
+    llm_client: BaseLLMClient
+
+    @staticmethod
+    @abstractmethod
+    def build_prompt(
+        batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]]
+    ) -> str:
+        """Build prompt for LLM based on the given batch"""
+
+    @staticmethod
+    @abstractmethod
+    def parse_response(response: str) -> Any:
+        """Parse the LLM response and return the generated QAs"""
+
+    async def generate(
+        self,
+        batch: tuple[
+            list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]]
+        ],
+    ) -> dict[str, Any]:
+        """
+        Generate QAs based on a given batch.
+        :param batch
+        :return: QA pairs
+        """
+        result = {}
+        prompt = self.build_prompt(batch)
+        response = await self.llm_client.generate_answer(prompt)
+        qa_pairs = self.parse_response(response)  # generate one or more QA pairs
+        result.update(qa_pairs)
+        return result
+
+    @staticmethod
+    def format_generation_results(
+        results: list[dict], output_data_format: str
+    ) -> list[dict[str, Any]]:
+        if output_data_format == "Alpaca":
+            results = [
+                {
+                    "instruction": v["question"],
+                    "input": "",
+                    "output": v["answer"],
+                }
+                for item in results
+                for k, v in item.items()
+            ]
+        elif output_data_format == "Sharegpt":
+            results = [
+                {
+                    "conversations": [
+                        {"from": "human", "value": v["question"]},
+                        {"from": "gpt", "value": v["answer"]},
+                    ]
+                }
+                for item in results
+                for k, v in item.items()
+            ]
+        elif output_data_format == "ChatML":
+            results = [
+                {
+                    "messages": [
+                        {"role": "user", "content": v["question"]},
+                        {"role": "assistant", "content": v["answer"]},
+                    ]
+                }
+                for item in results
+                for k, v in item.items()
+            ]
+        else:
+            raise ValueError(f"Unknown output data format: {output_data_format}")
+        return results
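For reference, a minimal sketch of how format_generation_results maps the internal QA dict into the supported output formats; the hash key and QA text below are illustrative:

import asyncio  # not needed here; format_generation_results is a plain staticmethod

from graphgen.bases import BaseGenerator

# Illustrative input: one generated QA pair keyed by its content hash.
qa_results = [
    {"a1b2c3": {"question": "What does GraphGen build?", "answer": "A knowledge graph."}}
]

alpaca = BaseGenerator.format_generation_results(qa_results, output_data_format="Alpaca")
# [{"instruction": "What does GraphGen build?", "input": "", "output": "A knowledge graph."}]

chatml = BaseGenerator.format_generation_results(qa_results, output_data_format="ChatML")
# [{"messages": [{"role": "user", ...}, {"role": "assistant", ...}]}]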
graphgen/bases/base_partitioner.py ADDED
@@ -0,0 +1,76 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Any, List
+
+from graphgen.bases.base_storage import BaseGraphStorage
+from graphgen.bases.datatypes import Community
+
+
+@dataclass
+class BasePartitioner(ABC):
+    @abstractmethod
+    async def partition(
+        self,
+        g: BaseGraphStorage,
+        **kwargs: Any,
+    ) -> List[Community]:
+        """
+        Graph -> Communities
+        :param g: Graph storage instance
+        :param kwargs: Additional parameters for partitioning
+        :return: List of communities
+        """
+
+    @staticmethod
+    async def community2batch(
+        communities: List[Community], g: BaseGraphStorage
+    ) -> list[
+        tuple[
+            list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]]
+        ]
+    ]:
+        """
+        Convert communities to batches of nodes and edges.
+        :param communities
+        :param g: Graph storage instance
+        :return: List of batches, each batch is a tuple of (nodes, edges)
+        """
+        batches = []
+        for comm in communities:
+            nodes = comm.nodes
+            edges = comm.edges
+            nodes_data = []
+            for node in nodes:
+                node_data = await g.get_node(node)
+                if node_data:
+                    nodes_data.append((node, node_data))
+            edges_data = []
+            for u, v in edges:
+                edge_data = await g.get_edge(u, v)
+                if edge_data:
+                    edges_data.append((u, v, edge_data))
+                else:
+                    edge_data = await g.get_edge(v, u)
+                    if edge_data:
+                        edges_data.append((v, u, edge_data))
+            batches.append((nodes_data, edges_data))
+        return batches
+
+    @staticmethod
+    def _build_adjacency_list(
+        nodes: List[tuple[str, dict]], edges: List[tuple[str, str, dict]]
+    ) -> tuple[dict[str, List[str]], set[tuple[str, str]]]:
+        """
+        Build adjacency list and edge set from nodes and edges.
+        :param nodes
+        :param edges
+        :return: adjacency list, edge set
+        """
+        adj: dict[str, List[str]] = {n[0]: [] for n in nodes}
+        edge_set: set[tuple[str, str]] = set()
+        for e in edges:
+            adj[e[0]].append(e[1])
+            adj[e[1]].append(e[0])
+            edge_set.add((e[0], e[1]))
+            edge_set.add((e[1], e[0]))
+        return adj, edge_set
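For reference, a minimal sketch of the partition -> community2batch contract, using a toy in-memory stand-in for BaseGraphStorage and the BFSPartitioner added later in this commit; the graph data and helper class are illustrative, not part of the repository:

import asyncio

from graphgen.models import BFSPartitioner


class InMemoryGraph:
    """Toy stand-in for BaseGraphStorage with only the methods the partitioner needs."""

    def __init__(self, nodes, edges):
        self._nodes = dict(nodes)
        self._edges = {(u, v): d for u, v, d in edges}

    async def get_all_nodes(self):
        return list(self._nodes.items())

    async def get_all_edges(self):
        return [(u, v, d) for (u, v), d in self._edges.items()]

    async def get_node(self, node_id):
        return self._nodes.get(node_id)

    async def get_edge(self, u, v):
        return self._edges.get((u, v))


async def demo():
    g = InMemoryGraph(
        nodes=[("A", {"description": "entity A"}), ("B", {"description": "entity B"})],
        edges=[("A", "B", {"description": "A relates to B"})],
    )
    partitioner = BFSPartitioner()
    communities = await partitioner.partition(g, max_units_per_community=3)
    # Each batch is ([(node_id, node_data), ...], [(u, v, edge_data), ...]),
    # which is exactly the shape the generators' build_prompt expects.
    return await partitioner.community2batch(communities, g)


print(asyncio.run(demo()))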
graphgen/bases/base_storage.py CHANGED
@@ -78,7 +78,7 @@ class BaseGraphStorage(StorageNameSpace):
     async def update_node(self, node_id: str, node_data: dict[str, str]):
         raise NotImplementedError
 
-    async def get_all_nodes(self) -> Union[list[dict], None]:
+    async def get_all_nodes(self) -> Union[list[tuple[str, dict]], None]:
        raise NotImplementedError
 
     async def get_edge(
@@ -91,7 +91,7 @@ class BaseGraphStorage(StorageNameSpace):
     ):
         raise NotImplementedError
 
-    async def get_all_edges(self) -> Union[list[dict], None]:
+    async def get_all_edges(self) -> Union[list[tuple[str, str, dict]], None]:
         raise NotImplementedError
 
     async def get_node_edges(
graphgen/bases/datatypes.py CHANGED
@@ -30,3 +30,11 @@ class Token:
     @property
     def logprob(self) -> float:
         return math.log(self.prob)
+
+
+@dataclass
+class Community:
+    id: Union[int, str]
+    nodes: List[str] = field(default_factory=list)
+    edges: List[tuple] = field(default_factory=list)
+    metadata: dict = field(default_factory=dict)
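For reference, the partitioners added in this commit populate the new Community datatype roughly as follows (values illustrative):

from graphgen.bases.datatypes import Community

comm = Community(
    id=0,
    nodes=["GraphGen", "knowledge graph"],    # node names assigned to this community
    edges=[("GraphGen", "knowledge graph")],  # (u, v) pairs within the community
    metadata={"method": "bfs"},               # optional extra info
)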
graphgen/configs/aggregated_config.yaml CHANGED
@@ -13,14 +13,10 @@ quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
 partition: # graph partition configuration
   method: ece # ece is a custom partition method based on comprehension loss
   method_params:
-    bidirectional: true # whether to traverse the graph in both directions
-    edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
-    expand_method: max_width # expand method, support: max_width, max_depth
-    isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
-    max_depth: 5 # maximum depth for graph traversal
-    max_extra_edges: 20 # max edges per direction (if expand_method="max_width")
-    max_tokens: 256 # restricts input length (if expand_method="max_tokens")
-    loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
+    max_units_per_community: 20 # max nodes and edges per community
+    min_units_per_community: 5 # min nodes and edges per community
+    max_tokens_per_community: 10240 # max tokens per community
+    unit_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
 generate:
   mode: aggregated # atomic, aggregated, multi_hop, cot
   data_format: ChatML # Alpaca, Sharegpt, ChatML
graphgen/configs/atomic_config.yaml CHANGED
@@ -11,16 +11,9 @@ quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
   quiz_samples: 2 # number of quiz samples to generate
   re_judge: false # whether to re-judge the existing quiz samples
 partition: # graph partition configuration
-  method: ece # ece is a custom partition method based on comprehension loss
+  method: dfs # partition method, support: dfs, bfs, ece, leiden
   method_params:
-    bidirectional: true # whether to traverse the graph in both directions
-    edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
-    expand_method: max_width # expand method, support: max_width, max_depth
-    isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
-    max_depth: 3 # maximum depth for graph traversal
-    max_extra_edges: 5 # max edges per direction (if expand_method="max_width")
-    max_tokens: 256 # restricts input length (if expand_method="max_tokens")
-    loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
+    max_units_per_community: 1 # atomic partition, one node or edge per community
 generate:
   mode: atomic # atomic, aggregated, multi_hop, cot
   data_format: Alpaca # Alpaca, Sharegpt, ChatML
graphgen/configs/cot_config.yaml CHANGED
@@ -9,11 +9,11 @@ search: # web search configuration
 quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
   enabled: false
 partition: # graph partition configuration
-  method: leiden # leiden is a community detection algorithm
+  method: leiden # leiden is a community detection algorithm used for partitioning
   method_params:
     max_size: 20 # Maximum size of communities
-    use_lcc: false
-    random_seed: 42
+    use_lcc: false # whether to use the largest connected component
+    random_seed: 42 # random seed for partitioning
 generate:
   mode: cot # atomic, aggregated, multi_hop, cot
   data_format: Sharegpt # Alpaca, Sharegpt, ChatML
graphgen/configs/multi_hop_config.yaml CHANGED
@@ -13,14 +13,10 @@ quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
 partition: # graph partition configuration
   method: ece # ece is a custom partition method based on comprehension loss
   method_params:
-    bidirectional: true # whether to traverse the graph in both directions
-    edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
-    expand_method: max_width # expand method, support: max_width, max_depth
-    isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
-    max_depth: 1 # maximum depth for graph traversal
-    max_extra_edges: 2 # max edges per direction (if expand_method="max_width")
-    max_tokens: 256 # restricts input length (if expand_method="max_tokens")
-    loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
+    max_units_per_community: 3 # max nodes and edges per community, for multi-hop, we recommend setting it to 3
+    min_units_per_community: 3 # min nodes and edges per community, for multi-hop, we recommend setting it to 3
+    max_tokens_per_community: 10240 # max tokens per community
+    unit_sampling: random # edge sampling strategy, support: random, max_loss, min_loss
 generate:
   mode: multi_hop # strategy for generating multi-hop QA pairs
   data_format: ChatML # Alpaca, Sharegpt, ChatML
graphgen/graphgen.py CHANGED
@@ -18,21 +18,14 @@ from graphgen.models import (
 from graphgen.operators import (
     build_kg,
     chunk_documents,
-    generate_cot,
+    generate_qas,
     judge_statement,
+    partition_kg,
     quiz,
     read_files,
     search_all,
-    traverse_graph_for_aggregated,
-    traverse_graph_for_atomic,
-    traverse_graph_for_multi_hop,
-)
-from graphgen.utils import (
-    async_to_sync_method,
-    compute_content_hash,
-    format_generation_results,
-    logger,
 )
+from graphgen.utils import async_to_sync_method, compute_content_hash, logger
 
 sys_path = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
 
@@ -238,51 +231,20 @@ class GraphGen:
     @async_to_sync_method
     async def generate(self, partition_config: Dict, generate_config: Dict):
         # Step 1: partition the graph
-        # TODO: implement graph partitioning, e.g. Partitioner().partition(self.graph_storage)
-        mode = generate_config["mode"]
-        if mode == "atomic":
-            results = await traverse_graph_for_atomic(
-                self.synthesizer_llm_client,
-                self.tokenizer_instance,
-                self.graph_storage,
-                partition_config["method_params"],
-                self.text_chunks_storage,
-                self.progress_bar,
-            )
-        elif mode == "multi_hop":
-            results = await traverse_graph_for_multi_hop(
-                self.synthesizer_llm_client,
-                self.tokenizer_instance,
-                self.graph_storage,
-                partition_config["method_params"],
-                self.text_chunks_storage,
-                self.progress_bar,
-            )
-        elif mode == "aggregated":
-            results = await traverse_graph_for_aggregated(
-                self.synthesizer_llm_client,
-                self.tokenizer_instance,
-                self.graph_storage,
-                partition_config["method_params"],
-                self.text_chunks_storage,
-                self.progress_bar,
-            )
-        elif mode == "cot":
-            results = await generate_cot(
-                self.graph_storage,
-                self.synthesizer_llm_client,
-                method_params=partition_config["method_params"],
-            )
-        else:
-            raise ValueError(f"Unknown generation mode: {mode}")
-        # Step 2: generate QA pairs
-        # TODO
+        batches = await partition_kg(
+            self.graph_storage, self.tokenizer_instance, partition_config
+        )
 
-        # Step 3: format
-        results = format_generation_results(
-            results, output_data_format=generate_config["data_format"]
+        # Step 2: generate QA pairs
+        results = await generate_qas(
+            self.synthesizer_llm_client, batches, generate_config
         )
 
+        if not results:
+            logger.warning("No QA pairs generated")
+            return
+
+        # Step 3: store the generated QA pairs
         await self.qa_storage.upsert(results)
         await self.qa_storage.index_done_callback()
 
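With this change, generate() becomes a fixed three-step pipeline: partition_kg builds (nodes, edges) batches from the graph, generate_qas turns each batch into QA pairs, and the results are persisted. A minimal caller-side sketch; the config keys come from the YAML files above, while the GraphGen constructor kwargs are not shown in this diff and are therefore assumptions:

from graphgen.graphgen import GraphGen

gg = GraphGen(working_dir="cache")  # hypothetical construction; real kwargs come from GraphGen.__init__

partition_config = {
    "method": "dfs",                                   # dfs, bfs, ece, leiden
    "method_params": {"max_units_per_community": 1},   # atomic: one node or edge per community
}
generate_config = {
    "mode": "atomic",         # atomic, aggregated, multi_hop, cot
    "data_format": "Alpaca",  # Alpaca, Sharegpt, ChatML
}

# generate() is wrapped by @async_to_sync_method, so it can be called synchronously.
gg.generate(partition_config, generate_config)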
graphgen/models/__init__.py CHANGED
@@ -1,17 +1,24 @@
-from .community.community_detector import CommunityDetector
-from .evaluate.length_evaluator import LengthEvaluator
-from .evaluate.mtld_evaluator import MTLDEvaluator
-from .evaluate.reward_evaluator import RewardEvaluator
-from .evaluate.uni_evaluator import UniEvaluator
-from .kg_builder.light_rag_kg_builder import LightRAGKGBuilder
+from .evaluator import LengthEvaluator, MTLDEvaluator, RewardEvaluator, UniEvaluator
+from .generator import (
+    AggregatedGenerator,
+    AtomicGenerator,
+    CoTGenerator,
+    MultiHopGenerator,
+)
+from .kg_builder import LightRAGKGBuilder
 from .llm.openai_client import OpenAIClient
 from .llm.topk_token_model import TopkTokenModel
+from .partitioner import (
+    BFSPartitioner,
+    DFSPartitioner,
+    ECEPartitioner,
+    LeidenPartitioner,
+)
 from .reader import CsvReader, JsonlReader, JsonReader, TxtReader
 from .search.db.uniprot_search import UniProtSearch
 from .search.kg.wiki_search import WikiSearch
 from .search.web.bing_search import BingSearch
 from .search.web.google_search import GoogleSearch
 from .splitter import ChineseRecursiveTextSplitter, RecursiveCharacterSplitter
-from .storage.json_storage import JsonKVStorage, JsonListStorage
-from .storage.networkx_storage import NetworkXStorage
+from .storage import JsonKVStorage, JsonListStorage, NetworkXStorage
 from .tokenizer import Tokenizer
graphgen/models/community/__init__.py DELETED
File without changes
graphgen/models/community/community_detector.py DELETED
@@ -1,95 +0,0 @@
-from collections import defaultdict
-from dataclasses import dataclass
-from typing import Any, Dict, List
-
-from graphgen.models.storage.networkx_storage import NetworkXStorage
-
-
-@dataclass
-class CommunityDetector:
-    """Class for community detection algorithms."""
-
-    graph_storage: NetworkXStorage = None
-    method: str = "leiden"
-    method_params: Dict[str, Any] = None
-
-    async def detect_communities(self) -> Dict[str, int]:
-        if self.method == "leiden":
-            return await self._leiden_communities(**self.method_params or {})
-        raise ValueError(f"Unknown community detection method: {self.method}")
-
-    async def get_graph(self):
-        return await self.graph_storage.get_graph()
-
-    async def _leiden_communities(
-        self, max_size: int = None, **kwargs
-    ) -> Dict[str, int]:
-        """
-        Detect communities using the Leiden algorithm.
-        If max_size is given, any community larger than max_size will be split
-        into smaller sub-communities each having at most max_size nodes.
-        """
-        import igraph as ig
-        import networkx as nx
-        from leidenalg import ModularityVertexPartition, find_partition
-
-        graph = await self.get_graph()
-        graph.remove_nodes_from(list(nx.isolates(graph)))
-
-        ig_graph = ig.Graph.TupleList(graph.edges(), directed=False)
-
-        random_seed = kwargs.get("random_seed", 42)
-        use_lcc = kwargs.get("use_lcc", False)
-
-        communities: Dict[str, int] = {}
-        if use_lcc:
-            lcc = ig_graph.components().giant()
-            partition = find_partition(lcc, ModularityVertexPartition, seed=random_seed)
-            for part, cluster in enumerate(partition):
-                for v in cluster:
-                    communities[lcc.vs[v]["name"]] = part
-        else:
-            offset = 0
-            for component in ig_graph.components():
-                subgraph = ig_graph.induced_subgraph(component)
-                partition = find_partition(
-                    subgraph, ModularityVertexPartition, seed=random_seed
-                )
-                for part, cluster in enumerate(partition):
-                    for v in cluster:
-                        original_node = subgraph.vs[v]["name"]
-                        communities[original_node] = part + offset
-                offset += len(partition)
-
-        # split large communities if max_size is specified
-        if max_size is None or max_size <= 0:
-            return communities
-
-        return await self._split_communities(communities, max_size)
-
-    @staticmethod
-    async def _split_communities(
-        communities: Dict[str, int], max_size: int
-    ) -> Dict[str, int]:
-        """
-        Split communities larger than max_size into smaller sub-communities.
-        """
-        cid2nodes: Dict[int, List[str]] = defaultdict(list)
-        for node, cid in communities.items():
-            cid2nodes[cid].append(node)
-
-        new_communities: Dict[str, int] = {}
-        new_cid = 0
-        for cid, nodes in cid2nodes.items():
-            if len(nodes) <= max_size:
-                for n in nodes:
-                    new_communities[n] = new_cid
-                new_cid += 1
-            else:
-                for start in range(0, len(nodes), max_size):
-                    sub = nodes[start : start + max_size]
-                    for n in sub:
-                        new_communities[n] = new_cid
-                    new_cid += 1
-
-        return new_communities
graphgen/models/evaluate/__init__.py DELETED
File without changes
graphgen/models/evaluator/__init__.py ADDED
@@ -0,0 +1,4 @@
+from .length_evaluator import LengthEvaluator
+from .mtld_evaluator import MTLDEvaluator
+from .reward_evaluator import RewardEvaluator
+from .uni_evaluator import UniEvaluator
graphgen/models/{evaluate → evaluator}/base_evaluator.py RENAMED
File without changes
graphgen/models/{evaluate → evaluator}/length_evaluator.py RENAMED
@@ -1,7 +1,7 @@
 from dataclasses import dataclass
 
 from graphgen.bases.datatypes import QAPair
-from graphgen.models.evaluate.base_evaluator import BaseEvaluator
+from graphgen.models.evaluator.base_evaluator import BaseEvaluator
 from graphgen.models.tokenizer import Tokenizer
 from graphgen.utils import create_event_loop
 
graphgen/models/{evaluate → evaluator}/mtld_evaluator.py RENAMED
@@ -2,7 +2,7 @@ from dataclasses import dataclass, field
 from typing import Set
 
 from graphgen.bases.datatypes import QAPair
-from graphgen.models.evaluate.base_evaluator import BaseEvaluator
+from graphgen.models.evaluator.base_evaluator import BaseEvaluator
 from graphgen.utils import NLTKHelper, create_event_loop, detect_main_language
 
 nltk_helper = NLTKHelper()
graphgen/models/{evaluate → evaluator}/reward_evaluator.py RENAMED
File without changes
graphgen/models/{evaluate → evaluator}/uni_evaluator.py RENAMED
File without changes
graphgen/models/generator/__init__.py ADDED
@@ -0,0 +1,4 @@
+from .aggregated_generator import AggregatedGenerator
+from .atomic_generator import AtomicGenerator
+from .cot_generator import CoTGenerator
+from .multi_hop_generator import MultiHopGenerator
graphgen/models/generator/aggregated_generator.py ADDED
@@ -0,0 +1,127 @@
+from dataclasses import dataclass
+from typing import Any
+
+from graphgen.bases import BaseGenerator
+from graphgen.templates import AGGREGATED_GENERATION_PROMPT
+from graphgen.utils import compute_content_hash, detect_main_language, logger
+
+
+@dataclass
+class AggregatedGenerator(BaseGenerator):
+    """
+    Aggregated Generator follows a TWO-STEP process:
+    1. rephrase: Rephrase the input nodes and edges into a coherent text that maintains the original meaning.
+       The rephrased text is considered as answer to be used in the next step.
+    2. question generation: Generate relevant questions based on the rephrased text.
+    """
+
+    @staticmethod
+    def build_prompt(
+        batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]]
+    ) -> str:
+        """
+        Build prompts for REPHRASE.
+        :param batch
+        :return:
+        """
+        nodes, edges = batch
+        entities_str = "\n".join(
+            [
+                f"{index + 1}. {node[0]}: {node[1]['description']}"
+                for index, node in enumerate(nodes)
+            ]
+        )
+        relations_str = "\n".join(
+            [
+                f"{index + 1}. {edge[0]} -- {edge[1]}: {edge[2]['description']}"
+                for index, edge in enumerate(edges)
+            ]
+        )
+        language = detect_main_language(entities_str + relations_str)
+
+        # TODO: configure add_context
+        # if add_context:
+        #     original_ids = [
+        #         node["source_id"].split("<SEP>")[0] for node in _process_nodes
+        #     ] + [edge[2]["source_id"].split("<SEP>")[0] for edge in _process_edges]
+        #     original_ids = list(set(original_ids))
+        #     original_text = await text_chunks_storage.get_by_ids(original_ids)
+        #     original_text = "\n".join(
+        #         [
+        #             f"{index + 1}. {text['content']}"
+        #             for index, text in enumerate(original_text)
+        #         ]
+        #     )
+        prompt = AGGREGATED_GENERATION_PROMPT[language]["ANSWER_REPHRASING"].format(
+            language=language, entities=entities_str, relationships=relations_str
+        )
+        return prompt
+
+    @staticmethod
+    def parse_rephrased_text(response: str) -> str:
+        """
+        Parse the rephrased text from the response.
+        :param response:
+        :return: rephrased text
+        """
+        if "Rephrased Text:" in response:
+            rephrased_text = response.split("Rephrased Text:")[1].strip()
+        elif "重述文本:" in response:
+            rephrased_text = response.split("重述文本:")[1].strip()
+        else:
+            rephrased_text = response.strip()
+        return rephrased_text.strip('"')
+
+    @staticmethod
+    def _build_prompt_for_question_generation(answer: str) -> str:
+        """
+        Build prompts for QUESTION GENERATION.
+        :param answer:
+        :return:
+        """
+        language = detect_main_language(answer)
+        prompt = AGGREGATED_GENERATION_PROMPT[language]["QUESTION_GENERATION"].format(
+            answer=answer
+        )
+        return prompt
+
+    @staticmethod
+    def parse_response(response: str) -> dict:
+        if response.startswith("Question:"):
+            question = response[len("Question:") :].strip()
+        elif response.startswith("问题:"):
+            question = response[len("问题:") :].strip()
+        else:
+            question = response.strip()
+        return {
+            "question": question,
+        }
+
+    async def generate(
+        self,
+        batch: tuple[
+            list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]]
+        ],
+    ) -> dict[str, Any]:
+        """
+        Generate QAs based on a given batch.
+        :param batch
+        :return: QA pairs
+        """
+        result = {}
+        rephrasing_prompt = self.build_prompt(batch)
+        response = await self.llm_client.generate_answer(rephrasing_prompt)
+        context = self.parse_rephrased_text(response)
+        question_generation_prompt = self._build_prompt_for_question_generation(context)
+        response = await self.llm_client.generate_answer(question_generation_prompt)
+        question = self.parse_response(response)["question"]
+        logger.info("Question: %s", question)
+        logger.info("Answer: %s", context)
+        qa_pairs = {
+            compute_content_hash(question): {
+                "question": question,
+                "answer": context,
+            }
+        }
+        result.update(qa_pairs)
+        return result
graphgen/models/generator/atomic_generator.py ADDED
@@ -0,0 +1,52 @@
+from dataclasses import dataclass
+from typing import Any
+
+from graphgen.bases import BaseGenerator
+from graphgen.templates import ATOMIC_GENERATION_PROMPT
+from graphgen.utils import compute_content_hash, detect_main_language, logger
+
+
+@dataclass
+class AtomicGenerator(BaseGenerator):
+    @staticmethod
+    def build_prompt(
+        batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]]
+    ) -> str:
+        nodes, edges = batch
+        context = ""
+        for node in nodes:
+            context += f"- {node[0]}: {node[1]['description']}\n"
+        for edge in edges:
+            context += f"- {edge[0]} - {edge[1]}: {edge[2]['description']}\n"
+        language = detect_main_language(context)
+
+        prompt = ATOMIC_GENERATION_PROMPT[language].format(context=context)
+        return prompt
+
+    @staticmethod
+    def parse_response(response: str) -> dict:
+        """
+        AtomicGenerator normally generates one QA pair per response.
+        So we just need to parse one QA pair from the response.
+        :param response:
+        :return:
+        """
+        if "Question:" in response and "Answer:" in response:
+            question = response.split("Question:")[1].split("Answer:")[0].strip()
+            answer = response.split("Answer:")[1].strip()
+        elif "问题:" in response and "答案:" in response:
+            question = response.split("问题:")[1].split("答案:")[0].strip()
+            answer = response.split("答案:")[1].strip()
+        else:
+            logger.warning("Failed to parse response: %s", response)
+            return {}
+        question = question.strip('"')
+        answer = answer.strip('"')
+        logger.info("Question: %s", question)
+        logger.info("Answer: %s", answer)
+        return {
+            compute_content_hash(question): {
+                "question": question,
+                "answer": answer,
+            }
+        }
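A minimal end-to-end sketch of the generator contract, using a canned stand-in client instead of a real BaseLLMClient and assuming the English prompt template key exists in graphgen.templates; all sample data below is illustrative:

import asyncio

from graphgen.models import AtomicGenerator


class FakeLLMClient:
    """Stand-in for BaseLLMClient; always returns one canned QA response."""

    async def generate_answer(self, prompt: str) -> str:
        return 'Question: "What does GraphGen partition?" Answer: "A knowledge graph."'


batch = (
    [("GraphGen", {"description": "A framework for synthetic QA generation."})],
    [("GraphGen", "knowledge graph", {"description": "GraphGen partitions a knowledge graph."})],
)

qa = asyncio.run(AtomicGenerator(llm_client=FakeLLMClient()).generate(batch))
# qa maps a content hash of the question to {"question": ..., "answer": ...}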
graphgen/models/generator/cot_generator.py ADDED
@@ -0,0 +1,122 @@
+from dataclasses import dataclass
+from typing import Any
+
+from graphgen.bases import BaseGenerator
+from graphgen.templates import COT_GENERATION_PROMPT
+from graphgen.utils import compute_content_hash, detect_main_language, logger
+
+
+@dataclass
+class CoTGenerator(BaseGenerator):
+    @staticmethod
+    def build_prompt(
+        batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]]
+    ) -> str:
+        """
+        Build prompts for COT Template Design.
+        :param batch:
+        :return:
+        """
+        nodes, edges = batch
+        entities_str = "\n".join(
+            [
+                f"{index + 1}. {node[0]}: {node[1]['description']}"
+                for index, node in enumerate(nodes)
+            ]
+        )
+        relationships_str = "\n".join(
+            [
+                f"{index + 1}. {edge[0]} -- {edge[1]}: {edge[2]['description']}"
+                for index, edge in enumerate(edges)
+            ]
+        )
+        language = detect_main_language(entities_str + relationships_str)
+        prompt = COT_GENERATION_PROMPT[language]["COT_TEMPLATE_DESIGN"].format(
+            entities=entities_str, relationships=relationships_str
+        )
+        return prompt
+
+    @staticmethod
+    def build_prompt_for_cot_generation(
+        batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]],
+        question: str,
+        reasoning_path: str,
+    ) -> str:
+        """
+        Build prompts for COT Generation.
+        """
+        nodes, edges = batch
+        entities_str = "\n".join(
+            [
+                f"{index + 1}. {node[0]}: {node[1]['description']}"
+                for index, node in enumerate(nodes)
+            ]
+        )
+        relationships_str = "\n".join(
+            [
+                f"{index + 1}. {edge[0]} -- {edge[1]}: {edge[2]['description']}"
+                for index, edge in enumerate(edges)
+            ]
+        )
+        language = detect_main_language(entities_str + relationships_str)
+        prompt = COT_GENERATION_PROMPT[language]["COT_GENERATION"].format(
+            entities=entities_str,
+            relationships=relationships_str,
+            question=question,
+            reasoning_template=reasoning_path,
+        )
+        return prompt
+
+    @staticmethod
+    def parse_response(response: str) -> dict:
+        if "Question:" in response and "Reasoning-Path Design:" in response:
+            question = (
+                response.split("Question:")[1]
+                .split("Reasoning-Path Design:")[0]
+                .strip()
+            )
+            reasoning_path = response.split("Reasoning-Path Design:")[1].strip()
+        elif "问题:" in response and "推理路径设计:" in response:
+            question = response.split("问题:")[1].split("推理路径设计:")[0].strip()
+            reasoning_path = response.split("推理路径设计:")[1].strip()
+        else:
+            logger.warning("Failed to parse CoT template: %s", response)
+            return {}
+
+        question = question.strip('"')
+        reasoning_path = reasoning_path.strip('"')
+        logger.info("CoT Question: %s", question)
+        logger.info("CoT Reasoning Path: %s", reasoning_path)
+        return {
+            "question": question,
+            "reasoning_path": reasoning_path,
+        }
+
+    async def generate(
+        self,
+        batch: tuple[
+            list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]]
+        ],
+    ) -> dict[str, Any]:
+        """
+        Generate QAs based on a given batch.
+        :param batch
+        :return: QA pairs
+        """
+        result = {}
+        prompt = self.build_prompt(batch)
+        response = await self.llm_client.generate_answer(prompt)
+        response = self.parse_response(response)
+        question, reasoning_path = response["question"], response["reasoning_path"]
+        prompt = self.build_prompt_for_cot_generation(batch, question, reasoning_path)
+        cot_answer = await self.llm_client.generate_answer(prompt)
+        logger.info("CoT Answer: %s", cot_answer)
+        qa_pairs = {
+            compute_content_hash(question): {
+                "question": question,
+                "answer": cot_answer,
+                "reasoning_path": reasoning_path,
+            }
+        }
+        result.update(qa_pairs)
+        return result
graphgen/models/generator/multi_hop_generator.py ADDED
@@ -0,0 +1,55 @@
+from dataclasses import dataclass
+from typing import Any
+
+from graphgen.bases import BaseGenerator
+from graphgen.templates import MULTI_HOP_GENERATION_PROMPT
+from graphgen.utils import compute_content_hash, detect_main_language, logger
+
+
+@dataclass
+class MultiHopGenerator(BaseGenerator):
+    @staticmethod
+    def build_prompt(
+        batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]]
+    ) -> str:
+        nodes, edges = batch
+        entities_str = "\n".join(
+            [
+                f"{index + 1}. {node[0]}: {node[1]['description']}"
+                for index, node in enumerate(nodes)
+            ]
+        )
+
+        relationships_str = "\n".join(
+            [
+                f"{index + 1}. {edge[0]} -- {edge[1]}: {edge[2]['description']}"
+                for index, edge in enumerate(edges)
+            ]
+        )
+        language = detect_main_language(entities_str + relationships_str)
+        prompt = MULTI_HOP_GENERATION_PROMPT[language].format(
+            entities=entities_str, relationships=relationships_str
+        )
+        return prompt
+
+    @staticmethod
+    def parse_response(response: str) -> dict:
+        if "Question:" in response and "Answer:" in response:
+            question = response.split("Question:")[1].split("Answer:")[0].strip()
+            answer = response.split("Answer:")[1].strip()
+        elif "问题:" in response and "答案:" in response:
+            question = response.split("问题:")[1].split("答案:")[0].strip()
+            answer = response.split("答案:")[1].strip()
+        else:
+            logger.warning("Failed to parse response: %s", response)
+            return {}
+        question = question.strip('"')
+        answer = answer.strip('"')
+        logger.info("Question: %s", question)
+        logger.info("Answer: %s", answer)
+        return {
+            compute_content_hash(question): {
+                "question": question,
+                "answer": answer,
+            }
+        }
graphgen/models/kg_builder/__init__.py CHANGED
@@ -0,0 +1 @@
+from .light_rag_kg_builder import LightRAGKGBuilder
graphgen/models/llm/limitter.py CHANGED
@@ -1,17 +1,17 @@
+import asyncio
 import time
 from datetime import datetime, timedelta
-import asyncio
 
 from graphgen.utils import logger
 
 
 class RPM:
-
     def __init__(self, rpm: int = 1000):
         self.rpm = rpm
-        self.record = {'rpm_slot': self.get_minute_slot(), 'counter': 0}
+        self.record = {"rpm_slot": self.get_minute_slot(), "counter": 0}
 
-    def get_minute_slot(self):
+    @staticmethod
+    def get_minute_slot():
         current_time = time.time()
         dt_object = datetime.fromtimestamp(current_time)
         total_minutes_since_midnight = dt_object.hour * 60 + dt_object.minute
@@ -22,37 +22,35 @@ class RPM:
         dt_object = datetime.fromtimestamp(current)
         minute_slot = self.get_minute_slot()
 
-        if self.record['rpm_slot'] == minute_slot:
+        if self.record["rpm_slot"] == minute_slot:
             # check RPM exceed
-            if self.record['counter'] >= self.rpm:
+            if self.record["counter"] >= self.rpm:
                 # wait until next minute
-                next_minute = dt_object.replace(
-                    second=0, microsecond=0) + timedelta(minutes=1)
+                next_minute = dt_object.replace(second=0, microsecond=0) + timedelta(
+                    minutes=1
+                )
                 _next = next_minute.timestamp()
                 sleep_time = abs(_next - current)
                 if not silent:
-                    logger.info('RPM sleep %s', sleep_time)
+                    logger.info("RPM sleep %s", sleep_time)
                 await asyncio.sleep(sleep_time)
 
-                self.record = {
-                    'rpm_slot': self.get_minute_slot(),
-                    'counter': 0
-                }
+                self.record = {"rpm_slot": self.get_minute_slot(), "counter": 0}
         else:
-            self.record = {'rpm_slot': self.get_minute_slot(), 'counter': 0}
-        self.record['counter'] += 1
+            self.record = {"rpm_slot": self.get_minute_slot(), "counter": 0}
+        self.record["counter"] += 1
 
         if not silent:
             logger.debug(self.record)
 
 
 class TPM:
-
     def __init__(self, tpm: int = 20000):
         self.tpm = tpm
-        self.record = {'tpm_slot': self.get_minute_slot(), 'counter': 0}
+        self.record = {"tpm_slot": self.get_minute_slot(), "counter": 0}
 
-    def get_minute_slot(self):
+    @staticmethod
+    def get_minute_slot():
         current_time = time.time()
         dt_object = datetime.fromtimestamp(current_time)
         total_minutes_since_midnight = dt_object.hour * 60 + dt_object.minute
@@ -64,25 +62,25 @@ class TPM:
         minute_slot = self.get_minute_slot()
 
         # get next slot, skip
-        if self.record['tpm_slot'] != minute_slot:
-            self.record = {'tpm_slot': minute_slot, 'counter': token_count}
+        if self.record["tpm_slot"] != minute_slot:
+            self.record = {"tpm_slot": minute_slot, "counter": token_count}
             return
 
         # check RPM exceed
-        self.record['counter'] += token_count
-        if self.record['counter'] > self.tpm:
+        old_counter = self.record["counter"]
+        self.record["counter"] += token_count
+        if self.record["counter"] > self.tpm:
+            logger.info("Current TPM: %s, limit: %s", old_counter, self.tpm)
            # wait until next minute
-            next_minute = dt_object.replace(
-                second=0, microsecond=0) + timedelta(minutes=1)
+            next_minute = dt_object.replace(second=0, microsecond=0) + timedelta(
+                minutes=1
+            )
             _next = next_minute.timestamp()
             sleep_time = abs(_next - current)
-            logger.info('TPM sleep %s', sleep_time)
+            logger.warning("TPM limit exceeded, wait %s seconds", sleep_time)
             await asyncio.sleep(sleep_time)
 
-            self.record = {
-                'tpm_slot': self.get_minute_slot(),
-                'counter': token_count
-            }
+            self.record = {"tpm_slot": self.get_minute_slot(), "counter": token_count}
 
         if not silent:
             logger.debug(self.record)
graphgen/models/llm/openai_client.py CHANGED
@@ -39,6 +39,8 @@ class OpenAIClient(BaseLLMClient):
         seed: Optional[int] = None,
         topk_per_token: int = 5,  # number of topk tokens to generate for each token
         request_limit: bool = False,
+        rpm: Optional[RPM] = None,
+        tpm: Optional[TPM] = None,
         **kwargs: Any,
     ):
         super().__init__(**kwargs)
@@ -51,8 +53,8 @@ class OpenAIClient(BaseLLMClient):
 
         self.token_usage: list = []
         self.request_limit = request_limit
-        self.rpm = RPM(rpm=1000)
-        self.tpm = TPM(tpm=50000)
+        self.rpm = rpm or RPM()
+        self.tpm = tpm or TPM()
 
         self.__post_init__()
 
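With this change the client no longer hard-codes RPM(rpm=1000) and TPM(tpm=50000); callers can inject their own limiters. A minimal sketch of the injection; the remaining OpenAIClient kwargs come from BaseLLMClient and are not shown in this diff, so they are omitted here:

from graphgen.models import OpenAIClient
from graphgen.models.llm.limitter import RPM, TPM

# Match the limiter budgets to the provider's quota; the defaults are RPM() and TPM().
client = OpenAIClient(
    request_limit=True,   # enable rate limiting
    rpm=RPM(rpm=500),     # requests per minute
    tpm=TPM(tpm=100000),  # tokens per minute, matching the new demo TPM default in app.py
)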
graphgen/models/partitioner/__init__.py ADDED
@@ -0,0 +1,4 @@
+from .bfs_partitioner import BFSPartitioner
+from .dfs_partitioner import DFSPartitioner
+from .ece_partitioner import ECEPartitioner
+from .leiden_partitioner import LeidenPartitioner
graphgen/models/partitioner/bfs_partitioner.py ADDED
@@ -0,0 +1,83 @@
+import random
+from collections import deque
+from dataclasses import dataclass
+from typing import Any, List
+
+from graphgen.bases import BaseGraphStorage, BasePartitioner
+from graphgen.bases.datatypes import Community
+
+NODE_UNIT: str = "n"
+EDGE_UNIT: str = "e"
+
+
+@dataclass
+class BFSPartitioner(BasePartitioner):
+    """
+    BFS partitioner that partitions the graph into communities of a fixed size.
+    1. Randomly choose a unit.
+    2. Expand the community using BFS until the max unit size is reached.
+    (A unit is a node or an edge.)
+    """
+
+    async def partition(
+        self,
+        g: BaseGraphStorage,
+        max_units_per_community: int = 1,
+        **kwargs: Any,
+    ) -> List[Community]:
+        nodes = await g.get_all_nodes()
+        edges = await g.get_all_edges()
+
+        adj, _ = self._build_adjacency_list(nodes, edges)
+
+        used_n: set[str] = set()
+        used_e: set[frozenset[str]] = set()
+        communities: List[Community] = []
+
+        units = [(NODE_UNIT, n[0]) for n in nodes] + [
+            (EDGE_UNIT, frozenset((u, v))) for u, v, _ in edges
+        ]
+        random.shuffle(units)
+
+        for kind, seed in units:
+            if (kind == NODE_UNIT and seed in used_n) or (
+                kind == EDGE_UNIT and seed in used_e
+            ):
+                continue
+
+            comm_n: List[str] = []
+            comm_e: List[tuple[str, str]] = []
+            queue: deque[tuple[str, Any]] = deque([(kind, seed)])
+            cnt = 0
+
+            while queue and cnt < max_units_per_community:
+                k, it = queue.popleft()
+                if k == NODE_UNIT:
+                    if it in used_n:
+                        continue
+                    used_n.add(it)
+                    comm_n.append(it)
+                    cnt += 1
+                    for nei in adj[it]:
+                        e_key = frozenset((it, nei))
+                        if e_key not in used_e:
+                            queue.append((EDGE_UNIT, e_key))
+                else:
+                    if it in used_e:
+                        continue
+                    used_e.add(it)
+
+                    u, v = it
+                    comm_e.append((u, v))
+                    cnt += 1
+                    # push nodes that are not visited
+                    for n in it:
+                        if n not in used_n:
+                            queue.append((NODE_UNIT, n))
+
+            if comm_n or comm_e:
+                communities.append(
+                    Community(id=len(communities), nodes=comm_n, edges=comm_e)
+                )
+
+        return communities
graphgen/models/partitioner/dfs_partitioner.py ADDED
@@ -0,0 +1,80 @@
+import random
+from dataclasses import dataclass
+from typing import Any, List
+
+from graphgen.bases import BaseGraphStorage, BasePartitioner
+from graphgen.bases.datatypes import Community
+
+NODE_UNIT: str = "n"
+EDGE_UNIT: str = "e"
+
+
+@dataclass
+class DFSPartitioner(BasePartitioner):
+    """
+    DFS partitioner that partitions the graph into communities of a fixed size.
+    1. Randomly choose a unit.
+    2. Random walk using DFS until the community reaches the max unit size.
+    (In GraphGen, a unit is defined as a node or an edge.)
+    """
+
+    async def partition(
+        self,
+        g: BaseGraphStorage,
+        max_units_per_community: int = 1,
+        **kwargs: Any,
+    ) -> List[Community]:
+        nodes = await g.get_all_nodes()
+        edges = await g.get_all_edges()
+
+        adj, _ = self._build_adjacency_list(nodes, edges)
+
+        used_n: set[str] = set()
+        used_e: set[frozenset[str]] = set()
+        communities: List[Community] = []
+
+        units = [(NODE_UNIT, n[0]) for n in nodes] + [
+            (EDGE_UNIT, frozenset((u, v))) for u, v, _ in edges
+        ]
+        random.shuffle(units)
+
+        for kind, seed in units:
+            if (kind == NODE_UNIT and seed in used_n) or (
+                kind == EDGE_UNIT and seed in used_e
+            ):
+                continue
+
+            comm_n, comm_e = [], []
+            stack = [(kind, seed)]
+            cnt = 0
+
+            while stack and cnt < max_units_per_community:
+                k, it = stack.pop()
+                if k == NODE_UNIT:
+                    if it in used_n:
+                        continue
+                    used_n.add(it)
+                    comm_n.append(it)
+                    cnt += 1
+                    for nei in adj[it]:
+                        e_key = frozenset((it, nei))
+                        if e_key not in used_e:
+                            stack.append((EDGE_UNIT, e_key))
+                            break
+                else:
+                    if it in used_e:
+                        continue
+                    used_e.add(it)
+                    comm_e.append(tuple(it))
+                    cnt += 1
+                    # push neighboring nodes
+                    for n in it:
+                        if n not in used_n:
+                            stack.append((NODE_UNIT, n))
+
+            if comm_n or comm_e:
+                communities.append(
+                    Community(id=len(communities), nodes=comm_n, edges=comm_e)
+                )
+
+        return communities
graphgen/models/partitioner/ece_partitioner.py ADDED
@@ -0,0 +1,163 @@
+import asyncio
+import random
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Set, Tuple
+
+from tqdm.asyncio import tqdm as tqdm_async
+
+from graphgen.bases import BaseGraphStorage
+from graphgen.bases.datatypes import Community
+from graphgen.models.partitioner.bfs_partitioner import BFSPartitioner
+
+NODE_UNIT: str = "n"
+EDGE_UNIT: str = "e"
+
+
+@dataclass
+class ECEPartitioner(BFSPartitioner):
+    """
+    ECE partitioner that partitions the graph into communities based on Expected Calibration Error (ECE).
+    We calculate ECE for edges in the KG (represented as 'comprehension loss')
+    and group edges with similar ECE values into the same community.
+    1. Select a sampling strategy.
+    2. Choose a unit based on the sampling strategy.
+    3. Expand the community using BFS.
+    4. When expanding, prefer to add units according to the sampling strategy.
+    5. Stop when the max unit size is reached or the max input length is reached.
+    (A unit is a node or an edge.)
+    """
+
+    @staticmethod
+    def _sort_units(units: list, edge_sampling: str) -> list:
+        """
+        Sort units with edge sampling strategy
+
+        :param units: total units
+        :param edge_sampling: edge sampling strategy (random, min_loss, max_loss)
+        :return: sorted units
+        """
+        if edge_sampling == "random":
+            random.shuffle(units)
+        elif edge_sampling == "min_loss":
+            units = sorted(
+                units,
+                key=lambda x: x[-1]["loss"],
+            )
+        elif edge_sampling == "max_loss":
+            units = sorted(
+                units,
+                key=lambda x: x[-1]["loss"],
+                reverse=True,
+            )
+        else:
+            raise ValueError(f"Invalid edge sampling: {edge_sampling}")
+        return units
+
+    async def partition(
+        self,
+        g: BaseGraphStorage,
+        max_units_per_community: int = 10,
+        min_units_per_community: int = 1,
+        max_tokens_per_community: int = 10240,
+        unit_sampling: str = "random",
+        **kwargs: Any,
+    ) -> List[Community]:
+        nodes: List[Tuple[str, dict]] = await g.get_all_nodes()
+        edges: List[Tuple[str, str, dict]] = await g.get_all_edges()
+
+        adj, _ = self._build_adjacency_list(nodes, edges)
+        node_dict = dict(nodes)
+        edge_dict = {frozenset((u, v)): d for u, v, d in edges}
+
+        all_units: List[Tuple[str, Any, dict]] = [
+            (NODE_UNIT, nid, d) for nid, d in nodes
+        ] + [(EDGE_UNIT, frozenset((u, v)), d) for u, v, d in edges]
+
+        used_n: Set[str] = set()
+        used_e: Set[frozenset[str]] = set()
+        communities: List = []
+
+        all_units = self._sort_units(all_units, unit_sampling)
+
+        async def _grow_community(
+            seed_unit: Tuple[str, Any, dict]
+        ) -> Optional[Community]:
+            nonlocal used_n, used_e
+
+            community_nodes: Dict[str, dict] = {}
+            community_edges: Dict[frozenset[str], dict] = {}
+            queue: asyncio.Queue = asyncio.Queue()
+            token_sum = 0
+
+            async def _add_unit(u):
+                nonlocal token_sum
+                t, i, d = u
+                if t == NODE_UNIT:  # node
+                    if i in used_n or i in community_nodes:
+                        return False
+                    community_nodes[i] = d
+                    used_n.add(i)
+                else:  # edge
+                    if i in used_e or i in community_edges:
+                        return False
+                    community_edges[i] = d
+                    used_e.add(i)
+                token_sum += d.get("length", 0)
+                return True
+
+            await _add_unit(seed_unit)
+            await queue.put(seed_unit)
+
+            # BFS
+            while not queue.empty():
+                if (
+                    len(community_nodes) + len(community_edges)
+                    >= max_units_per_community
+                    or token_sum >= max_tokens_per_community
+                ):
+                    break
+
+                cur_type, cur_id, _ = await queue.get()
+
+                neighbors: List[Tuple[str, Any, dict]] = []
+                if cur_type == NODE_UNIT:
+                    for nb_id in adj.get(cur_id, []):
+                        e_key = frozenset((cur_id, nb_id))
+                        if e_key not in used_e and e_key not in community_edges:
+                            neighbors.append((EDGE_UNIT, e_key, edge_dict[e_key]))
+                else:
+                    for n_id in cur_id:
+                        if n_id not in used_n and n_id not in community_nodes:
+                            neighbors.append((NODE_UNIT, n_id, node_dict[n_id]))
+
+                neighbors = self._sort_units(neighbors, unit_sampling)
+                for nb in neighbors:
+                    if (
+                        len(community_nodes) + len(community_edges)
+                        >= max_units_per_community
+                        or token_sum >= max_tokens_per_community
+                    ):
+                        break
+                    if await _add_unit(nb):
+                        await queue.put(nb)
+
+            if len(community_nodes) + len(community_edges) < min_units_per_community:
+                return None
+
+            return Community(
+                id=len(communities),
+                nodes=list(community_nodes.keys()),
+                edges=[(u, v) for (u, v), _ in community_edges.items()],
+            )
+
+        async for unit in tqdm_async(all_units, desc="ECE partition"):
+            utype, uid, _ = unit
+            if (utype == NODE_UNIT and uid in used_n) or (
+                utype == EDGE_UNIT and uid in used_e
+            ):
+                continue
+            comm = await _grow_community(unit)
+            if comm is not None:
+                communities.append(comm)
+
+        return communities
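One contract worth noting: when unit_sampling is min_loss or max_loss, every node and edge dict must carry a "loss" value (the comprehension loss from the quiz/judge step), and the token budget reads a "length" value, presumably filled by the new pre_tokenize operator. A tiny illustration of the sorting helper; the data is made up and _sort_units is an internal method shown here only to make the ordering concrete:

from graphgen.models import ECEPartitioner

units = [
    ("e", frozenset(("A", "B")), {"loss": 0.9, "length": 40}),
    ("n", "A", {"loss": 0.2, "length": 12}),
]
ordered = ECEPartitioner._sort_units(units, "max_loss")
# The high-loss A--B edge sorts first, so it seeds and expands a community before node A.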
graphgen/models/partitioner/leiden_partitioner.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import defaultdict
2
+ from dataclasses import dataclass
3
+ from typing import Any, Dict, List, Set, Tuple
4
+
5
+ import igraph as ig
6
+ from leidenalg import ModularityVertexPartition, find_partition
7
+
8
+ from graphgen.bases import BaseGraphStorage, BasePartitioner
9
+ from graphgen.bases.datatypes import Community
10
+
11
+
12
+ @dataclass
13
+ class LeidenPartitioner(BasePartitioner):
14
+ """
15
+ Leiden partitioner that partitions the graph into communities using the Leiden algorithm.
16
+ """
17
+
18
+ async def partition(
19
+ self,
20
+ g: BaseGraphStorage,
21
+ max_size: int = 20,
22
+ use_lcc: bool = False,
23
+ random_seed: int = 42,
24
+ **kwargs: Any,
25
+ ) -> List[Community]:
26
+ """
27
+ Leiden Partition follows these steps:
28
+ 1. export the graph from graph storage
29
+ 2. use the leiden algorithm to detect communities, get {node: community_id}
30
+ 3. split large communities if max_size is given
31
+ 4. convert {node: community_id} to List[Community]
32
+ :param g: graph storage containing the knowledge graph to partition
33
+ :param max_size: maximum size of each community; if None or <=0, no limit
34
+ :param use_lcc: whether to use the largest connected component only
35
+ :param random_seed: random seed for the Leiden algorithm
36
+ :param kwargs: other parameters for the leiden algorithm
37
+ :return: list of detected communities
38
+ """
39
+ nodes = await g.get_all_nodes() # List[Tuple[str, dict]]
40
+ edges = await g.get_all_edges() # List[Tuple[str, str, dict]]
41
+
42
+ node2cid: Dict[str, int] = await self._run_leiden(
43
+ nodes, edges, use_lcc, random_seed
44
+ )
45
+
46
+ if max_size is not None and max_size > 0:
47
+ node2cid = await self._split_communities(node2cid, max_size)
48
+
49
+ cid2nodes: Dict[int, List[str]] = defaultdict(list)
50
+ for n, cid in node2cid.items():
51
+ cid2nodes[cid].append(n)
52
+
53
+ communities: List[Community] = []
54
+ for cid, comm_nodes in cid2nodes.items():
55
+ node_set: Set[str] = set(comm_nodes)
56
+ comm_edges: List[Tuple[str, str]] = [
57
+ (u, v) for u, v, _ in edges if u in node_set and v in node_set
58
+ ]
59
+ communities.append(Community(id=cid, nodes=comm_nodes, edges=comm_edges))
60
+ return communities
61
+
62
+ @staticmethod
63
+ async def _run_leiden(
64
+ nodes: List[Tuple[str, dict]],
65
+ edges: List[Tuple[str, str, dict]],
66
+ use_lcc: bool = False,
67
+ random_seed: int = 42,
68
+ ) -> Dict[str, int]:
69
+ # build igraph
70
+ ig_graph = ig.Graph.TupleList(((u, v) for u, v, _ in edges), directed=False)
71
+
72
+ # remove isolated nodes
73
+ ig_graph.delete_vertices(ig_graph.vs.select(_degree_eq=0))
74
+
75
+ node2cid: Dict[str, int] = {}
76
+ if use_lcc:
77
+ lcc = ig_graph.components().giant()
78
+ partition = find_partition(lcc, ModularityVertexPartition, seed=random_seed)
79
+ for part_id, cluster in enumerate(partition):
80
+ for v in cluster:
81
+ node2cid[lcc.vs[v]["name"]] = part_id
82
+ else:
83
+ offset = 0
84
+ for component in ig_graph.components():
85
+ subgraph = ig_graph.induced_subgraph(component)
86
+ partition = find_partition(
87
+ subgraph, ModularityVertexPartition, seed=random_seed
88
+ )
89
+ for part_id, cluster in enumerate(partition):
90
+ for v in cluster:
91
+ original_node = subgraph.vs[v]["name"]
92
+ node2cid[original_node] = part_id + offset
93
+ offset += len(partition)
94
+ return node2cid
95
+
96
+ @staticmethod
97
+ async def _split_communities(
98
+ node2cid: Dict[str, int], max_size: int
99
+ ) -> Dict[str, int]:
100
+ """
101
+ Split communities larger than max_size into smaller sub-communities.
102
+ """
103
+ cid2nodes: Dict[int, List[str]] = defaultdict(list)
104
+ for n, cid in node2cid.items():
105
+ cid2nodes[cid].append(n)
106
+
107
+ new_mapping: Dict[str, int] = {}
108
+ new_cid = 0
109
+ for nodes in cid2nodes.values():
110
+ if len(nodes) <= max_size:
111
+ for n in nodes:
112
+ new_mapping[n] = new_cid
113
+ new_cid += 1
114
+ else:
115
+ for start in range(0, len(nodes), max_size):
116
+ chunk = nodes[start : start + max_size]
117
+ for n in chunk:
118
+ new_mapping[n] = new_cid
119
+ new_cid += 1
120
+ return new_mapping
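For reference, a minimal standalone run of the same igraph/leidenalg calls used in _run_leiden above (the toy edge list is purely illustrative):

import igraph as ig
from leidenalg import ModularityVertexPartition, find_partition

# Two dense triangles joined by a single bridge edge; the clusters should be obvious.
edges = [("a", "b"), ("b", "c"), ("a", "c"),
         ("x", "y"), ("y", "z"), ("x", "z"),
         ("c", "x")]
g = ig.Graph.TupleList(edges, directed=False)

partition = find_partition(g, ModularityVertexPartition, seed=42)
node2cid = {
    g.vs[v]["name"]: cid
    for cid, cluster in enumerate(partition)
    for v in cluster
}
print(node2cid)  # e.g. {'a': 0, 'b': 0, 'c': 0, 'x': 1, 'y': 1, 'z': 1}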
graphgen/models/storage/__init__.py CHANGED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .json_storage import JsonKVStorage, JsonListStorage
2
+ from .networkx_storage import NetworkXStorage
graphgen/models/storage/networkx_storage.py CHANGED
@@ -102,8 +102,8 @@ class NetworkXStorage(BaseGraphStorage):
102
  async def get_node(self, node_id: str) -> Union[dict, None]:
103
  return self._graph.nodes.get(node_id)
104
 
105
- async def get_all_nodes(self) -> Union[list[dict], None]:
106
- return self._graph.nodes(data=True)
107
 
108
  async def node_degree(self, node_id: str) -> int:
109
  return self._graph.degree(node_id)
@@ -116,8 +116,8 @@ class NetworkXStorage(BaseGraphStorage):
116
  ) -> Union[dict, None]:
117
  return self._graph.edges.get((source_node_id, target_node_id))
118
 
119
- async def get_all_edges(self) -> Union[list[dict], None]:
120
- return self._graph.edges(data=True)
121
 
122
  async def get_node_edges(
123
  self, source_node_id: str
 
102
  async def get_node(self, node_id: str) -> Union[dict, None]:
103
  return self._graph.nodes.get(node_id)
104
 
105
+ async def get_all_nodes(self) -> Union[list[tuple[str, dict]], None]:
106
+ return list(self._graph.nodes(data=True))
107
 
108
  async def node_degree(self, node_id: str) -> int:
109
  return self._graph.degree(node_id)
 
116
  ) -> Union[dict, None]:
117
  return self._graph.edges.get((source_node_id, target_node_id))
118
 
119
+ async def get_all_edges(self) -> Union[list[tuple[str, str, dict]], None]:
120
+ return list(self._graph.edges(data=True))
121
 
122
  async def get_node_edges(
123
  self, source_node_id: str
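The list(...) wrapping in the new return statements matters because G.nodes(data=True) and G.edges(data=True) are live views. A small illustration with plain networkx (toy graph, illustrative values):

import networkx as nx

g = nx.Graph()
g.add_edge("a", "b", description="a-b")

view = g.edges(data=True)            # live view: reflects later mutations, not indexable
snapshot = list(g.edges(data=True))  # plain list[tuple[str, str, dict]], frozen at call time

g.add_edge("b", "c", description="b-c")
print(len(view), len(snapshot))      # 2 1 -> the snapshot stays stable for the partitioners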
graphgen/operators/__init__.py CHANGED
@@ -1,13 +1,8 @@
1
- from graphgen.operators.build_kg.build_kg import build_kg
2
- from graphgen.operators.generate.generate_cot import generate_cot
3
- from graphgen.operators.search.search_all import search_all
4
-
5
  from .judge import judge_statement
 
6
  from .quiz import quiz
7
  from .read import read_files
 
8
  from .split import chunk_documents
9
- from .traverse_graph import (
10
- traverse_graph_for_aggregated,
11
- traverse_graph_for_atomic,
12
- traverse_graph_for_multi_hop,
13
- )
 
1
+ from .build_kg import build_kg
2
+ from .generate import generate_qas
 
 
3
  from .judge import judge_statement
4
+ from .partition import partition_kg
5
  from .quiz import quiz
6
  from .read import read_files
7
+ from .search import search_all
8
  from .split import chunk_documents
 
 
 
 
 
graphgen/operators/build_kg/__init__.py CHANGED
@@ -0,0 +1 @@
 
 
1
+ from .build_kg import build_kg
graphgen/operators/build_kg/split_kg.py DELETED
@@ -1,382 +0,0 @@
1
- import random
2
- from collections import defaultdict
3
- from typing import Dict
4
-
5
- from tqdm.asyncio import tqdm as tqdm_async
6
-
7
- from graphgen.models import NetworkXStorage
8
- from graphgen.utils import logger
9
-
10
-
11
- async def _get_node_info(
12
- node_id: str,
13
- graph_storage: NetworkXStorage,
14
- ) -> dict:
15
- """
16
- Get node info
17
-
18
- :param node_id: node id
19
- :param graph_storage: graph storage instance
20
- :return: node info
21
- """
22
- node_data = await graph_storage.get_node(node_id)
23
- return {"node_id": node_id, **node_data}
24
-
25
-
26
- def _get_level_n_edges_by_max_width(
27
- edge_adj_list: dict,
28
- node_dict: dict,
29
- edges: list,
30
- nodes,
31
- src_edge: tuple,
32
- max_depth: int,
33
- bidirectional: bool,
34
- max_extra_edges: int,
35
- edge_sampling: str,
36
- loss_strategy: str = "only_edge",
37
- ) -> list:
38
- """
39
- Get level n edges for an edge.
40
- n is decided by max_depth in traverse_strategy
41
-
42
- :param edge_adj_list
43
- :param node_dict
44
- :param edges
45
- :param nodes
46
- :param src_edge
47
- :param max_depth
48
- :param bidirectional
49
- :param max_extra_edges
50
- :param edge_sampling
51
- :return: level n edges
52
- """
53
- src_id, tgt_id, _ = src_edge
54
-
55
- level_n_edges = []
56
-
57
- start_nodes = {tgt_id} if not bidirectional else {src_id, tgt_id}
58
-
59
- while max_depth > 0 and max_extra_edges > 0:
60
- max_depth -= 1
61
-
62
- candidate_edges = [
63
- edges[edge_id]
64
- for node in start_nodes
65
- for edge_id in edge_adj_list[node]
66
- if not edges[edge_id][2].get("visited", False)
67
- ]
68
-
69
- if not candidate_edges:
70
- break
71
-
72
- if len(candidate_edges) >= max_extra_edges:
73
- if loss_strategy == "both":
74
- er_tuples = [
75
- ([nodes[node_dict[edge[0]]], nodes[node_dict[edge[1]]]], edge)
76
- for edge in candidate_edges
77
- ]
78
- candidate_edges = _sort_tuples(er_tuples, edge_sampling)[
79
- :max_extra_edges
80
- ]
81
- elif loss_strategy == "only_edge":
82
- candidate_edges = _sort_edges(candidate_edges, edge_sampling)[
83
- :max_extra_edges
84
- ]
85
- else:
86
- raise ValueError(f"Invalid loss strategy: {loss_strategy}")
87
-
88
- for edge in candidate_edges:
89
- level_n_edges.append(edge)
90
- edge[2]["visited"] = True
91
- break
92
-
93
- max_extra_edges -= len(candidate_edges)
94
- new_start_nodes = set()
95
-
96
- for edge in candidate_edges:
97
- level_n_edges.append(edge)
98
- edge[2]["visited"] = True
99
-
100
- if not edge[0] in start_nodes:
101
- new_start_nodes.add(edge[0])
102
- if not edge[1] in start_nodes:
103
- new_start_nodes.add(edge[1])
104
-
105
- start_nodes = new_start_nodes
106
-
107
- return level_n_edges
108
-
109
-
110
- def _get_level_n_edges_by_max_tokens(
111
- edge_adj_list: dict,
112
- node_dict: dict,
113
- edges: list,
114
- nodes: list,
115
- src_edge: tuple,
116
- max_depth: int,
117
- bidirectional: bool,
118
- max_tokens: int,
119
- edge_sampling: str,
120
- loss_strategy: str = "only_edge",
121
- ) -> list:
122
- """
123
- Get level n edges for an edge.
124
- n is decided by max_depth in traverse_strategy.
125
-
126
- :param edge_adj_list
127
- :param node_dict
128
- :param edges
129
- :param nodes
130
- :param src_edge
131
- :param max_depth
132
- :param bidirectional
133
- :param max_tokens
134
- :param edge_sampling
135
- :return: level n edges
136
- """
137
- src_id, tgt_id, src_edge_data = src_edge
138
-
139
- max_tokens -= (
140
- src_edge_data["length"]
141
- + nodes[node_dict[src_id]][1]["length"]
142
- + nodes[node_dict[tgt_id]][1]["length"]
143
- )
144
-
145
- level_n_edges = []
146
-
147
- start_nodes = {tgt_id} if not bidirectional else {src_id, tgt_id}
148
- temp_nodes = {src_id, tgt_id}
149
-
150
- while max_depth > 0 and max_tokens > 0:
151
- max_depth -= 1
152
-
153
- candidate_edges = [
154
- edges[edge_id]
155
- for node in start_nodes
156
- for edge_id in edge_adj_list[node]
157
- if not edges[edge_id][2].get("visited", False)
158
- ]
159
-
160
- if not candidate_edges:
161
- break
162
-
163
- if loss_strategy == "both":
164
- er_tuples = [
165
- ([nodes[node_dict[edge[0]]], nodes[node_dict[edge[1]]]], edge)
166
- for edge in candidate_edges
167
- ]
168
- candidate_edges = _sort_tuples(er_tuples, edge_sampling)
169
- elif loss_strategy == "only_edge":
170
- candidate_edges = _sort_edges(candidate_edges, edge_sampling)
171
- else:
172
- raise ValueError(f"Invalid loss strategy: {loss_strategy}")
173
-
174
- for edge in candidate_edges:
175
- max_tokens -= edge[2]["length"]
176
- if not edge[0] in temp_nodes:
177
- max_tokens -= nodes[node_dict[edge[0]]][1]["length"]
178
- if not edge[1] in temp_nodes:
179
- max_tokens -= nodes[node_dict[edge[1]]][1]["length"]
180
-
181
- if max_tokens < 0:
182
- return level_n_edges
183
-
184
- level_n_edges.append(edge)
185
- edge[2]["visited"] = True
186
- temp_nodes.add(edge[0])
187
- temp_nodes.add(edge[1])
188
-
189
- new_start_nodes = set()
190
- for edge in candidate_edges:
191
- if not edge[0] in start_nodes:
192
- new_start_nodes.add(edge[0])
193
- if not edge[1] in start_nodes:
194
- new_start_nodes.add(edge[1])
195
-
196
- start_nodes = new_start_nodes
197
-
198
- return level_n_edges
199
-
200
-
201
- def _sort_tuples(er_tuples: list, edge_sampling: str) -> list:
202
- """
203
- Sort edges with edge sampling strategy
204
-
205
- :param er_tuples: [(nodes:list, edge:tuple)]
206
- :param edge_sampling: edge sampling strategy (random, min_loss, max_loss)
207
- :return: sorted edges
208
- """
209
- if edge_sampling == "random":
210
- er_tuples = random.sample(er_tuples, len(er_tuples))
211
- elif edge_sampling == "min_loss":
212
- er_tuples = sorted(
213
- er_tuples,
214
- key=lambda x: sum(node[1]["loss"] for node in x[0]) + x[1][2]["loss"],
215
- )
216
- elif edge_sampling == "max_loss":
217
- er_tuples = sorted(
218
- er_tuples,
219
- key=lambda x: sum(node[1]["loss"] for node in x[0]) + x[1][2]["loss"],
220
- reverse=True,
221
- )
222
- else:
223
- raise ValueError(f"Invalid edge sampling: {edge_sampling}")
224
- edges = [edge for _, edge in er_tuples]
225
- return edges
226
-
227
-
228
- def _sort_edges(edges: list, edge_sampling: str) -> list:
229
- """
230
- Sort edges with edge sampling strategy
231
-
232
- :param edges: total edges
233
- :param edge_sampling: edge sampling strategy (random, min_loss, max_loss)
234
- :return: sorted edges
235
- """
236
- if edge_sampling == "random":
237
- random.shuffle(edges)
238
- elif edge_sampling == "min_loss":
239
- edges = sorted(edges, key=lambda x: x[2]["loss"])
240
- elif edge_sampling == "max_loss":
241
- edges = sorted(edges, key=lambda x: x[2]["loss"], reverse=True)
242
- else:
243
- raise ValueError(f"Invalid edge sampling: {edge_sampling}")
244
- return edges
245
-
246
-
247
- async def get_batches_with_strategy( # pylint: disable=too-many-branches
248
- nodes: list,
249
- edges: list,
250
- graph_storage: NetworkXStorage,
251
- traverse_strategy: Dict,
252
- ):
253
- expand_method = traverse_strategy["expand_method"]
254
- if expand_method == "max_width":
255
- logger.info("Using max width strategy")
256
- elif expand_method == "max_tokens":
257
- logger.info("Using max tokens strategy")
258
- else:
259
- raise ValueError(f"Invalid expand method: {expand_method}")
260
-
261
- max_depth = traverse_strategy["max_depth"]
262
- edge_sampling = traverse_strategy["edge_sampling"]
263
-
264
- # 构建临接矩阵
265
- edge_adj_list = defaultdict(list)
266
- node_dict = {}
267
- processing_batches = []
268
-
269
- node_cache = {}
270
-
271
- async def get_cached_node_info(node_id: str) -> dict:
272
- if node_id not in node_cache:
273
- node_cache[node_id] = await _get_node_info(node_id, graph_storage)
274
- return node_cache[node_id]
275
-
276
- for i, (node_name, _) in enumerate(nodes):
277
- node_dict[node_name] = i
278
-
279
- if traverse_strategy["loss_strategy"] == "both":
280
- er_tuples = [
281
- ([nodes[node_dict[edge[0]]], nodes[node_dict[edge[1]]]], edge)
282
- for edge in edges
283
- ]
284
- edges = _sort_tuples(er_tuples, edge_sampling)
285
- elif traverse_strategy["loss_strategy"] == "only_edge":
286
- edges = _sort_edges(edges, edge_sampling)
287
- else:
288
- raise ValueError(f"Invalid loss strategy: {traverse_strategy['loss_strategy']}")
289
-
290
- for i, (src, tgt, _) in enumerate(edges):
291
- edge_adj_list[src].append(i)
292
- edge_adj_list[tgt].append(i)
293
-
294
- for edge in tqdm_async(edges, desc="Preparing batches"):
295
- if "visited" in edge[2] and edge[2]["visited"]:
296
- continue
297
-
298
- edge[2]["visited"] = True
299
-
300
- _process_nodes = []
301
- _process_edges = []
302
-
303
- src_id = edge[0]
304
- tgt_id = edge[1]
305
-
306
- _process_nodes.extend(
307
- [await get_cached_node_info(src_id), await get_cached_node_info(tgt_id)]
308
- )
309
- _process_edges.append(edge)
310
-
311
- if expand_method == "max_width":
312
- level_n_edges = _get_level_n_edges_by_max_width(
313
- edge_adj_list,
314
- node_dict,
315
- edges,
316
- nodes,
317
- edge,
318
- max_depth,
319
- traverse_strategy["bidirectional"],
320
- traverse_strategy["max_extra_edges"],
321
- edge_sampling,
322
- traverse_strategy["loss_strategy"],
323
- )
324
- else:
325
- level_n_edges = _get_level_n_edges_by_max_tokens(
326
- edge_adj_list,
327
- node_dict,
328
- edges,
329
- nodes,
330
- edge,
331
- max_depth,
332
- traverse_strategy["bidirectional"],
333
- traverse_strategy["max_tokens"],
334
- edge_sampling,
335
- traverse_strategy["loss_strategy"],
336
- )
337
-
338
- for _edge in level_n_edges:
339
- _process_nodes.append(await get_cached_node_info(_edge[0]))
340
- _process_nodes.append(await get_cached_node_info(_edge[1]))
341
- _process_edges.append(_edge)
342
-
343
- # 去重
344
- _process_nodes = list(
345
- {node["node_id"]: node for node in _process_nodes}.values()
346
- )
347
- _process_edges = list(
348
- {(edge[0], edge[1]): edge for edge in _process_edges}.values()
349
- )
350
-
351
- processing_batches.append((_process_nodes, _process_edges))
352
-
353
- logger.info("Processing batches: %d", len(processing_batches))
354
-
355
- # isolate nodes
356
- isolated_node_strategy = traverse_strategy["isolated_node_strategy"]
357
- if isolated_node_strategy == "add":
358
- processing_batches = await _add_isolated_nodes(
359
- nodes, processing_batches, graph_storage
360
- )
361
- logger.info(
362
- "Processing batches after adding isolated nodes: %d",
363
- len(processing_batches),
364
- )
365
-
366
- return processing_batches
367
-
368
-
369
- async def _add_isolated_nodes(
370
- nodes: list,
371
- processing_batches: list,
372
- graph_storage: NetworkXStorage,
373
- ) -> list:
374
- visited_nodes = set()
375
- for _process_nodes, _process_edges in processing_batches:
376
- for node in _process_nodes:
377
- visited_nodes.add(node["node_id"])
378
- for node in nodes:
379
- if node[0] not in visited_nodes:
380
- _process_nodes = [await _get_node_info(node[0], graph_storage)]
381
- processing_batches.append((_process_nodes, []))
382
- return processing_batches
 
 
 
 
graphgen/operators/generate/__init__.py CHANGED
@@ -0,0 +1 @@
 
 
1
+ from .generate_qas import generate_qas
graphgen/operators/generate/generate_cot.py DELETED
@@ -1,117 +0,0 @@
1
- import asyncio
2
- from typing import Dict, List, Tuple
3
-
4
- from tqdm.asyncio import tqdm as tqdm_async
5
-
6
- from graphgen.models import CommunityDetector, NetworkXStorage, OpenAIClient
7
- from graphgen.templates import COT_GENERATION_PROMPT, COT_TEMPLATE_DESIGN_PROMPT
8
- from graphgen.utils import compute_content_hash, detect_main_language
9
-
10
-
11
- async def generate_cot(
12
- graph_storage: NetworkXStorage,
13
- synthesizer_llm_client: OpenAIClient,
14
- method_params: Dict = None,
15
- ):
16
- method = method_params.get("method", "leiden")
17
- detector = CommunityDetector(
18
- graph_storage=graph_storage, method=method, method_params=method_params
19
- )
20
-
21
- results = await detector.detect_communities()
22
-
23
- # Convert results to a format suitable for summarization
24
- communities = {}
25
- for node, community_id in results.items():
26
- if community_id not in communities:
27
- communities[community_id] = []
28
- communities[community_id].append(node)
29
-
30
- if not communities:
31
- return {}
32
-
33
- semaphore = asyncio.Semaphore(value=1000)
34
-
35
- async def _generate_from_single_community(
36
- c_id: int, nodes: List[str]
37
- ) -> Tuple[int, Tuple[str, str, str]]:
38
- """Summarize a single community."""
39
- async with semaphore:
40
- entities: List[str] = []
41
- relationships: List[str] = []
42
-
43
- for n in nodes:
44
- node_data = await graph_storage.get_node(n)
45
- if node_data is not None:
46
- entities.append(f"({n}: {node_data.get('description')})")
47
-
48
- edges = await graph_storage.get_node_edges(n)
49
- for edge in edges:
50
- target = edge[1]
51
- if target in nodes:
52
- edge_data = await graph_storage.get_edge(n, target)
53
- relationships.append(
54
- f"({n}) - [{edge_data['description']}] -> ({target})"
55
- )
56
-
57
- entities_str = "\n".join(entities)
58
- relationships_str = "\n".join(relationships)
59
-
60
- language = (
61
- "English"
62
- if detect_main_language(entities_str + relationships_str) == "en"
63
- else "Chinese"
64
- )
65
-
66
- prompt = COT_TEMPLATE_DESIGN_PROMPT[language]["TEMPLATE"].format(
67
- entities=entities_str,
68
- relationships=relationships_str,
69
- )
70
-
71
- cot_template = await synthesizer_llm_client.generate_answer(prompt)
72
-
73
- if "问题:" in cot_template and "推理路径设计:" in cot_template:
74
- question = cot_template.split("问题:")[1].split("推理路径设计:")[0].strip()
75
- reasoning_path = cot_template.split("推理路径设计:")[1].strip()
76
- elif (
77
- "Question:" in cot_template and "Reasoning-Path Design:" in cot_template
78
- ):
79
- question = (
80
- cot_template.split("Question:")[1]
81
- .split("Reasoning-Path Design:")[0]
82
- .strip()
83
- )
84
- reasoning_path = cot_template.split("Reasoning-Path Design:")[1].strip()
85
- else:
86
- raise ValueError("COT template format is incorrect.")
87
-
88
- prompt = COT_GENERATION_PROMPT[language]["TEMPLATE"].format(
89
- entities=entities_str,
90
- relationships=relationships_str,
91
- question=question,
92
- reasoning_template=reasoning_path,
93
- )
94
-
95
- cot_answer = await synthesizer_llm_client.generate_answer(prompt)
96
-
97
- return c_id, (question, reasoning_path, cot_answer)
98
-
99
- cid_nodes = list(communities.items())
100
-
101
- results: Dict = {}
102
- async for coro in tqdm_async(
103
- asyncio.as_completed(
104
- [_generate_from_single_community(cid, nodes) for cid, nodes in cid_nodes]
105
- ),
106
- total=len(cid_nodes),
107
- desc="[Generating COT] Generating CoT data from communities",
108
- unit="community",
109
- ):
110
- cid, (q, r, a) = await coro
111
- results[compute_content_hash(q)] = {
112
- "question": q,
113
- "reasoning_path": r,
114
- "answer": a,
115
- }
116
-
117
- return results
 
 
 
 
graphgen/operators/generate/generate_qas.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
1
+ from typing import Any
2
+
3
+ from graphgen.bases import BaseLLMClient
4
+ from graphgen.models import (
5
+ AggregatedGenerator,
6
+ AtomicGenerator,
7
+ CoTGenerator,
8
+ MultiHopGenerator,
9
+ )
10
+ from graphgen.utils import logger, run_concurrent
11
+
12
+
13
+ async def generate_qas(
14
+ llm_client: BaseLLMClient,
15
+ batches: list[
16
+ tuple[
17
+ list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]]
18
+ ]
19
+ ],
20
+ generation_config: dict,
21
+ ) -> list[dict[str, Any]]:
22
+ """
23
+ Generate question-answer pairs based on nodes and edges.
24
+ :param llm_client: LLM client
25
+ :param batches: list of (nodes, edges) batches produced by the partitioner
26
+ :param generation_config: generation settings, including "mode" and "data_format"
27
+ :return: QA pairs
28
+ """
29
+ mode = generation_config["mode"]
30
+ logger.info("[Generation] mode: %s, batches: %d", mode, len(batches))
31
+
32
+ if mode == "atomic":
33
+ generator = AtomicGenerator(llm_client)
34
+ elif mode == "aggregated":
35
+ generator = AggregatedGenerator(llm_client)
36
+ elif mode == "multi_hop":
37
+ generator = MultiHopGenerator(llm_client)
38
+ elif mode == "cot":
39
+ generator = CoTGenerator(llm_client)
40
+ else:
41
+ raise ValueError(f"Unsupported generation mode: {mode}")
42
+
43
+ results = await run_concurrent(
44
+ generator.generate,
45
+ batches,
46
+ desc="[4/4]Generating QAs",
47
+ unit="batch",
48
+ )
49
+
50
+ # format
51
+ data_format = generation_config["data_format"]
52
+ logger.info("Output data format: %s", data_format)
53
+
54
+ results = generator.format_generation_results(
55
+ results, output_data_format=data_format
56
+ )
57
+
58
+ return results
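A small sketch of the same mode dispatch written as a lookup table instead of an if/elif chain (assumes the generator classes are importable exactly as above; functionally equivalent):

from graphgen.models import (
    AggregatedGenerator,
    AtomicGenerator,
    CoTGenerator,
    MultiHopGenerator,
)

_GENERATORS = {
    "atomic": AtomicGenerator,
    "aggregated": AggregatedGenerator,
    "multi_hop": MultiHopGenerator,
    "cot": CoTGenerator,
}

def build_generator(mode: str, llm_client):
    """Map a generation-config mode onto its generator class."""
    try:
        return _GENERATORS[mode](llm_client)
    except KeyError as exc:
        raise ValueError(f"Unsupported generation mode: {mode}") from exc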
graphgen/operators/partition/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .partition_kg import partition_kg
graphgen/operators/partition/partition_kg.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
1
+ from typing import Any
2
+
3
+ from graphgen.bases import BaseGraphStorage, BaseTokenizer
4
+ from graphgen.models import (
5
+ BFSPartitioner,
6
+ DFSPartitioner,
7
+ ECEPartitioner,
8
+ LeidenPartitioner,
9
+ )
10
+ from graphgen.utils import logger
11
+
12
+ from .pre_tokenize import pre_tokenize
13
+
14
+
15
+ async def partition_kg(
16
+ kg_instance: BaseGraphStorage,
17
+ tokenizer: BaseTokenizer = None,
18
+ partition_config: dict = None,
19
+ ) -> list[
20
+ tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]]]
21
+ ]:
22
+ method = partition_config["method"]
23
+ method_params = partition_config["method_params"]
24
+ if method == "bfs":
25
+ logger.info("Partitioning knowledge graph using BFS method.")
26
+ partitioner = BFSPartitioner()
27
+ elif method == "dfs":
28
+ logger.info("Partitioning knowledge graph using DFS method.")
29
+ partitioner = DFSPartitioner()
30
+ elif method == "ece":
31
+ logger.info("Partitioning knowledge graph using ECE method.")
32
+ # TODO: before ECE partitioning, we need to:
33
+ # 1. 'quiz and judge' to get the comprehension loss if unit_sampling is not random
34
+ # 2. pre-tokenize nodes and edges to get the token length
35
+ edges = await kg_instance.get_all_edges()
36
+ nodes = await kg_instance.get_all_nodes()
37
+ await pre_tokenize(kg_instance, tokenizer, edges, nodes)
38
+ partitioner = ECEPartitioner()
39
+ elif method == "leiden":
40
+ logger.info("Partitioning knowledge graph using Leiden method.")
41
+ partitioner = LeidenPartitioner()
42
+ else:
43
+ raise ValueError(f"Unsupported partition method: {method}")
44
+
45
+ communities = await partitioner.partition(g=kg_instance, **method_params)
46
+ logger.info("Partitioned the graph into %d communities.", len(communities))
47
+ batches = await partitioner.community2batch(communities, g=kg_instance)
48
+ return batches
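For reference, the shape of the partition_config expected here, shown as a Python dict. The values are illustrative only; the Leiden parameters mirror the defaults in LeidenPartitioner.partition, and "method" accepts bfs / dfs / ece / leiden:

partition_config = {
    "method": "leiden",
    "method_params": {
        "max_size": 20,        # split communities larger than this many nodes
        "use_lcc": False,      # keep all connected components
        "random_seed": 42,
    },
}
# batches = await partition_kg(kg_instance, tokenizer, partition_config)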
graphgen/operators/partition/pre_tokenize.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
1
+ import asyncio
2
+ from typing import List, Tuple
3
+
4
+ from graphgen.bases import BaseGraphStorage, BaseTokenizer
5
+ from graphgen.utils import run_concurrent
6
+
7
+
8
+ async def pre_tokenize(
9
+ graph_storage: BaseGraphStorage,
10
+ tokenizer: BaseTokenizer,
11
+ edges: List[Tuple],
12
+ nodes: List[Tuple],
13
+ ) -> Tuple[List, List]:
14
+ """为 edges/nodes 补 token-length 并回写存储,并发 1000,带进度条。"""
15
+ sem = asyncio.Semaphore(1000)
16
+
17
+ async def _patch_and_write(obj: Tuple, *, is_node: bool) -> Tuple:
18
+ async with sem:
19
+ data = obj[1] if is_node else obj[2]
20
+ if "length" not in data:
21
+ loop = asyncio.get_event_loop()
22
+ data["length"] = len(
23
+ await loop.run_in_executor(
24
+ None, tokenizer.encode, data["description"]
25
+ )
26
+ )
27
+ if is_node:
28
+ await graph_storage.update_node(obj[0], obj[1])
29
+ else:
30
+ await graph_storage.update_edge(obj[0], obj[1], obj[2])
31
+ return obj
32
+
33
+ new_edges, new_nodes = await asyncio.gather(
34
+ run_concurrent(
35
+ lambda e: _patch_and_write(e, is_node=False),
36
+ edges,
37
+ desc="Pre-tokenizing edges",
38
+ ),
39
+ run_concurrent(
40
+ lambda n: _patch_and_write(n, is_node=True),
41
+ nodes,
42
+ desc="Pre-tokenizing nodes",
43
+ ),
44
+ )
45
+
46
+ await graph_storage.index_done_callback()
47
+ return new_edges, new_nodes
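A dependency-free sketch of the patch step above: attach a "length" field (token count of the description) to a node/edge record, here with a whitespace split standing in for the real BaseTokenizer.encode (all names are illustrative; the real code also writes the patched records back to graph storage):

node = ("Entity-A", {"description": "battery electrode material"})
edge = ("Entity-A", "Entity-B", {"description": "is used in"})

def fake_encode(text: str) -> list:
    return text.split()  # stand-in for tokenizer.encode

for record in (node[1], edge[2]):
    if "length" not in record:
        record["length"] = len(fake_encode(record["description"]))

print(node[1]["length"], edge[2]["length"])  # token counts later consumed by the ECE partitioner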
graphgen/operators/search/__init__.py CHANGED
@@ -0,0 +1 @@
 
 
1
+ from .search_all import search_all
graphgen/operators/traverse_graph.py DELETED
@@ -1,540 +0,0 @@
1
- import asyncio
2
- from typing import Dict
3
-
4
- import gradio as gr
5
- from tqdm.asyncio import tqdm as tqdm_async
6
-
7
- from graphgen.models import JsonKVStorage, NetworkXStorage, OpenAIClient, Tokenizer
8
- from graphgen.operators.build_kg.split_kg import get_batches_with_strategy
9
- from graphgen.templates import (
10
- ANSWER_REPHRASING_PROMPT,
11
- MULTI_HOP_GENERATION_PROMPT,
12
- QUESTION_GENERATION_PROMPT,
13
- )
14
- from graphgen.utils import compute_content_hash, detect_main_language, logger
15
-
16
-
17
- async def _pre_tokenize(
18
- graph_storage: NetworkXStorage, tokenizer: Tokenizer, edges: list, nodes: list
19
- ) -> tuple:
20
-
21
- sem = asyncio.Semaphore(1000)
22
-
23
- async def handle_edge(edge: tuple) -> tuple:
24
- async with sem:
25
- if "length" not in edge[2]:
26
- edge[2]["length"] = len(
27
- await asyncio.get_event_loop().run_in_executor(
28
- None, tokenizer.encode, edge[2]["description"]
29
- )
30
- )
31
- return edge
32
-
33
- async def handle_node(node: dict) -> dict:
34
- async with sem:
35
- if "length" not in node[1]:
36
- node[1]["length"] = len(
37
- await asyncio.get_event_loop().run_in_executor(
38
- None, tokenizer.encode, node[1]["description"]
39
- )
40
- )
41
- return node
42
-
43
- new_edges = []
44
- new_nodes = []
45
-
46
- for result in tqdm_async(
47
- asyncio.as_completed([handle_edge(edge) for edge in edges]),
48
- total=len(edges),
49
- desc="Pre-tokenizing edges",
50
- ):
51
- new_edge = await result
52
- await graph_storage.update_edge(new_edge[0], new_edge[1], new_edge[2])
53
- new_edges.append(new_edge)
54
-
55
- for result in tqdm_async(
56
- asyncio.as_completed([handle_node(node) for node in nodes]),
57
- total=len(nodes),
58
- desc="Pre-tokenizing nodes",
59
- ):
60
- new_node = await result
61
- await graph_storage.update_node(new_node[0], new_node[1])
62
- new_nodes.append(new_node)
63
-
64
- await graph_storage.index_done_callback()
65
- return new_edges, new_nodes
66
-
67
-
68
- async def _construct_rephrasing_prompt(
69
- _process_nodes: list,
70
- _process_edges: list,
71
- text_chunks_storage: JsonKVStorage,
72
- add_context: bool = False,
73
- ) -> str:
74
- entities = [
75
- f"{_process_node['node_id']}: {_process_node['description']}"
76
- for _process_node in _process_nodes
77
- ]
78
- relations = [
79
- f"{_process_edge[0]} -- {_process_edge[1]}: {_process_edge[2]['description']}"
80
- for _process_edge in _process_edges
81
- ]
82
-
83
- entities_str = "\n".join(
84
- [f"{index + 1}. {entity}" for index, entity in enumerate(entities)]
85
- )
86
- relations_str = "\n".join(
87
- [f"{index + 1}. {relation}" for index, relation in enumerate(relations)]
88
- )
89
- language = (
90
- "Chinese"
91
- if detect_main_language(entities_str + relations_str) == "zh"
92
- else "English"
93
- )
94
-
95
- if add_context:
96
- original_ids = [
97
- node["source_id"].split("<SEP>")[0] for node in _process_nodes
98
- ] + [edge[2]["source_id"].split("<SEP>")[0] for edge in _process_edges]
99
-
100
- original_ids = list(set(original_ids))
101
- original_text = await text_chunks_storage.get_by_ids(original_ids)
102
- original_text = "\n".join(
103
- [
104
- f"{index + 1}. {text['content']}"
105
- for index, text in enumerate(original_text)
106
- ]
107
- )
108
-
109
- prompt = ANSWER_REPHRASING_PROMPT[language]["CONTEXT_TEMPLATE"].format(
110
- language=language,
111
- original_text=original_text,
112
- entities=entities_str,
113
- relationships=relations_str,
114
- )
115
- return prompt
116
-
117
- prompt = ANSWER_REPHRASING_PROMPT[language]["TEMPLATE"].format(
118
- language=language, entities=entities_str, relationships=relations_str
119
- )
120
- return prompt
121
-
122
-
123
- def get_average_loss(batch: tuple, loss_strategy: str) -> float:
124
- try:
125
- if loss_strategy == "only_edge":
126
- return sum(edge[2]["loss"] for edge in batch[1]) / len(batch[1])
127
- if loss_strategy == "both":
128
- return sum(edge[2]["loss"] for edge in batch[1]) + sum(
129
- node["loss"] for node in batch[0]
130
- ) / (len(batch[0]) + len(batch[1]))
131
- raise ValueError("Invalid loss strategy")
132
- except Exception as e: # pylint: disable=broad-except
133
- logger.warning(
134
- "Loss not found in some nodes or edges, setting loss to -1.0: %s", e
135
- )
136
- return -1.0
137
-
138
-
139
- def _post_process_synthetic_data(data):
140
- block = data.split("\n\n")
141
- qas = []
142
- for line in block:
143
- if "Question:" in line and "Answer:" in line:
144
- question = line.split("Question:")[1].split("Answer:")[0].strip()
145
- answer = line.split("Answer:")[1].strip()
146
- qas.append({"question": question, "answer": answer})
147
- elif "问题:" in line and "答案:" in line:
148
- question = line.split("问题:")[1].split("答案:")[0].strip()
149
- answer = line.split("答案:")[1].strip()
150
- qas.append({"question": question, "answer": answer})
151
- elif "问题:" in line and "回答:" in line:
152
- question = line.split("问题:")[1].split("回答:")[0].strip()
153
- answer = line.split("回答:")[1].strip()
154
- qas.append({"question": question, "answer": answer})
155
- return qas
156
-
157
-
158
- async def traverse_graph_for_aggregated(
159
- llm_client: OpenAIClient,
160
- tokenizer: Tokenizer,
161
- graph_storage: NetworkXStorage,
162
- traverse_strategy: Dict,
163
- text_chunks_storage: JsonKVStorage,
164
- progress_bar: gr.Progress = None,
165
- max_concurrent: int = 1000,
166
- ) -> dict:
167
- """
168
- Traverse the graph
169
-
170
- :param llm_client
171
- :param tokenizer
172
- :param graph_storage
173
- :param traverse_strategy
174
- :param text_chunks_storage
175
- :param progress_bar
176
- :param max_concurrent
177
- :return: question and answer
178
- """
179
-
180
- semaphore = asyncio.Semaphore(max_concurrent)
181
-
182
- async def _process_nodes_and_edges(
183
- _process_nodes: list,
184
- _process_edges: list,
185
- ) -> str:
186
- prompt = await _construct_rephrasing_prompt(
187
- _process_nodes, _process_edges, text_chunks_storage, add_context=False
188
- )
189
- context = await llm_client.generate_answer(prompt)
190
-
191
- # post-process the context
192
- if context.startswith("Rephrased Text:"):
193
- context = context[len("Rephrased Text:") :].strip()
194
- elif context.startswith("重述文本:"):
195
- context = context[len("重述文本:") :].strip()
196
-
197
- return context
198
-
199
- async def _process_single_batch(
200
- _process_batch: tuple, question_type: str = "single"
201
- ) -> dict:
202
- async with semaphore:
203
- context = await _process_nodes_and_edges(
204
- _process_batch[0],
205
- _process_batch[1],
206
- )
207
-
208
- language = "Chinese" if detect_main_language(context) == "zh" else "English"
209
- pre_length = sum(node["length"] for node in _process_batch[0]) + sum(
210
- edge[2]["length"] for edge in _process_batch[1]
211
- )
212
-
213
- if question_type == "single":
214
- question = await llm_client.generate_answer(
215
- QUESTION_GENERATION_PROMPT[language]["SINGLE_TEMPLATE"].format(
216
- answer=context
217
- )
218
- )
219
- if question.startswith("Question:"):
220
- question = question[len("Question:") :].strip()
221
- elif question.startswith("问题:"):
222
- question = question[len("问题:") :].strip()
223
-
224
- logger.info(
225
- "%d nodes and %d edges processed",
226
- len(_process_batch[0]),
227
- len(_process_batch[1]),
228
- )
229
- logger.info("Pre-length: %s", pre_length)
230
- logger.info("Question: %s", question)
231
- logger.info("Answer: %s", context)
232
-
233
- return {
234
- compute_content_hash(context): {
235
- "question": question,
236
- "answer": context,
237
- "loss": get_average_loss(
238
- _process_batch, traverse_strategy["loss_strategy"]
239
- ),
240
- }
241
- }
242
-
243
- content = await llm_client.generate_answer(
244
- QUESTION_GENERATION_PROMPT[language]["MULTI_TEMPLATE"].format(
245
- doc=context
246
- )
247
- )
248
- qas = _post_process_synthetic_data(content)
249
-
250
- if len(qas) == 0:
251
- logger.error(
252
- "Error occurred while processing batch, question or answer is None"
253
- )
254
- return {}
255
-
256
- final_results = {}
257
- logger.info(
258
- "%d nodes and %d edges processed",
259
- len(_process_batch[0]),
260
- len(_process_batch[1]),
261
- )
262
- logger.info("Pre-length: %s", pre_length)
263
- for qa in qas:
264
- logger.info("Question: %s", qa["question"])
265
- logger.info("Answer: %s", qa["answer"])
266
- final_results[compute_content_hash(qa["question"])] = {
267
- "question": qa["question"],
268
- "answer": qa["answer"],
269
- "loss": get_average_loss(
270
- _process_batch, traverse_strategy["loss_strategy"]
271
- ),
272
- }
273
- return final_results
274
-
275
- results = {}
276
- edges = list(await graph_storage.get_all_edges())
277
- nodes = list(await graph_storage.get_all_nodes())
278
-
279
- edges, nodes = await _pre_tokenize(graph_storage, tokenizer, edges, nodes)
280
-
281
- processing_batches = await get_batches_with_strategy(
282
- nodes, edges, graph_storage, traverse_strategy
283
- )
284
-
285
- for result in tqdm_async(
286
- asyncio.as_completed(
287
- [_process_single_batch(batch) for batch in processing_batches]
288
- ),
289
- total=len(processing_batches),
290
- desc="[4/4]Generating QAs",
291
- ):
292
- try:
293
- if progress_bar is not None:
294
- progress_bar(
295
- len(results) / len(processing_batches), desc="[4/4]Generating QAs"
296
- )
297
- results.update(await result)
298
- if progress_bar is not None and len(results) == len(processing_batches):
299
- progress_bar(1, desc="[4/4]Generating QAs")
300
- except Exception as e: # pylint: disable=broad-except
301
- logger.error("Error occurred while generating QA: %s", e)
302
-
303
- return results
304
-
305
-
306
- # pylint: disable=too-many-branches, too-many-statements
307
- async def traverse_graph_for_atomic(
308
- llm_client: OpenAIClient,
309
- tokenizer: Tokenizer,
310
- graph_storage: NetworkXStorage,
311
- traverse_strategy: Dict,
312
- text_chunks_storage: JsonKVStorage,
313
- progress_bar: gr.Progress = None,
314
- max_concurrent: int = 1000,
315
- ) -> dict:
316
- """
317
- Traverse the graph atomically
318
-
319
- :param llm_client
320
- :param tokenizer
321
- :param graph_storage
322
- :param traverse_strategy
323
- :param text_chunks_storage
324
- :param progress_bar
325
- :param max_concurrent
326
- :return: question and answer
327
- """
328
-
329
- semaphore = asyncio.Semaphore(max_concurrent)
330
-
331
- def _parse_qa(qa: str) -> tuple:
332
- if "Question:" in qa and "Answer:" in qa:
333
- question = qa.split("Question:")[1].split("Answer:")[0].strip()
334
- answer = qa.split("Answer:")[1].strip()
335
- elif "问题:" in qa and "答案:" in qa:
336
- question = qa.split("问题:")[1].split("答案:")[0].strip()
337
- answer = qa.split("答案:")[1].strip()
338
- else:
339
- return None, None
340
- return question.strip('"'), answer.strip('"')
341
-
342
- async def _generate_question(node_or_edge: tuple):
343
- if len(node_or_edge) == 2:
344
- des = node_or_edge[0] + ": " + node_or_edge[1]["description"]
345
- loss = node_or_edge[1]["loss"] if "loss" in node_or_edge[1] else -1.0
346
- else:
347
- des = node_or_edge[2]["description"]
348
- loss = node_or_edge[2]["loss"] if "loss" in node_or_edge[2] else -1.0
349
-
350
- async with semaphore:
351
- try:
352
- language = "Chinese" if detect_main_language(des) == "zh" else "English"
353
-
354
- qa = await llm_client.generate_answer(
355
- QUESTION_GENERATION_PROMPT[language]["SINGLE_QA_TEMPLATE"].format(
356
- doc=des
357
- )
358
- )
359
-
360
- question, answer = _parse_qa(qa)
361
- if question is None or answer is None:
362
- return {}
363
-
364
- question = question.strip('"')
365
- answer = answer.strip('"')
366
-
367
- logger.info("Question: %s", question)
368
- logger.info("Answer: %s", answer)
369
- return {
370
- compute_content_hash(question): {
371
- "question": question,
372
- "answer": answer,
373
- "loss": loss,
374
- }
375
- }
376
- except Exception as e: # pylint: disable=broad-except
377
- logger.error("Error occurred while generating question: %s", e)
378
- return {}
379
-
380
- results = {}
381
- edges = list(await graph_storage.get_all_edges())
382
- nodes = list(await graph_storage.get_all_nodes())
383
-
384
- edges, nodes = await _pre_tokenize(graph_storage, tokenizer, edges, nodes)
385
-
386
- tasks = []
387
- for node in nodes:
388
- if "<SEP>" in node[1]["description"]:
389
- description_list = node[1]["description"].split("<SEP>")
390
- for item in description_list:
391
- tasks.append((node[0], {"description": item}))
392
- if "loss" in node[1]:
393
- tasks[-1][1]["loss"] = node[1]["loss"]
394
- else:
395
- tasks.append((node[0], node[1]))
396
- for edge in edges:
397
- if "<SEP>" in edge[2]["description"]:
398
- description_list = edge[2]["description"].split("<SEP>")
399
- for item in description_list:
400
- tasks.append((edge[0], edge[1], {"description": item}))
401
- if "loss" in edge[2]:
402
- tasks[-1][2]["loss"] = edge[2]["loss"]
403
- else:
404
- tasks.append((edge[0], edge[1], edge[2]))
405
-
406
- for result in tqdm_async(
407
- asyncio.as_completed([_generate_question(task) for task in tasks]),
408
- total=len(tasks),
409
- desc="[4/4]Generating QAs",
410
- ):
411
- try:
412
- if progress_bar is not None:
413
- progress_bar(len(results) / len(tasks), desc="[4/4]Generating QAs")
414
- results.update(await result)
415
- if progress_bar is not None and len(results) == len(tasks):
416
- progress_bar(1, desc="[4/4]Generating QAs")
417
- except Exception as e: # pylint: disable=broad-except
418
- logger.error("Error occurred while generating QA: %s", e)
419
- return results
420
-
421
-
422
- async def traverse_graph_for_multi_hop(
423
- llm_client: OpenAIClient,
424
- tokenizer: Tokenizer,
425
- graph_storage: NetworkXStorage,
426
- traverse_strategy: Dict,
427
- text_chunks_storage: JsonKVStorage,
428
- progress_bar: gr.Progress = None,
429
- max_concurrent: int = 1000,
430
- ) -> dict:
431
- """
432
- Traverse the graph for multi-hop
433
-
434
- :param llm_client
435
- :param tokenizer
436
- :param graph_storage
437
- :param traverse_strategy
438
- :param text_chunks_storage
439
- :param progress_bar
440
- :param max_concurrent
441
- :return: question and answer
442
- """
443
- semaphore = asyncio.Semaphore(max_concurrent)
444
-
445
- results = {}
446
- edges = list(await graph_storage.get_all_edges())
447
- nodes = list(await graph_storage.get_all_nodes())
448
-
449
- edges, nodes = await _pre_tokenize(graph_storage, tokenizer, edges, nodes)
450
-
451
- processing_batches = await get_batches_with_strategy(
452
- nodes, edges, graph_storage, traverse_strategy
453
- )
454
-
455
- async def _process_single_batch(_process_batch: tuple) -> dict:
456
- async with semaphore:
457
- try:
458
- language = (
459
- "Chinese"
460
- if detect_main_language(_process_batch[0][0]["description"]) == "zh"
461
- else "English"
462
- )
463
-
464
- _process_nodes = _process_batch[0]
465
- _process_edges = _process_batch[1]
466
-
467
- entities = [
468
- f"{_process_node['node_id']}: {_process_node['description']}"
469
- for _process_node in _process_nodes
470
- ]
471
-
472
- relations = [
473
- f"{_process_edge[0]} -- {_process_edge[1]}: {_process_edge[2]['description']}"
474
- for _process_edge in _process_edges
475
- ]
476
-
477
- entities_str = "\n".join(
478
- [f"{index + 1}. {entity}" for index, entity in enumerate(entities)]
479
- )
480
- relations_str = "\n".join(
481
- [
482
- f"{index + 1}. {relation}"
483
- for index, relation in enumerate(relations)
484
- ]
485
- )
486
-
487
- prompt = MULTI_HOP_GENERATION_PROMPT[language].format(
488
- entities=entities_str, relationships=relations_str
489
- )
490
-
491
- context = await llm_client.generate_answer(prompt)
492
-
493
- # post-process the context
494
- if "Question:" in context and "Answer:" in context:
495
- question = context.split("Question:")[1].split("Answer:")[0].strip()
496
- answer = context.split("Answer:")[1].strip()
497
- elif "问题:" in context and "答案:" in context:
498
- question = context.split("问题:")[1].split("答案:")[0].strip()
499
- answer = context.split("答案:")[1].strip()
500
- else:
501
- return {}
502
-
503
- question = question.strip('"')
504
- answer = answer.strip('"')
505
-
506
- logger.info("Question: %s", question)
507
- logger.info("Answer: %s", answer)
508
-
509
- return {
510
- compute_content_hash(question): {
511
- "question": question,
512
- "answer": answer,
513
- "loss": get_average_loss(
514
- _process_batch, traverse_strategy["loss_strategy"]
515
- ),
516
- }
517
- }
518
-
519
- except Exception as e: # pylint: disable=broad-except
520
- logger.error("Error occurred while processing batch: %s", e)
521
- return {}
522
-
523
- async for result in tqdm_async(
524
- asyncio.as_completed(
525
- [_process_single_batch(batch) for batch in processing_batches]
526
- ),
527
- total=len(processing_batches),
528
- desc="[4/4]Generating QAs",
529
- ):
530
- try:
531
- if progress_bar is not None:
532
- progress_bar(
533
- len(results) / len(processing_batches), desc="[4/4]Generating QAs"
534
- )
535
- results.update(await result)
536
- if progress_bar is not None and len(results) == len(processing_batches):
537
- progress_bar(1, desc="[4/4]Generating QAs")
538
- except Exception as e: # pylint: disable=broad-except
539
- logger.error("Error occurred while generating QA: %s", e)
540
- return results
 
 
 
 
 
graphgen/templates/__init__.py CHANGED
@@ -1,10 +1,13 @@
1
- from .answer_rephrasing import ANSWER_REPHRASING_PROMPT
2
- from .community import COT_GENERATION_PROMPT, COT_TEMPLATE_DESIGN_PROMPT
3
  from .coreference_resolution import COREFERENCE_RESOLUTION_PROMPT
4
  from .description_rephrasing import DESCRIPTION_REPHRASING_PROMPT
 
 
 
 
 
 
5
  from .kg_extraction import KG_EXTRACTION_PROMPT
6
  from .kg_summarization import KG_SUMMARIZATION_PROMPT
7
- from .multi_hop_generation import MULTI_HOP_GENERATION_PROMPT
8
  from .question_generation import QUESTION_GENERATION_PROMPT
9
  from .search_judgement import SEARCH_JUDGEMENT_PROMPT
10
  from .statement_judgement import STATEMENT_JUDGEMENT_PROMPT
 
 
 
1
  from .coreference_resolution import COREFERENCE_RESOLUTION_PROMPT
2
  from .description_rephrasing import DESCRIPTION_REPHRASING_PROMPT
3
+ from .generation import (
4
+ AGGREGATED_GENERATION_PROMPT,
5
+ ATOMIC_GENERATION_PROMPT,
6
+ COT_GENERATION_PROMPT,
7
+ MULTI_HOP_GENERATION_PROMPT,
8
+ )
9
  from .kg_extraction import KG_EXTRACTION_PROMPT
10
  from .kg_summarization import KG_SUMMARIZATION_PROMPT
 
11
  from .question_generation import QUESTION_GENERATION_PROMPT
12
  from .search_judgement import SEARCH_JUDGEMENT_PROMPT
13
  from .statement_judgement import STATEMENT_JUDGEMENT_PROMPT
graphgen/templates/community/__init__.py DELETED
@@ -1,2 +0,0 @@
1
- from .cot_generation import COT_GENERATION_PROMPT
2
- from .cot_template_design import COT_TEMPLATE_DESIGN_PROMPT
 
 
 
graphgen/templates/community/cot_generation.py DELETED
@@ -1,87 +0,0 @@
1
- TEMPLATE_ZH = """根据给定的知识图谱原始信息及已生成的推理路径,产出一条符合模板要求、可直接用于下游训练或推理的 CoT 数据。\
2
- CoT(Chain-of-Thought,思维链)指在回答复杂问题时,把中间推理步骤一步一步显式写出来,使推理过程透明、可追溯,而不是直接给出最终答案。
3
-
4
- -输入格式-
5
- [Entities:]
6
- (实体名:实体描述)
7
- ...
8
-
9
- [Relationships:]
10
- (来源实体)-[关系描述]->(目标实体)
11
- ...
12
-
13
- [Question and Reasoning Path:]
14
- (问题)
15
- (推理路径)
16
-
17
- -输出要求-
18
- 1. 每一步只完成一个不可分割的子任务,并用自然语言衔接,但是要避免生硬的连接词。
19
- 2. 使用中文。
20
- 3. 不要使用有序列表或编号。
21
- 4. 请直接给出答案,不要生成无关信息。
22
-
23
- -真实数据-
24
- 输入:
25
- [Entities:]:
26
- {entities}
27
-
28
- [Relationships:]:
29
- {relationships}
30
-
31
- [Question:]:
32
- {question}
33
-
34
- [Reasoning_Template:]:
35
- {reasoning_template}
36
-
37
- 输出:
38
-
39
- """
40
-
41
- TEMPLATE_EN = """Given the raw knowledge graph information and the provided reasoning-path, \
42
- produce one Chain-of-Thought (CoT) sample that strictly follows the template \
43
- and can be directly used for downstream training or inference.
44
- CoT (Chain-of-Thought) means that when answering a complex question, the intermediate reasoning steps are \
45
- explicitly written out one by one, making the reasoning process transparent and traceable instead of giving \
46
- only the final answer.
47
-
48
- -Input Format-
49
- [Entities:]:
50
- (ENTITY_NAME: ENTITY_DESCRIPTION)
51
- ...
52
-
53
- [Relationships:]:
54
- (ENTITY_SOURCE)-[RELATIONSHIP_DESCRIPTION]->(ENTITY_TARGET)
55
- ...
56
-
57
- [Question and Reasoning Path:]:
58
- (QUESTION)
59
- (REASONING_PATH)
60
-
61
- -Output Requirements-
62
- 1. Each step completes a single, indivisible sub-task and is naturally connected, avoiding abrupt transition words.
63
- 2. Use English.
64
- 3. Do not use ordered lists or numbering.
65
- 4. Do not generate extraneous information, just provide the answer.
66
-
67
- -Real Data-
68
- Input:
69
- [Entities:]:
70
- {entities}
71
-
72
- [Relationships:]:
73
- {relationships}
74
-
75
- [Question:]:
76
- {question}
77
-
78
- [Reasoning_Template:]:
79
- {reasoning_template}
80
-
81
- Output:
82
- """
83
-
84
- COT_GENERATION_PROMPT = {
85
- "Chinese": {"TEMPLATE": TEMPLATE_ZH},
86
- "English": {"TEMPLATE": TEMPLATE_EN},
87
- }