Spaces:

chenzihong
/

GraphGen

Running

App Files Files Community

github-actions[bot] committed 23 days ago

Commit

8e67692

1 Parent(s): 930dd4f

Auto-sync from demo at Fri Nov 21 06:02:41 UTC 2025

Browse files

Files changed (10) hide show

graphgen/graphgen.py +1 -0
graphgen/models/__init__.py +1 -0
graphgen/models/generator/__init__.py +1 -0
graphgen/models/generator/quiz_generator.py +70 -0
graphgen/operators/__init__.py +1 -2
graphgen/operators/generate/generate_qas.py +3 -1
graphgen/operators/quiz_and_judge/__init__.py +2 -0
graphgen/operators/{judge.py → quiz_and_judge/judge.py} +0 -0
graphgen/operators/{quiz.py → quiz_and_judge/quiz.py} +38 -68
graphgen/templates/description_rephrasing.py +2 -2

graphgen/graphgen.py CHANGED Viewed

@@ -221,6 +221,7 @@ class GraphGen:
             self.graph_storage,
             self.rephrase_storage,
             max_samples,
         )
         # TODO： assert trainee_llm_client is valid before judge

             self.graph_storage,
             self.rephrase_storage,
             max_samples,
+            progress_bar=self.progress_bar,
         )
         # TODO： assert trainee_llm_client is valid before judge

graphgen/models/__init__.py CHANGED Viewed

@@ -4,6 +4,7 @@ from .generator import (
     AtomicGenerator,
     CoTGenerator,
     MultiHopGenerator,
     VQAGenerator,
 )
 from .kg_builder import LightRAGKGBuilder, MMKGBuilder

     AtomicGenerator,
     CoTGenerator,
     MultiHopGenerator,
+    QuizGenerator,
     VQAGenerator,
 )
 from .kg_builder import LightRAGKGBuilder, MMKGBuilder

graphgen/models/generator/__init__.py CHANGED Viewed

@@ -2,4 +2,5 @@ from .aggregated_generator import AggregatedGenerator
 from .atomic_generator import AtomicGenerator
 from .cot_generator import CoTGenerator
 from .multi_hop_generator import MultiHopGenerator
 from .vqa_generator import VQAGenerator

 from .atomic_generator import AtomicGenerator
 from .cot_generator import CoTGenerator
 from .multi_hop_generator import MultiHopGenerator
+from .quiz_generator import QuizGenerator
 from .vqa_generator import VQAGenerator

graphgen/models/generator/quiz_generator.py ADDED Viewed

	@@ -0,0 +1,70 @@

+from typing import Any
+from graphgen.bases import BaseGenerator
+from graphgen.templates import DESCRIPTION_REPHRASING_PROMPT
+from graphgen.utils import detect_main_language, logger
+class QuizGenerator(BaseGenerator):
+    """
+    Quiz Generator rephrases given descriptions into quiz statements (same or opposite meaning).
+    """
+    @staticmethod
+    def build_prompt(
+        batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]]
+    ) -> str:
+        """
+        Build prompt for rephrasing the description.
+        :param batch: A tuple containing (nodes, edges) where nodes/edges
+                      contain description information
+        :return: Prompt string
+        """
+        # Extract description from batch
+        # For quiz generator, we expect a special format where
+        # the description is passed as the first node's description
+        nodes, edges = batch
+        if nodes:
+            description = nodes[0][1].get("description", "")
+            template_type = nodes[0][1].get("template_type", "TEMPLATE")
+        elif edges:
+            description = edges[0][2].get("description", "")
+            template_type = edges[0][2].get("template_type", "TEMPLATE")
+        else:
+            raise ValueError("Batch must contain at least one node or edge with description")
+        return QuizGenerator.build_prompt_for_description(description, template_type)
+    @staticmethod
+    def build_prompt_for_description(description: str, template_type: str = "TEMPLATE") -> str:
+        """
+        Build prompt for rephrasing a single description.
+        :param description: The description to rephrase
+        :param template_type: Either "TEMPLATE" (same meaning) or "ANTI_TEMPLATE" (opposite meaning)
+        :return: Prompt string
+        """
+        language = detect_main_language(description)
+        prompt = DESCRIPTION_REPHRASING_PROMPT[language][template_type].format(
+            input_sentence=description
+        )
+        return prompt
+    @staticmethod
+    def parse_rephrased_text(response: str) -> str:
+        """
+        Parse the rephrased text from the response.
+        :param response: Raw LLM response text
+        :return: The response with surrounding whitespace and double quotes stripped
+        """
+        rephrased_text = response.strip().strip('"')
+        logger.debug("Rephrased Text: %s", rephrased_text)
+        return rephrased_text
+    @staticmethod
+    def parse_response(response: str) -> Any:
+        """
+        Parse the LLM response. For quiz generator, this returns the rephrased text.
+        :param response: LLM response
+        :return: Rephrased text
+        """
+        return QuizGenerator.parse_rephrased_text(response)

graphgen/operators/__init__.py CHANGED Viewed

@@ -2,9 +2,8 @@ from .build_kg import build_kg
 from .extract import extract_info
 from .generate import generate_qas
 from .init import init_llm
-from .judge import judge_statement
 from .partition import partition_kg
-from .quiz import quiz
 from .read import read_files
 from .search import search_all
 from .split import chunk_documents

 from .extract import extract_info
 from .generate import generate_qas
 from .init import init_llm
 from .partition import partition_kg
+from .quiz_and_judge import judge_statement, quiz
 from .read import read_files
 from .search import search_all
 from .split import chunk_documents

graphgen/operators/generate/generate_qas.py CHANGED Viewed

@@ -1,5 +1,7 @@
 from typing import Any
 from graphgen.bases import BaseLLMWrapper
 from graphgen.models import (
     AggregatedGenerator,
@@ -19,7 +21,7 @@ async def generate_qas(
         ]
     ],
     generation_config: dict,
-    progress_bar=None,
 ) -> list[dict[str, Any]]:
     """
     Generate question-answer pairs based on nodes and edges.

 from typing import Any
+import gradio as gr
 from graphgen.bases import BaseLLMWrapper
 from graphgen.models import (
     AggregatedGenerator,
         ]
     ],
     generation_config: dict,
+    progress_bar: gr.Progress = None,
 ) -> list[dict[str, Any]]:
     """
     Generate question-answer pairs based on nodes and edges.

graphgen/operators/quiz_and_judge/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


+from .judge import judge_statement
+from .quiz import quiz

graphgen/operators/{judge.py → quiz_and_judge/judge.py} RENAMED Viewed

File without changes

graphgen/operators/{quiz.py → quiz_and_judge/quiz.py} RENAMED Viewed

@@ -1,12 +1,10 @@
-import asyncio
 from collections import defaultdict
-from tqdm.asyncio import tqdm as tqdm_async
 from graphgen.bases import BaseLLMWrapper
-from graphgen.models import JsonKVStorage, NetworkXStorage
-from graphgen.templates import DESCRIPTION_REPHRASING_PROMPT
-from graphgen.utils import detect_main_language, logger
 async def quiz(
@@ -14,104 +12,76 @@ async def quiz(
     graph_storage: NetworkXStorage,
     rephrase_storage: JsonKVStorage,
     max_samples: int = 1,
-    max_concurrent: int = 1000,
 ) -> JsonKVStorage:
     """
-    Get all edges and quiz them
     :param synth_llm_client: generate statements
     :param graph_storage: graph storage instance
     :param rephrase_storage: rephrase storage instance
     :param max_samples: max samples for each edge
-    :param max_concurrent: max concurrent
     :return:
     """
-    semaphore = asyncio.Semaphore(max_concurrent)
-    async def _process_single_quiz(des: str, prompt: str, gt: str):
-        async with semaphore:
-            try:
-                # 如果在rephrase_storage中已经存在，直接取出
-                descriptions = await rephrase_storage.get_by_id(des)
-                if descriptions:
-                    return None
-                new_description = await synth_llm_client.generate_answer(
-                    prompt, temperature=1
-                )
-                return {des: [(new_description, gt)]}
-            except Exception as e:  # pylint: disable=broad-except
-                logger.error("Error when quizzing description %s: %s", des, e)
-                return None
     edges = await graph_storage.get_all_edges()
     nodes = await graph_storage.get_all_nodes()
     results = defaultdict(list)
-    tasks = []
     for edge in edges:
         edge_data = edge[2]
         description = edge_data["description"]
-        language = "English" if detect_main_language(description) == "en" else "Chinese"
         results[description] = [(description, "yes")]
         for i in range(max_samples):
             if i > 0:
-                tasks.append(
-                    _process_single_quiz(
-                        description,
-                        DESCRIPTION_REPHRASING_PROMPT[language]["TEMPLATE"].format(
-                            input_sentence=description
-                        ),
-                        "yes",
-                    )
-                )
-            tasks.append(
-                _process_single_quiz(
-                    description,
-                    DESCRIPTION_REPHRASING_PROMPT[language]["ANTI_TEMPLATE"].format(
-                        input_sentence=description
-                    ),
-                    "no",
-                )
-            )
     for node in nodes:
         node_data = node[1]
         description = node_data["description"]
-        language = "English" if detect_main_language(description) == "en" else "Chinese"
         results[description] = [(description, "yes")]
         for i in range(max_samples):
             if i > 0:
-                tasks.append(
-                    _process_single_quiz(
-                        description,
-                        DESCRIPTION_REPHRASING_PROMPT[language]["TEMPLATE"].format(
-                            input_sentence=description
-                        ),
-                        "yes",
-                    )
-                )
-            tasks.append(
-                _process_single_quiz(
-                    description,
-                    DESCRIPTION_REPHRASING_PROMPT[language]["ANTI_TEMPLATE"].format(
-                        input_sentence=description
-                    ),
-                    "no",
-                )
-            )
-    for result in tqdm_async(
-        asyncio.as_completed(tasks), total=len(tasks), desc="Quizzing descriptions"
-    ):
-        new_result = await result
         if new_result:
             for key, value in new_result.items():
                 results[key].extend(value)

 from collections import defaultdict
+import gradio as gr
 from graphgen.bases import BaseLLMWrapper
+from graphgen.models import JsonKVStorage, NetworkXStorage, QuizGenerator
+from graphgen.utils import logger, run_concurrent
 async def quiz(
     graph_storage: NetworkXStorage,
     rephrase_storage: JsonKVStorage,
     max_samples: int = 1,
+    progress_bar: gr.Progress = None,
 ) -> JsonKVStorage:
     """
+    Get all edges and nodes and quiz their descriptions using QuizGenerator.
     :param synth_llm_client: generate statements
     :param graph_storage: graph storage instance
     :param rephrase_storage: rephrase storage instance
     :param max_samples: max samples for each edge
+    :param progress_bar: optional Gradio progress tracker passed to run_concurrent
     :return:
     """
+    generator = QuizGenerator(synth_llm_client)
+    async def _process_single_quiz(item: tuple[str, str, str]):
+        description, template_type, gt = item
+        try:
+            # skip descriptions that already have an entry in rephrase_storage
+            descriptions = await rephrase_storage.get_by_id(description)
+            if descriptions:
+                return None
+            prompt = generator.build_prompt_for_description(description, template_type)
+            new_description = await synth_llm_client.generate_answer(
+                prompt, temperature=1
+            )
+            rephrased_text = generator.parse_rephrased_text(new_description)
+            return {description: [(rephrased_text, gt)]}
+        except Exception as e:  # pylint: disable=broad-except
+            logger.error("Error when quizzing description %s: %s", description, e)
+            return None
     edges = await graph_storage.get_all_edges()
     nodes = await graph_storage.get_all_nodes()
     results = defaultdict(list)
+    items = []
     for edge in edges:
         edge_data = edge[2]
         description = edge_data["description"]
         results[description] = [(description, "yes")]
         for i in range(max_samples):
             if i > 0:
+                items.append((description, "TEMPLATE", "yes"))
+            items.append((description, "ANTI_TEMPLATE", "no"))
     for node in nodes:
         node_data = node[1]
         description = node_data["description"]
         results[description] = [(description, "yes")]
         for i in range(max_samples):
             if i > 0:
+                items.append((description, "TEMPLATE", "yes"))
+            items.append((description, "ANTI_TEMPLATE", "no"))
+    quiz_results = await run_concurrent(
+        _process_single_quiz,
+        items,
+        desc="Quizzing descriptions",
+        unit="description",
+        progress_bar=progress_bar,
+    )
+    for new_result in quiz_results:
         if new_result:
             for key, value in new_result.items():
                 results[key].extend(value)

graphgen/templates/description_rephrasing.py CHANGED Viewed

@@ -110,11 +110,11 @@ Output:
 DESCRIPTION_REPHRASING_PROMPT= {
-    "English": {
         "ANTI_TEMPLATE": ANTI_TEMPLATE_EN,
         "TEMPLATE": TEMPLATE_EN
     },
-    "Chinese": {
         "ANTI_TEMPLATE": ANTI_TEMPLATE_ZH,
         "TEMPLATE": TEMPLATE_ZH
     }

 DESCRIPTION_REPHRASING_PROMPT= {
+    "en": {
         "ANTI_TEMPLATE": ANTI_TEMPLATE_EN,
         "TEMPLATE": TEMPLATE_EN
     },
+    "zh": {
         "ANTI_TEMPLATE": ANTI_TEMPLATE_ZH,
         "TEMPLATE": TEMPLATE_ZH
     }