Spaces:
Running
Running
github-actions[bot]
committed on
Commit
·
8e67692
1
Parent(s):
930dd4f
Auto-sync from demo at Fri Nov 21 06:02:41 UTC 2025
Browse files- graphgen/graphgen.py +1 -0
- graphgen/models/__init__.py +1 -0
- graphgen/models/generator/__init__.py +1 -0
- graphgen/models/generator/quiz_generator.py +70 -0
- graphgen/operators/__init__.py +1 -2
- graphgen/operators/generate/generate_qas.py +3 -1
- graphgen/operators/quiz_and_judge/__init__.py +2 -0
- graphgen/operators/{judge.py → quiz_and_judge/judge.py} +0 -0
- graphgen/operators/{quiz.py → quiz_and_judge/quiz.py} +38 -68
- graphgen/templates/description_rephrasing.py +2 -2
graphgen/graphgen.py
CHANGED
|
@@ -221,6 +221,7 @@ class GraphGen:
|
|
| 221 |
self.graph_storage,
|
| 222 |
self.rephrase_storage,
|
| 223 |
max_samples,
|
|
|
|
| 224 |
)
|
| 225 |
|
| 226 |
# TODO: assert trainee_llm_client is valid before judge
|
|
|
|
| 221 |
self.graph_storage,
|
| 222 |
self.rephrase_storage,
|
| 223 |
max_samples,
|
| 224 |
+
progress_bar=self.progress_bar,
|
| 225 |
)
|
| 226 |
|
| 227 |
# TODO: assert trainee_llm_client is valid before judge
|
graphgen/models/__init__.py
CHANGED
|
@@ -4,6 +4,7 @@ from .generator import (
|
|
| 4 |
AtomicGenerator,
|
| 5 |
CoTGenerator,
|
| 6 |
MultiHopGenerator,
|
|
|
|
| 7 |
VQAGenerator,
|
| 8 |
)
|
| 9 |
from .kg_builder import LightRAGKGBuilder, MMKGBuilder
|
|
|
|
| 4 |
AtomicGenerator,
|
| 5 |
CoTGenerator,
|
| 6 |
MultiHopGenerator,
|
| 7 |
+
QuizGenerator,
|
| 8 |
VQAGenerator,
|
| 9 |
)
|
| 10 |
from .kg_builder import LightRAGKGBuilder, MMKGBuilder
|
graphgen/models/generator/__init__.py
CHANGED
|
@@ -2,4 +2,5 @@ from .aggregated_generator import AggregatedGenerator
|
|
| 2 |
from .atomic_generator import AtomicGenerator
|
| 3 |
from .cot_generator import CoTGenerator
|
| 4 |
from .multi_hop_generator import MultiHopGenerator
|
|
|
|
| 5 |
from .vqa_generator import VQAGenerator
|
|
|
|
| 2 |
from .atomic_generator import AtomicGenerator
|
| 3 |
from .cot_generator import CoTGenerator
|
| 4 |
from .multi_hop_generator import MultiHopGenerator
|
| 5 |
+
from .quiz_generator import QuizGenerator
|
| 6 |
from .vqa_generator import VQAGenerator
|
graphgen/models/generator/quiz_generator.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any
|
| 2 |
+
|
| 3 |
+
from graphgen.bases import BaseGenerator
|
| 4 |
+
from graphgen.templates import DESCRIPTION_REPHRASING_PROMPT
|
| 5 |
+
from graphgen.utils import detect_main_language, logger
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class QuizGenerator(BaseGenerator):
    """
    Generator that rephrases a description into a quiz statement.

    All helpers are static: they only build the rephrasing prompt or
    clean up the raw LLM response, and never touch instance state.
    """

    @staticmethod
    def build_prompt(
        batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]]
    ) -> str:
        """
        Build the rephrasing prompt from a (nodes, edges) batch.

        Only the first node is consulted; if there are no nodes, the first
        edge is used instead. The chosen attribute dict supplies
        "description" (default "") and "template_type" (default "TEMPLATE").

        :param batch: A tuple (nodes, edges); node entries carry their
                      attribute dict at index 1, edge entries at index 2
        :return: Prompt string
        :raises ValueError: if both nodes and edges are empty
        """
        node_items, edge_items = batch

        # Nodes take precedence over edges as the source of the description.
        if node_items:
            attrs = node_items[0][1]
        elif edge_items:
            attrs = edge_items[0][2]
        else:
            raise ValueError("Batch must contain at least one node or edge with description")

        text = attrs.get("description", "")
        kind = attrs.get("template_type", "TEMPLATE")
        return QuizGenerator.build_prompt_for_description(text, kind)

    @staticmethod
    def build_prompt_for_description(description: str, template_type: str = "TEMPLATE") -> str:
        """
        Build the rephrasing prompt for one description.

        The prompt template is looked up by the language detected from the
        description and then by the template type.

        :param description: The description to rephrase
        :param template_type: Either "TEMPLATE" (same meaning) or "ANTI_TEMPLATE" (opposite meaning)
        :return: Prompt string
        """
        templates_for_language = DESCRIPTION_REPHRASING_PROMPT[
            detect_main_language(description)
        ]
        return templates_for_language[template_type].format(input_sentence=description)

    @staticmethod
    def parse_rephrased_text(response: str) -> str:
        """
        Extract the rephrased text from a raw response.

        Surrounding whitespace is removed first, then any leading/trailing
        double-quote characters. The result is logged at debug level.

        :param response: Raw LLM response
        :return: Cleaned rephrased text
        """
        trimmed = response.strip()
        cleaned = trimmed.strip('"')
        logger.debug("Rephrased Text: %s", cleaned)
        return cleaned

    @staticmethod
    def parse_response(response: str) -> Any:
        """
        Parse the LLM response; for this generator that is the rephrased text.

        :param response: LLM response
        :return: Rephrased text
        """
        return QuizGenerator.parse_rephrased_text(response)
|
graphgen/operators/__init__.py
CHANGED
|
@@ -2,9 +2,8 @@ from .build_kg import build_kg
|
|
| 2 |
from .extract import extract_info
|
| 3 |
from .generate import generate_qas
|
| 4 |
from .init import init_llm
|
| 5 |
-
from .judge import judge_statement
|
| 6 |
from .partition import partition_kg
|
| 7 |
-
from .
|
| 8 |
from .read import read_files
|
| 9 |
from .search import search_all
|
| 10 |
from .split import chunk_documents
|
|
|
|
| 2 |
from .extract import extract_info
|
| 3 |
from .generate import generate_qas
|
| 4 |
from .init import init_llm
|
|
|
|
| 5 |
from .partition import partition_kg
|
| 6 |
+
from .quiz_and_judge import judge_statement, quiz
|
| 7 |
from .read import read_files
|
| 8 |
from .search import search_all
|
| 9 |
from .split import chunk_documents
|
graphgen/operators/generate/generate_qas.py
CHANGED
|
@@ -1,5 +1,7 @@
|
|
| 1 |
from typing import Any
|
| 2 |
|
|
|
|
|
|
|
| 3 |
from graphgen.bases import BaseLLMWrapper
|
| 4 |
from graphgen.models import (
|
| 5 |
AggregatedGenerator,
|
|
@@ -19,7 +21,7 @@ async def generate_qas(
|
|
| 19 |
]
|
| 20 |
],
|
| 21 |
generation_config: dict,
|
| 22 |
-
progress_bar=None,
|
| 23 |
) -> list[dict[str, Any]]:
|
| 24 |
"""
|
| 25 |
Generate question-answer pairs based on nodes and edges.
|
|
|
|
| 1 |
from typing import Any
|
| 2 |
|
| 3 |
+
import gradio as gr
|
| 4 |
+
|
| 5 |
from graphgen.bases import BaseLLMWrapper
|
| 6 |
from graphgen.models import (
|
| 7 |
AggregatedGenerator,
|
|
|
|
| 21 |
]
|
| 22 |
],
|
| 23 |
generation_config: dict,
|
| 24 |
+
progress_bar: gr.Progress = None,
|
| 25 |
) -> list[dict[str, Any]]:
|
| 26 |
"""
|
| 27 |
Generate question-answer pairs based on nodes and edges.
|
graphgen/operators/quiz_and_judge/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .judge import judge_statement
|
| 2 |
+
from .quiz import quiz
|
graphgen/operators/{judge.py → quiz_and_judge/judge.py}
RENAMED
|
File without changes
|
graphgen/operators/{quiz.py → quiz_and_judge/quiz.py}
RENAMED
|
@@ -1,12 +1,10 @@
|
|
| 1 |
-
import asyncio
|
| 2 |
from collections import defaultdict
|
| 3 |
|
| 4 |
-
|
| 5 |
|
| 6 |
from graphgen.bases import BaseLLMWrapper
|
| 7 |
-
from graphgen.models import JsonKVStorage, NetworkXStorage
|
| 8 |
-
from graphgen.
|
| 9 |
-
from graphgen.utils import detect_main_language, logger
|
| 10 |
|
| 11 |
|
| 12 |
async def quiz(
|
|
@@ -14,104 +12,76 @@ async def quiz(
|
|
| 14 |
graph_storage: NetworkXStorage,
|
| 15 |
rephrase_storage: JsonKVStorage,
|
| 16 |
max_samples: int = 1,
|
| 17 |
-
|
| 18 |
) -> JsonKVStorage:
|
| 19 |
"""
|
| 20 |
-
Get all edges and quiz them
|
| 21 |
|
| 22 |
:param synth_llm_client: generate statements
|
| 23 |
:param graph_storage: graph storage instance
|
| 24 |
:param rephrase_storage: rephrase storage instance
|
| 25 |
:param max_samples: max samples for each edge
|
| 26 |
-
:param
|
| 27 |
:return:
|
| 28 |
"""
|
| 29 |
|
| 30 |
-
|
| 31 |
|
| 32 |
-
async def _process_single_quiz(
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
|
|
|
|
|
|
| 44 |
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
|
| 49 |
edges = await graph_storage.get_all_edges()
|
| 50 |
nodes = await graph_storage.get_all_nodes()
|
| 51 |
|
| 52 |
results = defaultdict(list)
|
| 53 |
-
|
| 54 |
for edge in edges:
|
| 55 |
edge_data = edge[2]
|
| 56 |
-
|
| 57 |
description = edge_data["description"]
|
| 58 |
-
language = "English" if detect_main_language(description) == "en" else "Chinese"
|
| 59 |
|
| 60 |
results[description] = [(description, "yes")]
|
| 61 |
|
| 62 |
for i in range(max_samples):
|
| 63 |
if i > 0:
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
description,
|
| 67 |
-
DESCRIPTION_REPHRASING_PROMPT[language]["TEMPLATE"].format(
|
| 68 |
-
input_sentence=description
|
| 69 |
-
),
|
| 70 |
-
"yes",
|
| 71 |
-
)
|
| 72 |
-
)
|
| 73 |
-
tasks.append(
|
| 74 |
-
_process_single_quiz(
|
| 75 |
-
description,
|
| 76 |
-
DESCRIPTION_REPHRASING_PROMPT[language]["ANTI_TEMPLATE"].format(
|
| 77 |
-
input_sentence=description
|
| 78 |
-
),
|
| 79 |
-
"no",
|
| 80 |
-
)
|
| 81 |
-
)
|
| 82 |
|
| 83 |
for node in nodes:
|
| 84 |
node_data = node[1]
|
| 85 |
description = node_data["description"]
|
| 86 |
-
language = "English" if detect_main_language(description) == "en" else "Chinese"
|
| 87 |
|
| 88 |
results[description] = [(description, "yes")]
|
| 89 |
|
| 90 |
for i in range(max_samples):
|
| 91 |
if i > 0:
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
DESCRIPTION_REPHRASING_PROMPT[language]["ANTI_TEMPLATE"].format(
|
| 105 |
-
input_sentence=description
|
| 106 |
-
),
|
| 107 |
-
"no",
|
| 108 |
-
)
|
| 109 |
-
)
|
| 110 |
-
|
| 111 |
-
for result in tqdm_async(
|
| 112 |
-
asyncio.as_completed(tasks), total=len(tasks), desc="Quizzing descriptions"
|
| 113 |
-
):
|
| 114 |
-
new_result = await result
|
| 115 |
if new_result:
|
| 116 |
for key, value in new_result.items():
|
| 117 |
results[key].extend(value)
|
|
|
|
|
|
|
| 1 |
from collections import defaultdict
|
| 2 |
|
| 3 |
+
import gradio as gr
|
| 4 |
|
| 5 |
from graphgen.bases import BaseLLMWrapper
|
| 6 |
+
from graphgen.models import JsonKVStorage, NetworkXStorage, QuizGenerator
|
| 7 |
+
from graphgen.utils import logger, run_concurrent
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
async def quiz(
|
|
|
|
| 12 |
graph_storage: NetworkXStorage,
|
| 13 |
rephrase_storage: JsonKVStorage,
|
| 14 |
max_samples: int = 1,
|
| 15 |
+
progress_bar: gr.Progress = None,
|
| 16 |
) -> JsonKVStorage:
|
| 17 |
"""
|
| 18 |
+
Get all edges and quiz them using QuizGenerator.
|
| 19 |
|
| 20 |
:param synth_llm_client: generate statements
|
| 21 |
:param graph_storage: graph storage instance
|
| 22 |
:param rephrase_storage: rephrase storage instance
|
| 23 |
:param max_samples: max samples for each edge
|
| 24 |
+
:param progress_bar
|
| 25 |
:return:
|
| 26 |
"""
|
| 27 |
|
| 28 |
+
generator = QuizGenerator(synth_llm_client)
|
| 29 |
|
| 30 |
+
async def _process_single_quiz(item: tuple[str, str, str]):
|
| 31 |
+
description, template_type, gt = item
|
| 32 |
+
try:
|
| 33 |
+
# if rephrase_storage exists already, directly get it
|
| 34 |
+
descriptions = await rephrase_storage.get_by_id(description)
|
| 35 |
+
if descriptions:
|
| 36 |
+
return None
|
| 37 |
|
| 38 |
+
prompt = generator.build_prompt_for_description(description, template_type)
|
| 39 |
+
new_description = await synth_llm_client.generate_answer(
|
| 40 |
+
prompt, temperature=1
|
| 41 |
+
)
|
| 42 |
+
rephrased_text = generator.parse_rephrased_text(new_description)
|
| 43 |
+
return {description: [(rephrased_text, gt)]}
|
| 44 |
|
| 45 |
+
except Exception as e: # pylint: disable=broad-except
|
| 46 |
+
logger.error("Error when quizzing description %s: %s", description, e)
|
| 47 |
+
return None
|
| 48 |
|
| 49 |
edges = await graph_storage.get_all_edges()
|
| 50 |
nodes = await graph_storage.get_all_nodes()
|
| 51 |
|
| 52 |
results = defaultdict(list)
|
| 53 |
+
items = []
|
| 54 |
for edge in edges:
|
| 55 |
edge_data = edge[2]
|
|
|
|
| 56 |
description = edge_data["description"]
|
|
|
|
| 57 |
|
| 58 |
results[description] = [(description, "yes")]
|
| 59 |
|
| 60 |
for i in range(max_samples):
|
| 61 |
if i > 0:
|
| 62 |
+
items.append((description, "TEMPLATE", "yes"))
|
| 63 |
+
items.append((description, "ANTI_TEMPLATE", "no"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
for node in nodes:
|
| 66 |
node_data = node[1]
|
| 67 |
description = node_data["description"]
|
|
|
|
| 68 |
|
| 69 |
results[description] = [(description, "yes")]
|
| 70 |
|
| 71 |
for i in range(max_samples):
|
| 72 |
if i > 0:
|
| 73 |
+
items.append((description, "TEMPLATE", "yes"))
|
| 74 |
+
items.append((description, "ANTI_TEMPLATE", "no"))
|
| 75 |
+
|
| 76 |
+
quiz_results = await run_concurrent(
|
| 77 |
+
_process_single_quiz,
|
| 78 |
+
items,
|
| 79 |
+
desc="Quizzing descriptions",
|
| 80 |
+
unit="description",
|
| 81 |
+
progress_bar=progress_bar,
|
| 82 |
+
)
|
| 83 |
+
|
| 84 |
+
for new_result in quiz_results:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
if new_result:
|
| 86 |
for key, value in new_result.items():
|
| 87 |
results[key].extend(value)
|
graphgen/templates/description_rephrasing.py
CHANGED
|
@@ -110,11 +110,11 @@ Output:
|
|
| 110 |
|
| 111 |
|
| 112 |
DESCRIPTION_REPHRASING_PROMPT= {
|
| 113 |
-
"
|
| 114 |
"ANTI_TEMPLATE": ANTI_TEMPLATE_EN,
|
| 115 |
"TEMPLATE": TEMPLATE_EN
|
| 116 |
},
|
| 117 |
-
"
|
| 118 |
"ANTI_TEMPLATE": ANTI_TEMPLATE_ZH,
|
| 119 |
"TEMPLATE": TEMPLATE_ZH
|
| 120 |
}
|
|
|
|
| 110 |
|
| 111 |
|
| 112 |
DESCRIPTION_REPHRASING_PROMPT= {
|
| 113 |
+
"en": {
|
| 114 |
"ANTI_TEMPLATE": ANTI_TEMPLATE_EN,
|
| 115 |
"TEMPLATE": TEMPLATE_EN
|
| 116 |
},
|
| 117 |
+
"zh": {
|
| 118 |
"ANTI_TEMPLATE": ANTI_TEMPLATE_ZH,
|
| 119 |
"TEMPLATE": TEMPLATE_ZH
|
| 120 |
}
|