Commit 0b9d8c7
github-actions[bot] committed · 1 Parent(s): e4316f1

Auto-sync from demo at Thu Oct 23 11:07:54 UTC 2025
Files changed:

- graphgen/bases/base_reader.py +45 -0
- graphgen/bases/datatypes.py +10 -0
- graphgen/configs/vqa_config.yaml +5 -9
- graphgen/graphgen.py +111 -55
- graphgen/models/__init__.py +2 -1
- graphgen/models/generator/aggregated_generator.py +3 -3
- graphgen/models/generator/atomic_generator.py +2 -2
- graphgen/models/generator/cot_generator.py +3 -3
- graphgen/models/generator/multi_hop_generator.py +2 -2
- graphgen/models/generator/vqa_generator.py +122 -7
- graphgen/models/kg_builder/__init__.py +1 -0
- graphgen/models/kg_builder/light_rag_kg_builder.py +3 -10
- graphgen/models/kg_builder/mm_kg_builder.py +93 -0
- graphgen/models/partitioner/__init__.py +1 -0
- graphgen/models/partitioner/anchor_bfs_partitioner.py +128 -0
- graphgen/models/reader/csv_reader.py +6 -3
- graphgen/models/reader/json_reader.py +2 -2
- graphgen/models/reader/jsonl_reader.py +3 -4
- graphgen/models/reader/pdf_reader.py +1 -3
- graphgen/models/reader/txt_reader.py +1 -1
- graphgen/operators/__init__.py +1 -1
- graphgen/operators/build_kg/__init__.py +2 -1
- graphgen/operators/build_kg/build_mm_kg.py +56 -0
- graphgen/operators/build_kg/{build_kg.py → build_text_kg.py} +1 -1
- graphgen/operators/judge.py +5 -5
- graphgen/operators/partition/partition_kg.py +21 -1
- graphgen/operators/split/split_chunks.py +23 -17
- graphgen/templates/__init__.py +2 -2
- graphgen/templates/generation/__init__.py +1 -0
- graphgen/templates/generation/aggregated_generation.py +4 -4
- graphgen/templates/generation/vqa_generation.py +104 -0
- graphgen/templates/kg/__init__.py +3 -0
- graphgen/templates/{kg_extraction.py → kg/kg_extraction.py} +5 -7
- graphgen/templates/{kg_summarization.py → kg/kg_summarization.py} +4 -9
- graphgen/templates/kg/mm_kg_extraction.py +131 -0
- graphgen/utils/__init__.py +1 -1
- graphgen/utils/detect_lang.py +10 -9
- graphgen/utils/hash.py +16 -0
- graphgen/utils/log.py +9 -4
graphgen/bases/base_reader.py
CHANGED

```diff
@@ -1,6 +1,9 @@
+import os
 from abc import ABC, abstractmethod
 from typing import Any, Dict, List
 
+import requests
+
 
 class BaseReader(ABC):
     """
@@ -18,3 +21,45 @@ class BaseReader(ABC):
         :param file_path: Path to the input file.
         :return: List of dictionaries containing the data.
         """
+
+    @staticmethod
+    def filter(data: List[dict]) -> List[dict]:
+        """
+        Filter out entries with empty or missing text in the specified column.
+
+        :param data: List of dictionaries containing the data.
+        :return: Filtered list of dictionaries.
+        """
+
+        def _image_exists(path_or_url: str, timeout: int = 3) -> bool:
+            """
+            Check if an image exists at the given local path or URL.
+            :param path_or_url: Local file path or remote URL of the image.
+            :param timeout: Timeout for remote URL requests in seconds.
+            :return: True if the image exists, False otherwise.
+            """
+            if not path_or_url:
+                return False
+            if not path_or_url.startswith(("http://", "https://", "ftp://")):
+                path = path_or_url.replace("file://", "", 1)
+                path = os.path.abspath(path)
+                return os.path.isfile(path)
+            try:
+                resp = requests.head(path_or_url, allow_redirects=True, timeout=timeout)
+                return resp.status_code == 200
+            except requests.RequestException:
+                return False
+
+        filtered_data = []
+        for item in data:
+            if item.get("type") == "text":
+                content = item.get("content", "").strip()
+                if content:
+                    filtered_data.append(item)
+            elif item.get("type") in ("image", "table", "equation"):
+                img_path = item.get("img_path")
+                if _image_exists(img_path):
+                    filtered_data.append(item)
+            else:
+                filtered_data.append(item)
+        return filtered_data
```
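The new `filter` hook is the gate every reader now passes its records through: text entries survive only if they carry non-empty content, while image/table/equation entries survive only if their `img_path` actually resolves. A minimal standalone sketch of that gate (local paths only, with the remote `requests` check stubbed out; the sample records are invented for illustration):

```python
import os

def image_exists(path_or_url: str) -> bool:
    # Local-path branch only; the real filter also HEAD-requests http(s)/ftp URLs.
    if not path_or_url:
        return False
    path = os.path.abspath(path_or_url.replace("file://", "", 1))
    return os.path.isfile(path)

records = [
    {"type": "text", "content": "The Eiffel Tower is 324 meters tall."},
    {"type": "text", "content": "   "},                        # dropped: blank text
    {"type": "image", "img_path": "/tmp/does_not_exist.png"},  # dropped: missing file
    {"type": "metadata", "source": "demo"},                    # kept: unknown types pass through
]

kept = []
for item in records:
    if item.get("type") == "text":
        if item.get("content", "").strip():
            kept.append(item)
    elif item.get("type") in ("image", "table", "equation"):
        if image_exists(item.get("img_path")):
            kept.append(item)
    else:
        kept.append(item)

print([r.get("type") for r in kept])  # ['text', 'metadata']
```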
graphgen/bases/datatypes.py
CHANGED

```diff
@@ -7,8 +7,18 @@ from typing import List, Union
 class Chunk:
     id: str
     content: str
+    type: str
     metadata: dict = field(default_factory=dict)
 
+    @staticmethod
+    def from_dict(key: str, data: dict) -> "Chunk":
+        return Chunk(
+            id=key,
+            content=data.get("content", ""),
+            type=data.get("type", "unknown"),
+            metadata={k: v for k, v in data.items() if k != "content"},
+        )
+
 
 @dataclass
 class QAPair:
```
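`Chunk.from_dict` folds everything except `content` into `metadata`, so type-specific fields like `img_path` or captions ride along without widening the dataclass. A self-contained sketch of the same pattern (the `Chunk` below mirrors the one in `graphgen/bases/datatypes.py` but is redefined here so the snippet runs on its own):

```python
from dataclasses import dataclass, field

@dataclass
class Chunk:
    id: str
    content: str
    type: str
    metadata: dict = field(default_factory=dict)

    @staticmethod
    def from_dict(key: str, data: dict) -> "Chunk":
        # Everything but "content" (including "type") is kept in metadata.
        return Chunk(
            id=key,
            content=data.get("content", ""),
            type=data.get("type", "unknown"),
            metadata={k: v for k, v in data.items() if k != "content"},
        )

raw = {"type": "image", "img_path": "figs/tower.png", "image_caption": ["Eiffel Tower"]}
chunk = Chunk.from_dict("image-abc123", raw)
print(chunk.type)                   # image
print(chunk.metadata["img_path"])   # figs/tower.png
```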
graphgen/configs/vqa_config.yaml
CHANGED

```diff
@@ -1,5 +1,5 @@
 read:
-  input_file: resources/input_examples/
+  input_file: resources/input_examples/vqa_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
 split:
   chunk_size: 1024 # chunk size for text splitting
   chunk_overlap: 100 # chunk overlap for text splitting
@@ -7,16 +7,12 @@ search: # web search configuration
   enabled: false # whether to enable web search
   search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
 quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
-  enabled:
-  quiz_samples: 2 # number of quiz samples to generate
-  re_judge: false # whether to re-judge the existing quiz samples
+  enabled: false
 partition: # graph partition configuration
-  method:
+  method: anchor_bfs # partition method
   method_params:
-
-
-    max_tokens_per_community: 10240 # max tokens per community
-    unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss
+    anchor_type: image # node type to select anchor nodes
+    max_units_per_community: 10 # atomic partition, one node or edge per community
 generate:
   mode: vqa # atomic, aggregated, multi_hop, cot, vqa
   data_format: ChatML # Alpaca, Sharegpt, ChatML
```
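The config is plain YAML, so the dispatch that consumes it reduces to dictionary lookups. A sketch of how such a config might be loaded and routed (inline YAML and `pyyaml` are assumptions of this example; the field names are copied from the file above):

```python
import yaml  # pip install pyyaml

config_text = """
partition:
  method: anchor_bfs
  method_params:
    anchor_type: image
    max_units_per_community: 10
"""

config = yaml.safe_load(config_text)
partition = config["partition"]
method = partition["method"]
params = partition.get("method_params", {})

if method == "anchor_bfs":
    print("anchor type:", params["anchor_type"])
    print("unit cap:", params["max_units_per_community"])
else:
    raise ValueError(f"Unsupported partition method: {method}")
```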
graphgen/graphgen.py
CHANGED

```diff
@@ -16,7 +16,8 @@ from graphgen.models import (
     Tokenizer,
 )
 from graphgen.operators import (
-    build_kg,
+    build_mm_kg,
+    build_text_kg,
     chunk_documents,
     generate_qas,
     judge_statement,
@@ -25,7 +26,7 @@ from graphgen.operators import (
     read_files,
     search_all,
 )
-from graphgen.utils import async_to_sync_method, compute_content_hash, logger
+from graphgen.utils import async_to_sync_method, compute_mm_hash, logger
 
 sys_path = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
 
@@ -68,8 +69,8 @@ class GraphGen:
         self.full_docs_storage: JsonKVStorage = JsonKVStorage(
             self.working_dir, namespace="full_docs"
         )
-        self.text_chunks_storage: JsonKVStorage = JsonKVStorage(
-            self.working_dir, namespace="text_chunks"
+        self.chunks_storage: JsonKVStorage = JsonKVStorage(
+            self.working_dir, namespace="chunks"
         )
         self.graph_storage: NetworkXStorage = NetworkXStorage(
             self.working_dir, namespace="graph"
@@ -96,70 +97,122 @@ class GraphGen:
             logger.warning("No data to process")
             return
 
+        assert isinstance(data, list) and isinstance(data[0], dict)
+
         # TODO: configurable whether to use coreference resolution
 
-        assert isinstance(data, list) and isinstance(data[0], dict)
-        new_docs = {
-            compute_content_hash(doc["content"], prefix="doc-"): {
-                "content": doc["content"]
-            }
-            for doc in data
-            if doc.get("type", "text") == "text"
-        }
+        new_docs = {compute_mm_hash(doc, prefix="doc-"): doc for doc in data}
         _add_doc_keys = await self.full_docs_storage.filter_keys(list(new_docs.keys()))
         new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
+        new_text_docs = {k: v for k, v in new_docs.items() if v.get("type") == "text"}
+        new_mm_docs = {k: v for k, v in new_docs.items() if v.get("type") != "text"}
 
-        if len(new_docs) == 0:
-            logger.warning("All docs are already in the storage")
-            return
-        logger.info("[New Docs] inserting %d docs", len(new_docs))
-        _add_entities_and_relations = await build_kg(
-            llm_client=self.synthesizer_llm_client,
-            kg_instance=self.graph_storage,
-            chunks=[
-                Chunk(id=k, content=v["content"]) for k, v in inserting_chunks.items()
-            ],
-            progress_bar=self.progress_bar,
-        )
-        if not _add_entities_and_relations:
-            logger.warning("No entities or relations extracted")
-            return
+        await self.full_docs_storage.upsert(new_docs)
+
+        async def _insert_text_docs(text_docs):
+            if len(text_docs) == 0:
+                logger.warning("All text docs are already in the storage")
+                return
+            logger.info("[New Docs] inserting %d text docs", len(text_docs))
+            # Step 2.1: Split chunks and filter existing ones
+            inserting_chunks = await chunk_documents(
+                text_docs,
+                split_config["chunk_size"],
+                split_config["chunk_overlap"],
+                self.tokenizer_instance,
+                self.progress_bar,
+            )
+
+            _add_chunk_keys = await self.chunks_storage.filter_keys(
+                list(inserting_chunks.keys())
+            )
+            inserting_chunks = {
+                k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
+            }
+
+            if len(inserting_chunks) == 0:
+                logger.warning("All text chunks are already in the storage")
+                return
+
+            logger.info("[New Chunks] inserting %d text chunks", len(inserting_chunks))
+            await self.chunks_storage.upsert(inserting_chunks)
+
+            # Step 2.2: Extract entities and relations from text chunks
+            logger.info("[Text Entity and Relation Extraction] processing ...")
+            _add_entities_and_relations = await build_text_kg(
+                llm_client=self.synthesizer_llm_client,
+                kg_instance=self.graph_storage,
+                chunks=[
+                    Chunk(id=k, content=v["content"], type="text")
+                    for k, v in inserting_chunks.items()
+                ],
+                progress_bar=self.progress_bar,
+            )
+            if not _add_entities_and_relations:
+                logger.warning("No entities or relations extracted from text chunks")
+                return
+
+            await self._insert_done()
+            return _add_entities_and_relations
+
+        async def _insert_multi_modal_docs(mm_docs):
+            if len(mm_docs) == 0:
+                logger.warning("No multi-modal documents to insert")
+                return
+
+            logger.info("[New Docs] inserting %d multi-modal docs", len(mm_docs))
+
+            # Step 3.1: Transform multi-modal documents into chunks and filter existing ones
+            inserting_chunks = await chunk_documents(
+                mm_docs,
+                split_config["chunk_size"],
+                split_config["chunk_overlap"],
+                self.tokenizer_instance,
+                self.progress_bar,
+            )
+
+            _add_chunk_keys = await self.chunks_storage.filter_keys(
+                list(inserting_chunks.keys())
+            )
+            inserting_chunks = {
+                k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
+            }
+
+            if len(inserting_chunks) == 0:
+                logger.warning("All multi-modal chunks are already in the storage")
+                return
+
+            logger.info(
+                "[New Chunks] inserting %d multimodal chunks", len(inserting_chunks)
+            )
+            await self.chunks_storage.upsert(inserting_chunks)
+
+            # Step 3.2: Extract multi-modal entities and relations from chunks
+            logger.info("[Multi-modal Entity and Relation Extraction] processing ...")
+            _add_entities_and_relations = await build_mm_kg(
+                llm_client=self.synthesizer_llm_client,
+                kg_instance=self.graph_storage,
+                chunks=[Chunk.from_dict(k, v) for k, v in inserting_chunks.items()],
+                progress_bar=self.progress_bar,
+            )
+            if not _add_entities_and_relations:
+                logger.warning(
+                    "No entities or relations extracted from multi-modal chunks"
+                )
+                return
+            await self._insert_done()
+            return _add_entities_and_relations
+
+        # Step 2: Insert text documents
+        await _insert_text_docs(new_text_docs)
+        # Step 3: Insert multi-modal documents
+        await _insert_multi_modal_docs(new_mm_docs)
 
     async def _insert_done(self):
         tasks = []
         for storage_instance in [
             self.full_docs_storage,
-            self.text_chunks_storage,
+            self.chunks_storage,
             self.graph_storage,
             self.search_storage,
         ]:
@@ -233,7 +286,10 @@ class GraphGen:
     async def generate(self, partition_config: Dict, generate_config: Dict):
         # Step 1: partition the graph
         batches = await partition_kg(
-            self.graph_storage, self.tokenizer_instance, partition_config
+            self.graph_storage,
+            self.chunks_storage,
+            self.tokenizer_instance,
+            partition_config,
         )
 
         # Step 2: generate QA pairs
@@ -255,7 +311,7 @@ class GraphGen:
     @async_to_sync_method
     async def clear(self):
        await self.full_docs_storage.drop()
-        await self.text_chunks_storage.drop()
+        await self.chunks_storage.drop()
         await self.search_storage.drop()
         await self.graph_storage.clear()
         await self.rephrase_storage.drop()
```

(Some removed lines in the large hunk did not survive extraction of this page; the removals shown are the ones that could be recovered, and the old storage and import names are reconstructed from the surrounding code.)
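The insert path now keys every document with `compute_mm_hash` before routing it by `type`. The utility's exact behavior is not shown in this diff; the sketch below assumes it hashes a stable serialization of the whole record, which is one reasonable reading, and then demonstrates the text/multi-modal split:

```python
import hashlib
import json

def compute_mm_hash(doc: dict, prefix: str = "") -> str:
    # Assumed behavior: hash a canonical JSON form so records with the same
    # fields always collapse to the same key, regardless of dict order.
    payload = json.dumps(doc, sort_keys=True, ensure_ascii=False)
    return prefix + hashlib.md5(payload.encode("utf-8")).hexdigest()

data = [
    {"type": "text", "content": "The Eiffel Tower is 324 meters tall."},
    {"type": "image", "img_path": "figs/tower.png"},
]

new_docs = {compute_mm_hash(doc, prefix="doc-"): doc for doc in data}
new_text_docs = {k: v for k, v in new_docs.items() if v.get("type") == "text"}
new_mm_docs = {k: v for k, v in new_docs.items() if v.get("type") != "text"}

print(len(new_text_docs), "text /", len(new_mm_docs), "multi-modal")
```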
graphgen/models/__init__.py
CHANGED

```diff
@@ -6,10 +6,11 @@ from .generator import (
     MultiHopGenerator,
     VQAGenerator,
 )
-from .kg_builder import LightRAGKGBuilder
+from .kg_builder import LightRAGKGBuilder, MMKGBuilder
 from .llm.openai_client import OpenAIClient
 from .llm.topk_token_model import TopkTokenModel
 from .partitioner import (
+    AnchorBFSPartitioner,
     BFSPartitioner,
     DFSPartitioner,
     ECEPartitioner,
```
graphgen/models/generator/aggregated_generator.py
CHANGED

```diff
@@ -53,7 +53,7 @@ class AggregatedGenerator(BaseGenerator):
         #     ]
         # )
         prompt = AGGREGATED_GENERATION_PROMPT[language]["ANSWER_REPHRASING"].format(
-            entities=entities_str, relations=relations_str
+            entities=entities_str, relationships=relations_str
         )
         return prompt
 
@@ -115,8 +115,8 @@ class AggregatedGenerator(BaseGenerator):
         question_generation_prompt = self._build_prompt_for_question_generation(context)
         response = await self.llm_client.generate_answer(question_generation_prompt)
         question = self.parse_response(response)["question"]
-        logger.info("Question: %s", question)
-        logger.info("Answer: %s", context)
+        logger.debug("Question: %s", question)
+        logger.debug("Answer: %s", context)
         qa_pairs = {
             compute_content_hash(question): {
                 "question": question,
```

(The removed keyword-argument line and log calls lost their changed fragments during extraction; they are reconstructed here from the added lines, which demote the logs from info to debug.)
graphgen/models/generator/atomic_generator.py
CHANGED

```diff
@@ -42,8 +42,8 @@ class AtomicGenerator(BaseGenerator):
             return {}
         question = question.strip('"')
         answer = answer.strip('"')
-        logger.info("Question: %s", question)
-        logger.info("Answer: %s", answer)
+        logger.debug("Question: %s", question)
+        logger.debug("Answer: %s", answer)
         return {
             compute_content_hash(question): {
                 "question": question,
```
graphgen/models/generator/cot_generator.py
CHANGED

```diff
@@ -85,8 +85,8 @@ class CoTGenerator(BaseGenerator):
 
         question = question.strip('"')
         reasoning_path = reasoning_path.strip('"')
-        logger.info("CoT Question: %s", question)
-        logger.info("CoT Reasoning Path: %s", reasoning_path)
+        logger.debug("CoT Question: %s", question)
+        logger.debug("CoT Reasoning Path: %s", reasoning_path)
         return {
             "question": question,
             "reasoning_path": reasoning_path,
@@ -110,7 +110,7 @@ class CoTGenerator(BaseGenerator):
         question, reasoning_path = response["question"], response["reasoning_path"]
         prompt = self.build_prompt_for_cot_generation(batch, question, reasoning_path)
         cot_answer = await self.llm_client.generate_answer(prompt)
-        logger.info("CoT Answer: %s", cot_answer)
+        logger.debug("CoT Answer: %s", cot_answer)
         qa_pairs = {
             compute_content_hash(question): {
                 "question": question,
```
graphgen/models/generator/multi_hop_generator.py
CHANGED

```diff
@@ -45,8 +45,8 @@ class MultiHopGenerator(BaseGenerator):
             return {}
         question = question.strip('"')
         answer = answer.strip('"')
-        logger.info("Question: %s", question)
-        logger.info("Answer: %s", answer)
+        logger.debug("Question: %s", question)
+        logger.debug("Answer: %s", answer)
         return {
             compute_content_hash(question): {
                 "question": question,
```
graphgen/models/generator/vqa_generator.py
CHANGED

```diff
@@ -2,6 +2,8 @@ from dataclasses import dataclass
 from typing import Any
 
 from graphgen.bases import BaseGenerator
+from graphgen.templates import VQA_GENERATION_PROMPT
+from graphgen.utils import compute_content_hash, detect_main_language, logger
 
 
 @dataclass
@@ -10,14 +12,127 @@ class VQAGenerator(BaseGenerator):
     def build_prompt(
         batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]]
     ) -> str:
+        nodes, edges = batch
+        entities_str = "\n".join(
+            [
+                f"{index + 1}. {node[0]}: {node[1]['description']}"
+                for index, node in enumerate(nodes)
+            ]
         )
 
+        relationships_str = "\n".join(
+            [
+                f"{index + 1}. {edge[0]} -- {edge[1]}: {edge[2]['description']}"
+                for index, edge in enumerate(edges)
+            ]
+        )
+        language = detect_main_language(entities_str + relationships_str)
+        prompt = VQA_GENERATION_PROMPT[language].format(
+            entities=entities_str, relationships=relationships_str
+        )
+        return prompt
+
     @staticmethod
     def parse_response(response: str) -> Any:
+        """
+        Parse the LLM response and return the generated QAs
+        :param response
+        :return: QA pairs
+        """
+        qa_pairs = {}
+        qa_list = response.strip().split("\n\n")
+        for qa in qa_list:
+            if "Question:" in qa and "Answer:" in qa:
+                question = qa.split("Question:")[1].split("Answer:")[0].strip()
+                answer = qa.split("Answer:")[1].strip()
+            elif "问题:" in qa and "答案:" in qa:
+                question = qa.split("问题:")[1].split("答案:")[0].strip()
+                answer = qa.split("答案:")[1].strip()
+            else:
+                logger.error("Failed to parse QA pair: %s", qa)
+                continue
+            question = question.strip('"')
+            answer = answer.strip('"')
+            logger.debug("Question: %s", question)
+            logger.debug("Answer: %s", answer)
+            qa_pairs[compute_content_hash(question)] = {
+                "question": question,
+                "answer": answer,
+            }
+        return qa_pairs
+
+    async def generate(
+        self,
+        batch: tuple[
+            list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]]
+        ],
+    ) -> dict[str, Any]:
+        """
+        Generate QAs based on a given batch.
+        :param batch
+        :return: QA pairs
+        """
+        result = {}
+        prompt = self.build_prompt(batch)
+        response = await self.llm_client.generate_answer(prompt)
+        qa_pairs = self.parse_response(response)  # generate one or more QA pairs
+        nodes, _ = batch
+        for node in nodes:
+            node_data = node[1]
+            if "images" in node_data and node_data["images"]:
+                img_path = node_data["images"]["img_path"]
+                for qa in qa_pairs.values():
+                    qa["img_path"] = img_path
+        result.update(qa_pairs)
+        return result
+
+    @staticmethod
+    def format_generation_results(
+        results: list[dict], output_data_format: str
+    ) -> list[dict[str, Any]]:
+        if output_data_format == "Alpaca":
+            results = [
+                {
+                    "instruction": v["question"],
+                    "input": "",
+                    "output": v["answer"],
+                    "image": v.get("img_path", ""),
+                }
+                for item in results
+                for k, v in item.items()
+            ]
+        elif output_data_format == "Sharegpt":
+            results = [
+                {
+                    "conversations": [
+                        {
+                            "from": "human",
+                            "value": [
+                                {"text": v["question"], "image": v.get("img_path", "")}
+                            ],
+                        },
+                        {"from": "gpt", "value": v["answer"]},
+                    ]
+                }
+                for item in results
+                for k, v in item.items()
+            ]
+        elif output_data_format == "ChatML":
+            results = [
+                {
+                    "messages": [
+                        {
+                            "role": "user",
+                            "content": [
+                                {"text": v["question"], "image": v.get("img_path", "")}
+                            ],
+                        },
+                        {"role": "assistant", "content": v["answer"]},
+                    ]
+                }
+                for item in results
+                for k, v in item.items()
+            ]
+        else:
+            raise ValueError(f"Unknown output data format: {output_data_format}")
+        return results
```
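`parse_response` expects QA pairs separated by blank lines, labeled either in English (`Question:` / `Answer:`) or Chinese (`问题:` / `答案:`). A standalone rerun of that parsing logic on a fabricated response, with the content hash replaced by a plain `hashlib` stand-in so nothing is imported from graphgen:

```python
import hashlib

def content_hash(text: str) -> str:
    # Stand-in for graphgen.utils.compute_content_hash.
    return hashlib.md5(text.encode("utf-8")).hexdigest()

response = (
    "Question: What landmark is shown in the image?\n"
    "Answer: The Eiffel Tower.\n"
    "\n"
    "Question: How tall is it?\n"
    "Answer: 324 meters."
)

qa_pairs = {}
for qa in response.strip().split("\n\n"):
    if "Question:" in qa and "Answer:" in qa:
        question = qa.split("Question:")[1].split("Answer:")[0].strip()
        answer = qa.split("Answer:")[1].strip()
        qa_pairs[content_hash(question)] = {"question": question, "answer": answer}

print(len(qa_pairs), "pairs parsed")  # 2 pairs parsed
```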
graphgen/models/kg_builder/__init__.py
CHANGED

```diff
@@ -1 +1,2 @@
 from .light_rag_kg_builder import LightRAGKGBuilder
+from .mm_kg_builder import MMKGBuilder
```
graphgen/models/kg_builder/light_rag_kg_builder.py
CHANGED

```diff
@@ -6,7 +6,6 @@ from typing import Dict, List, Tuple
 from graphgen.bases import BaseGraphStorage, BaseKGBuilder, BaseLLMClient, Chunk
 from graphgen.templates import KG_EXTRACTION_PROMPT, KG_SUMMARIZATION_PROMPT
 from graphgen.utils import (
-    detect_if_chinese,
     detect_main_language,
     handle_single_entity_extraction,
     handle_single_relationship_extraction,
@@ -33,8 +32,7 @@ class LightRAGKGBuilder(BaseKGBuilder):
         content = chunk.content
 
         # step 1: language_detection
-        language = "Chinese" if detect_if_chinese(content) else "English"
-        KG_EXTRACTION_PROMPT["FORMAT"]["language"] = language
+        language = detect_main_language(content)
 
         hint_prompt = KG_EXTRACTION_PROMPT[language]["TEMPLATE"].format(
             **KG_EXTRACTION_PROMPT["FORMAT"], input_text=content
@@ -42,7 +40,7 @@ class LightRAGKGBuilder(BaseKGBuilder):
 
         # step 2: initial glean
         final_result = await self.llm_client.generate_answer(hint_prompt)
-        logger.info("First extraction result: %s", final_result)
+        logger.debug("First extraction result: %s", final_result)
 
         # step3: iterative refinement
         history = pack_history_conversations(hint_prompt, final_result)
@@ -57,7 +55,7 @@ class LightRAGKGBuilder(BaseKGBuilder):
             glean_result = await self.llm_client.generate_answer(
                 text=KG_EXTRACTION_PROMPT[language]["CONTINUE"], history=history
             )
-            logger.info("Loop %s glean: %s", loop_idx + 1, glean_result)
+            logger.debug("Loop %s glean: %s", loop_idx + 1, glean_result)
 
             history += pack_history_conversations(
                 KG_EXTRACTION_PROMPT[language]["CONTINUE"], glean_result
@@ -201,11 +199,6 @@ class LightRAGKGBuilder(BaseKGBuilder):
 
         tokenizer_instance = self.llm_client.tokenizer
         language = detect_main_language(description)
-        if language == "en":
-            language = "English"
-        else:
-            language = "Chinese"
-        KG_EXTRACTION_PROMPT["FORMAT"]["language"] = language
 
         tokens = tokenizer_instance.encode(description)
         if len(tokens) < max_summary_tokens:
```

(The old language-detection line lost its tail during extraction; it is reconstructed here from the removed `detect_if_chinese` import.)
graphgen/models/kg_builder/mm_kg_builder.py
ADDED

```diff
@@ -0,0 +1,93 @@
+import re
+from collections import defaultdict
+from typing import Dict, List, Tuple
+
+from graphgen.bases import BaseLLMClient, Chunk
+from graphgen.templates import MMKG_EXTRACTION_PROMPT
+from graphgen.utils import (
+    detect_main_language,
+    handle_single_entity_extraction,
+    handle_single_relationship_extraction,
+    logger,
+    split_string_by_multi_markers,
+)
+
+from .light_rag_kg_builder import LightRAGKGBuilder
+
+
+class MMKGBuilder(LightRAGKGBuilder):
+    llm_client: BaseLLMClient = None
+
+    async def extract(
+        self, chunk: Chunk
+    ) -> Tuple[Dict[str, List[dict]], Dict[Tuple[str, str], List[dict]]]:
+        """
+        Extract entities and relationships from a single multi-modal chunk using the LLM client.
+        Expect to get a mini graph which contains a central multi-modal entity
+        and its related text entities and relationships.
+        Like:
+        (image: "image_of_eiffel_tower") --[located_in]--> (text: "Paris")
+        (image: "image_of_eiffel_tower") --[built_in]--> (text: "1889")
+        (text: "Eiffel Tower") --[height]--> (text: "324 meters")
+        :param chunk
+        """
+        chunk_id = chunk.id
+        chunk_type = chunk.type  # image | table | formula | ...
+        metadata = chunk.metadata
+
+        # choose different extraction strategies based on chunk type
+        if chunk_type == "image":
+            image_caption = "\n".join(metadata.get("image_caption", ""))
+            language = detect_main_language(image_caption)
+            prompt_template = MMKG_EXTRACTION_PROMPT[language].format(
+                **MMKG_EXTRACTION_PROMPT["FORMAT"],
+                chunk_type=chunk_type,
+                chunk_id=chunk_id,
+                chunk_text=image_caption,
+            )
+            result = await self.llm_client.generate_answer(prompt_template)
+            logger.debug("Image chunk extraction result: %s", result)
+
+            # parse the result
+            records = split_string_by_multi_markers(
+                result,
+                [
+                    MMKG_EXTRACTION_PROMPT["FORMAT"]["record_delimiter"],
+                    MMKG_EXTRACTION_PROMPT["FORMAT"]["completion_delimiter"],
+                ],
+            )
+
+            nodes = defaultdict(list)
+            edges = defaultdict(list)
+
+            for record in records:
+                match = re.search(r"\((.*)\)", record)
+                if not match:
+                    continue
+                inner = match.group(1)
+
+                attributes = split_string_by_multi_markers(
+                    inner, [MMKG_EXTRACTION_PROMPT["FORMAT"]["tuple_delimiter"]]
+                )
+
+                entity = await handle_single_entity_extraction(attributes, chunk_id)
+                if entity is not None:
+                    nodes[entity["entity_name"]].append(entity)
+                    continue
+
+                relation = await handle_single_relationship_extraction(
+                    attributes, chunk_id
+                )
+                if relation is not None:
+                    key = (relation["src_id"], relation["tgt_id"])
+                    edges[key].append(relation)
+
+            return dict(nodes), dict(edges)
+
+        if chunk_type == "table":
+            pass  # TODO: implement table-based entity and relationship extraction
+        if chunk_type == "formula":
+            pass  # TODO: implement formula-based entity and relationship extraction
+
+        logger.error("Unsupported chunk type for MMKGBuilder: %s", chunk_type)
+        return defaultdict(list), defaultdict(list)
```
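The extraction loop assumes the LLM emits parenthesized, delimiter-separated records. The actual delimiter strings live in `MMKG_EXTRACTION_PROMPT["FORMAT"]` and are not shown in this diff; the sketch below picks LightRAG-style placeholders (`<|>`, `##`, `<|COMPLETE|>`) purely for illustration of the parsing mechanics:

```python
import re

TUPLE_DELIM, RECORD_DELIM, DONE_DELIM = "<|>", "##", "<|COMPLETE|>"

raw = (
    '("entity"<|>image_of_eiffel_tower<|>image<|>A photo of the Eiffel Tower")##'
    '("relationship"<|>image_of_eiffel_tower<|>Paris<|>located_in")<|COMPLETE|>'
)

# Split on both the record and completion markers, as the builder does.
pattern = re.escape(RECORD_DELIM) + "|" + re.escape(DONE_DELIM)
records = [r for r in re.split(pattern, raw) if r.strip()]

for record in records:
    match = re.search(r"\((.*)\)", record)
    if not match:
        continue
    attributes = match.group(1).split(TUPLE_DELIM)
    kind = attributes[0].strip('"')
    print(kind, "->", attributes[1:])
```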
graphgen/models/partitioner/__init__.py
CHANGED

```diff
@@ -1,3 +1,4 @@
+from .anchor_bfs_partitioner import AnchorBFSPartitioner
 from .bfs_partitioner import BFSPartitioner
 from .dfs_partitioner import DFSPartitioner
 from .ece_partitioner import ECEPartitioner
```
graphgen/models/partitioner/anchor_bfs_partitioner.py
ADDED

```diff
@@ -0,0 +1,128 @@
+import random
+from collections import deque
+from typing import Any, List, Literal, Set, Tuple
+
+from graphgen.bases import BaseGraphStorage
+from graphgen.bases.datatypes import Community
+
+from .bfs_partitioner import BFSPartitioner
+
+NODE_UNIT: str = "n"
+EDGE_UNIT: str = "e"
+
+
+class AnchorBFSPartitioner(BFSPartitioner):
+    """
+    Anchor BFS partitioner that partitions the graph into communities of a fixed size.
+    1. Randomly choose a node of a specified type as the anchor.
+    2. Expand the community using BFS until the max unit size is reached. (A unit is a node or an edge.)
+    3. Non-anchor units can only be "pulled" into a community and never become seeds themselves.
+    For example, for VQA tasks, we may want to use image nodes as anchors and expand to nearby text nodes and edges.
+    """
+
+    def __init__(
+        self,
+        *,
+        anchor_type: Literal["image"] = "image",
+        anchor_ids: Set[str] | None = None,
+    ) -> None:
+        super().__init__()
+        self.anchor_type = anchor_type
+        self.anchor_ids = anchor_ids
+
+    async def partition(
+        self,
+        g: BaseGraphStorage,
+        max_units_per_community: int = 1,
+        **kwargs: Any,
+    ) -> List[Community]:
+        nodes = await g.get_all_nodes()  # List[tuple[id, meta]]
+        edges = await g.get_all_edges()  # List[tuple[u, v, meta]]
+
+        adj, _ = self._build_adjacency_list(nodes, edges)
+
+        anchors: Set[str] = await self._pick_anchor_ids(nodes)
+        if not anchors:
+            return []  # if no anchors, return empty list
+
+        used_n: set[str] = set()
+        used_e: set[frozenset[str]] = set()
+        communities: List[Community] = []
+
+        seeds = list(anchors)
+        random.shuffle(seeds)
+
+        for seed_node in seeds:
+            if seed_node in used_n:
+                continue
+            comm_n, comm_e = await self._grow_community(
+                seed_node, adj, max_units_per_community, used_n, used_e
+            )
+            if comm_n or comm_e:
+                communities.append(
+                    Community(id=len(communities), nodes=comm_n, edges=comm_e)
+                )
+
+        return communities
+
+    async def _pick_anchor_ids(
+        self,
+        nodes: List[tuple[str, dict]],
+    ) -> Set[str]:
+        if self.anchor_ids is not None:
+            return self.anchor_ids
+
+        anchor_ids: Set[str] = set()
+        for node_id, meta in nodes:
+            node_type = str(meta.get("entity_type", "")).lower()
+            if self.anchor_type.lower() in node_type:
+                anchor_ids.add(node_id)
+        return anchor_ids
+
+    @staticmethod
+    async def _grow_community(
+        seed: str,
+        adj: dict[str, List[str]],
+        max_units: int,
+        used_n: set[str],
+        used_e: set[frozenset[str]],
+    ) -> Tuple[List[str], List[Tuple[str, str]]]:
+        """
+        Grow a community from the seed node using BFS.
+        :param seed: seed node id
+        :param adj: adjacency list
+        :param max_units: maximum number of units (nodes + edges) in the community
+        :param used_n: set of used node ids
+        :param used_e: set of used edge keys
+        :return: (list of node ids, list of edge tuples)
+        """
+        comm_n: List[str] = []
+        comm_e: List[Tuple[str, str]] = []
+        queue: deque[tuple[str, Any]] = deque([(NODE_UNIT, seed)])
+        cnt = 0
+
+        while queue and cnt < max_units:
+            k, it = queue.popleft()
+
+            if k == NODE_UNIT:
+                if it in used_n:
+                    continue
+                used_n.add(it)
+                comm_n.append(it)
+                cnt += 1
+                for nei in adj[it]:
+                    e_key = frozenset((it, nei))
+                    if e_key not in used_e:
+                        queue.append((EDGE_UNIT, e_key))
+            else:  # EDGE_UNIT
+                if it in used_e:
+                    continue
+                used_e.add(it)
+                u, v = it
+                comm_e.append((u, v))
+                cnt += 1
+                for n in it:
+                    if n not in used_n:
+                        queue.append((NODE_UNIT, n))
+
+        return comm_n, comm_e
```
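Because `_grow_community` is a plain BFS with a unit budget, its behavior is easy to check on a toy graph. The sketch below inlines a simplified version with no storage layer, anchors picked by a `type` field on the nodes; the graph and its entity types are invented for the demo:

```python
from collections import deque

nodes = {
    "img_tower": {"entity_type": "image"},
    "paris": {"entity_type": "location"},
    "eiffel": {"entity_type": "landmark"},
}
adj = {
    "img_tower": ["paris", "eiffel"],
    "paris": ["img_tower", "eiffel"],
    "eiffel": ["img_tower", "paris"],
}

def grow(seed: str, max_units: int):
    comm_n, comm_e = [], []
    used_n, used_e = set(), set()
    queue = deque([("n", seed)])
    cnt = 0
    while queue and cnt < max_units:
        kind, it = queue.popleft()
        if kind == "n":
            if it in used_n:
                continue
            used_n.add(it)
            comm_n.append(it)
            cnt += 1  # a node counts as one unit
            for nei in adj[it]:
                key = frozenset((it, nei))
                if key not in used_e:
                    queue.append(("e", key))
        else:
            if it in used_e:
                continue
            used_e.add(it)
            comm_e.append(tuple(it))
            cnt += 1  # an edge counts as one unit too
            for n in it:
                if n not in used_n:
                    queue.append(("n", n))
    return comm_n, comm_e

anchors = [nid for nid, meta in nodes.items() if "image" in meta["entity_type"]]
print(grow(anchors[0], max_units=4))
```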
graphgen/models/reader/csv_reader.py
CHANGED

```diff
@@ -9,6 +9,9 @@ class CSVReader(BaseReader):
     def read(self, file_path: str) -> List[Dict[str, Any]]:
 
         df = pd.read_csv(file_path)
-        return df.to_dict(orient="records")
+        for _, row in df.iterrows():
+            if "type" in row and row["type"] == "text" and self.text_column not in row:
+                raise ValueError(
+                    f"Missing '{self.text_column}' in document: {row.to_dict()}"
+                )
+        return self.filter(df.to_dict(orient="records"))
```

(Two of the removed validation lines did not survive extraction; only the recoverable return line is shown.)
graphgen/models/reader/json_reader.py
CHANGED

```diff
@@ -10,9 +10,9 @@ class JSONReader(BaseReader):
             data = json.load(f)
             if isinstance(data, list):
                 for doc in data:
-                    if self.text_column not in doc:
+                    if doc.get("type") == "text" and self.text_column not in doc:
                         raise ValueError(
                             f"Missing '{self.text_column}' in document: {doc}"
                         )
-                return data
+                return self.filter(data)
             raise ValueError("JSON file must contain a list of documents.")
```
graphgen/models/reader/jsonl_reader.py
CHANGED

```diff
@@ -12,12 +12,11 @@ class JSONLReader(BaseReader):
         for line in f:
             try:
                 doc = json.loads(line)
-                if self.text_column in doc:
-                    docs.append(doc)
-                else:
+                if doc.get("type") == "text" and self.text_column not in doc:
                     raise ValueError(
                         f"Missing '{self.text_column}' in document: {doc}"
                     )
+                docs.append(doc)
             except json.JSONDecodeError as e:
                 logger.error("Error decoding JSON line: %s. Error: %s", line, e)
-        return docs
+        return self.filter(docs)
```
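All readers now share the same contract: validate the text column only for text-typed rows, keep everything else, then hand the batch to `BaseReader.filter`. A compact demonstration of the JSONL branch on an in-memory stream (assuming the default text column is `content`; the filter step is covered earlier):

```python
import io
import json

text_column = "content"
stream = io.StringIO(
    '{"type": "text", "content": "The Eiffel Tower is in Paris."}\n'
    '{"type": "image", "img_path": "figs/tower.png"}\n'
    "not json at all\n"
)

docs = []
for line in stream:
    try:
        doc = json.loads(line)
        # Only text-typed rows must carry the text column; others pass through.
        if doc.get("type") == "text" and text_column not in doc:
            raise ValueError(f"Missing '{text_column}' in document: {doc}")
        docs.append(doc)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON line: {line!r}. Error: {e}")

print(len(docs), "documents accepted")  # 2 documents accepted
```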
graphgen/models/reader/pdf_reader.py
CHANGED

```diff
@@ -74,7 +74,7 @@ class PDFReader(BaseReader):
         kwargs = {**self._default_kwargs, **override}
 
         mineru_result = self._call_mineru(pdf_path, kwargs)
-        return mineru_result
+        return self.filter(mineru_result)
 
     def _call_mineru(
         self, pdf_path: Path, kwargs: Dict[str, Any]
@@ -172,8 +172,6 @@ class MinerUParser:
             for key in ("page_idx", "bbox", "text_level"):
                 if item.get(key) is not None:
                     del item[key]
-            if item["type"] == "text" and not item["content"].strip():
-                continue
             results.append(item)
         return results
```
graphgen/models/reader/txt_reader.py
CHANGED

```diff
@@ -11,4 +11,4 @@ class TXTReader(BaseReader):
             line = line.strip()
             if line:
                 docs.append({self.text_column: line})
-        return docs
+        return self.filter(docs)
```
graphgen/operators/__init__.py
CHANGED

```diff
@@ -1,4 +1,4 @@
-from .build_kg import build_kg
+from .build_kg import build_mm_kg, build_text_kg
 from .generate import generate_qas
 from .judge import judge_statement
 from .partition import partition_kg
```
graphgen/operators/build_kg/__init__.py
CHANGED

```diff
@@ -1 +1,2 @@
-from .build_kg import build_kg
+from .build_mm_kg import build_mm_kg
+from .build_text_kg import build_text_kg
```
graphgen/operators/build_kg/build_mm_kg.py
ADDED

```diff
@@ -0,0 +1,56 @@
+from collections import defaultdict
+from typing import List
+
+import gradio as gr
+
+from graphgen.bases.base_storage import BaseGraphStorage
+from graphgen.bases.datatypes import Chunk
+from graphgen.models import MMKGBuilder, OpenAIClient
+from graphgen.utils import run_concurrent
+
+
+async def build_mm_kg(
+    llm_client: OpenAIClient,
+    kg_instance: BaseGraphStorage,
+    chunks: List[Chunk],
+    progress_bar: gr.Progress = None,
+):
+    """
+    Build multi-modal KG and merge into kg_instance
+    :param llm_client: Synthesizer LLM model to extract entities and relationships
+    :param kg_instance
+    :param chunks
+    :param progress_bar: Gradio progress bar to show the progress of the extraction
+    :return:
+    """
+    mm_builder = MMKGBuilder(llm_client=llm_client)
+
+    results = await run_concurrent(
+        mm_builder.extract,
+        chunks,
+        desc="[2/4] Extracting entities and relationships from multi-modal chunks",
+        unit="chunk",
+        progress_bar=progress_bar,
+    )
+
+    nodes = defaultdict(list)
+    edges = defaultdict(list)
+    for n, e in results:
+        for k, v in n.items():
+            nodes[k].extend(v)
+        for k, v in e.items():
+            edges[tuple(sorted(k))].extend(v)
+
+    await run_concurrent(
+        lambda kv: mm_builder.merge_nodes(kv, kg_instance=kg_instance),
+        list(nodes.items()),
+        desc="Inserting entities into storage",
+    )
+
+    await run_concurrent(
+        lambda kv: mm_builder.merge_edges(kv, kg_instance=kg_instance),
+        list(edges.items()),
+        desc="Inserting relationships into storage",
+    )
+
+    return kg_instance
```
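After the concurrent extraction, the per-chunk node and edge dicts are merged, with undirected edge keys normalized via `tuple(sorted(k))` so that the two directions of the same relation collapse into one entry. A standalone rehearsal of that merge on fabricated extraction results:

```python
from collections import defaultdict

# Two per-chunk extraction results: (nodes, edges), as MMKGBuilder.extract returns.
results = [
    ({"eiffel tower": [{"description": "landmark"}]},
     {("eiffel tower", "paris"): [{"description": "located in"}]}),
    ({"paris": [{"description": "city"}]},
     {("paris", "eiffel tower"): [{"description": "contains"}]}),
]

nodes = defaultdict(list)
edges = defaultdict(list)
for n, e in results:
    for k, v in n.items():
        nodes[k].extend(v)
    for k, v in e.items():
        # Both edge directions collapse to one sorted key.
        edges[tuple(sorted(k))].extend(v)

print(list(edges))         # [('eiffel tower', 'paris')]
print(len(edges[("eiffel tower", "paris")]))  # 2 merged descriptions
```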
graphgen/operators/build_kg/{build_kg.py → build_text_kg.py}
RENAMED

```diff
@@ -9,7 +9,7 @@ from graphgen.models import LightRAGKGBuilder, OpenAIClient
 from graphgen.utils import run_concurrent
 
 
-async def build_kg(
+async def build_text_kg(
     llm_client: OpenAIClient,
     kg_instance: BaseGraphStorage,
     chunks: List[Chunk],
```
graphgen/operators/judge.py
CHANGED

```diff
@@ -37,7 +37,7 @@ async def judge_statement(  # pylint: disable=too-many-statements
         edge_data = edge[2]
 
         if (not re_judge) and "loss" in edge_data and edge_data["loss"] is not None:
-            logger.info(
+            logger.debug(
                 "Edge %s -> %s already judged, loss: %s, skip",
                 source_id,
                 target_id,
@@ -63,7 +63,7 @@ async def judge_statement(  # pylint: disable=too-many-statements
 
             loss = yes_no_loss_entropy(judgements, gts)
 
-            logger.info(
+            logger.debug(
                 "Edge %s -> %s description: %s loss: %s",
                 source_id,
                 target_id,
@@ -100,7 +100,7 @@ async def judge_statement(  # pylint: disable=too-many-statements
         node_data = node[1]
 
         if (not re_judge) and "loss" in node_data and node_data["loss"] is not None:
-            logger.info(
+            logger.debug(
                 "Node %s already judged, loss: %s, skip", node_id, node_data["loss"]
             )
             return node_id, node_data
@@ -123,14 +123,14 @@ async def judge_statement(  # pylint: disable=too-many-statements
 
             loss = yes_no_loss_entropy(judgements, gts)
 
-            logger.info(
+            logger.debug(
                 "Node %s description: %s loss: %s", node_id, description, loss
             )
 
             node_data["loss"] = loss
         except Exception as e:  # pylint: disable=broad-except
             logger.error("Error in judging entity %s: %s", node_id, e)
-            logger.info("Use default loss 0.1")
+            logger.error("Use default loss 0.1")
             node_data["loss"] = -math.log(0.1)
 
         await graph_storage.update_node(node_id, node_data)
```
graphgen/operators/partition/partition_kg.py
CHANGED

```diff
@@ -1,7 +1,8 @@
 from typing import Any
 
-from graphgen.bases import BaseGraphStorage, BaseTokenizer
+from graphgen.bases import BaseGraphStorage, BaseKVStorage, BaseTokenizer
 from graphgen.models import (
+    AnchorBFSPartitioner,
     BFSPartitioner,
     DFSPartitioner,
     ECEPartitioner,
@@ -14,6 +15,7 @@ from .pre_tokenize import pre_tokenize
 
 async def partition_kg(
     kg_instance: BaseGraphStorage,
+    chunk_storage: BaseKVStorage,
     tokenizer: Any = BaseTokenizer,
     partition_config: dict = None,
 ) -> list[
@@ -39,10 +41,28 @@ async def partition_kg(
     elif method == "leiden":
         logger.info("Partitioning knowledge graph using Leiden method.")
         partitioner = LeidenPartitioner()
+    elif method == "anchor_bfs":
+        logger.info("Partitioning knowledge graph using Anchor BFS method.")
+        partitioner = AnchorBFSPartitioner(
+            anchor_type=method_params.get("anchor_type"),
+            anchor_ids=set(method_params.get("anchor_ids", []))
+            if method_params.get("anchor_ids")
+            else None,
+        )
     else:
         raise ValueError(f"Unsupported partition method: {method}")
 
     communities = await partitioner.partition(g=kg_instance, **method_params)
     logger.info("Partitioned the graph into %d communities.", len(communities))
     batches = await partitioner.community2batch(communities, g=kg_instance)
+
+    for _, batch in enumerate(batches):
+        nodes, edges = batch
+        for node_id, node_data in nodes:
+            entity_type = node_data.get("entity_type")
+            if entity_type and "image" in entity_type.lower():
+                node_id = node_id.strip('"').lower()
+                image_data = await chunk_storage.get_by_id(node_id)
+                if image_data:
+                    node_data["images"] = image_data
     return batches
```
graphgen/operators/split/split_chunks.py
CHANGED
@@ -48,25 +48,31 @@ async def chunk_documents(
     async for doc_key, doc in tqdm_async(
         new_docs.items(), desc="[1/4]Chunking documents", unit="doc"
     ):
-        doc_language = detect_main_language(doc["content"])
-        text_chunks = split_chunks(
-            doc["content"],
-            language=doc_language,
-            chunk_size=chunk_size,
-            chunk_overlap=chunk_overlap,
-        )
-
-        chunks = {
-            compute_content_hash(txt, prefix="chunk-"): {
-                "content": txt,
-                "full_doc_id": doc_key,
-                "length": len(tokenizer_instance.encode(txt)),
-                "language": doc_language,
-            }
-            for txt in text_chunks
-        }
-
-
+        doc_type = doc.get("type")
+        if doc_type == "text":
+            doc_language = detect_main_language(doc["content"])
+            text_chunks = split_chunks(
+                doc["content"],
+                language=doc_language,
+                chunk_size=chunk_size,
+                chunk_overlap=chunk_overlap,
+            )
+
+            chunks = {
+                compute_content_hash(txt, prefix="chunk-"): {
+                    "content": txt,
+                    "type": "text",
+                    "full_doc_id": doc_key,
+                    "length": len(tokenizer_instance.encode(txt))
+                    if tokenizer_instance
+                    else len(txt),
+                    "language": doc_language,
+                }
+                for txt in text_chunks
+            }
+        else:
+            chunks = {doc_key.replace("doc-", f"{doc_type}-"): {**doc}}
+
         inserting_chunks.update(chunks)
 
         if progress_bar is not None:
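The new `else` branch keys non-text chunks by rewriting the document-id prefix; a worked example with a hypothetical id:

```python
# Hypothetical doc record; the "doc-" prefix follows the doc_key convention above.
doc_key = "doc-c71ef797e99af81047fbc7509609c765"
doc = {"type": "image", "img_path": "figs/eiffel.png"}

chunk_key = doc_key.replace("doc-", f"{doc['type']}-")
assert chunk_key == "image-c71ef797e99af81047fbc7509609c765"
```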
graphgen/templates/__init__.py
CHANGED
@@ -5,9 +5,9 @@ from .generation import (
     ATOMIC_GENERATION_PROMPT,
     COT_GENERATION_PROMPT,
     MULTI_HOP_GENERATION_PROMPT,
+    VQA_GENERATION_PROMPT,
 )
-from .kg_extraction import KG_EXTRACTION_PROMPT
-from .kg_summarization import KG_SUMMARIZATION_PROMPT
+from .kg import KG_EXTRACTION_PROMPT, KG_SUMMARIZATION_PROMPT, MMKG_EXTRACTION_PROMPT
 from .question_generation import QUESTION_GENERATION_PROMPT
 from .search_judgement import SEARCH_JUDGEMENT_PROMPT
 from .statement_judgement import STATEMENT_JUDGEMENT_PROMPT
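With these re-exports in place, the new prompts resolve from the package root (a two-line sanity check that follows directly from the updated `__init__.py`):

```python
from graphgen.templates import MMKG_EXTRACTION_PROMPT, VQA_GENERATION_PROMPT
```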
graphgen/templates/generation/__init__.py
CHANGED
@@ -2,3 +2,4 @@ from .aggregated_generation import AGGREGATED_GENERATION_PROMPT
 from .atomic_generation import ATOMIC_GENERATION_PROMPT
 from .cot_generation import COT_GENERATION_PROMPT
 from .multi_hop_generation import MULTI_HOP_GENERATION_PROMPT
+from .vqa_generation import VQA_GENERATION_PROMPT
graphgen/templates/generation/aggregated_generation.py
CHANGED
@@ -1,7 +1,7 @@
 # pylint: disable=C0301
 ANSWER_REPHRASING_CONTEXT_EN: str = """---Role---
 You are an NLP expert responsible for generating a logically structured and coherent rephrased version of the TEXT based on ENTITIES and RELATIONSHIPS provided below. You may refer to the original text to assist in generating the rephrased version, but ensure that the final output text meets the requirements.
-Use {language} as output language.
+Use English as output language.
 
 ---Goal---
 To generate a version of the text that is rephrased and conveys the same meaning as the original entity and relationship descriptions, while:
@@ -52,7 +52,7 @@ To generate a version of the text that is rephrased and conveys the same meaning
 
 ANSWER_REPHRASING_CONTEXT_ZH: str = """---角色---
 你是一位NLP专家,负责根据下面提供的实体和关系生成逻辑结构清晰且连贯的文本重述版本。你可以参考原始文本辅助生成,但需要确保最终输出的文本符合要求。
-使用{language}作为输出语言。
+使用中文作为输出语言。
 
 ---目标---
 生成文本的重述版本,使其传达与原始实体和关系描述相同的含义,同时:
@@ -100,7 +100,7 @@ ANSWER_REPHRASING_CONTEXT_ZH: str = """---角色---
 
 ANSWER_REPHRASING_EN: str = """---Role---
 You are an NLP expert responsible for generating a logically structured and coherent rephrased version of the TEXT based on ENTITIES and RELATIONSHIPS provided below.
-Use {language} as output language.
+Use English as output language.
 
 ---Goal---
 To generate a version of the text that is rephrased and conveys the same meaning as the original entity and relationship descriptions, while:
@@ -146,7 +146,7 @@ To generate a version of the text that is rephrased and conveys the same meaning
 
 ANSWER_REPHRASING_ZH: str = """---角色---
 你是一位NLP专家,负责根据下面提供的实体和关系生成逻辑结构清晰且连贯的文本重述版本。
-使用{language}作为输出语言。
+使用中文作为输出语言。
 
 ---目标---
 生成文本的重述版本,使其传达与原始实体和关系描述相同的含义,同时:
graphgen/templates/generation/vqa_generation.py
ADDED
@@ -0,0 +1,104 @@
+# pylint: disable=C0301
+TEMPLATE_EN: str = """You are a senior VQA data engineer. Your task is to generate logically coherent, verifiable and non-hallucinated question-answer pairs for the given multi-modal samples.
+Use English as the output language.
+
+---Objectives---
+Create multiple sets of VQA question-answer pairs that satisfy the following:
+1. Only ask about objectively existing facts in the given data, avoiding subjective or ambiguous questions.
+2. Ensure that each question has a clear and verifiable answer, avoiding questions with no answer or uncertainty.
+3. Questions should cover various aspects of both image and text content, ensuring diversity and comprehensiveness.
+4. Avoid repetitive questions, ensuring that each question is unique and meaningful.
+5. Use clear and concise language, avoiding complex or ambiguous wording.
+
+---Instructions---
+1. Carefully analyze the provided entities and relationships to identify:
+- Key concepts and their hierarchical relationships
+- Temporal sequences and time order
+- Cause-and-effect relationships
+- Dependencies between different elements
+2. Organize the information into a logical sequence by:
+- Starting with foundational concepts
+- Gradually building up to more complex relationships
+- Grouping related ideas together
+- Creating clear transitions between sections
+3. Maintain the following when generating question-answer pairs:
+- Logical flow
+- Clear connections between concepts
+- Appropriate context and background
+- Coherent narrative structure
+4. Review and refine the question-answer pairs to ensure:
+- Overall logical consistency
+- Clear cause-and-effect relationships
+
+################
+-Entities-
+################
+{entities}
+################
+-Relationships-
+################
+{relationships}
+################
+Directly output the generated questions and answers, please do not directly copy the example questions and answers, and do not provide irrelevant information.
+Here is the response format you should follow:
+Question: <Question1>
+Answer: <Answer1>
+
+Question: <Question2>
+Answer: <Answer2>
+
+"""
+
+TEMPLATE_ZH: str = """---角色---
+你是一位资深 VQA 数据工程师。你需要为给定的多模态样本生成逻辑连贯、可验证、无幻觉的问答对。
+使用中文作为输出语言。
+
+---目标---
+创建多组 VQA 问答对,满足:
+1. 仅询问给定数据中客观存在的事实,避免主观或模糊的问题。
+2. 确保每个问题都有明确且可验证的答案,避免无答案或不确定的问题。
+3. 问题应涵盖图像和文本内容的各个方面,确保多样性和全面性。
+4. 避免重复问题,确保每个问题都是独特且有意义的。
+5. 使用清晰简洁的语言,避免复杂或含糊的措辞。
+
+---说明---
+1. 仔细分析提供的实体和关系,以识别:
+- 关键概念及其层级关系
+- 时间序列和时间顺序
+- 因果关系
+- 不同元素之间的依赖关系
+2. 通过以下方式将信息组织成逻辑顺序:
+- 从基础概念开始
+- 逐步建立更复杂的关系
+- 将相关的想法分组在一起
+- 在各部分之间创建清晰的过渡
+3. 生成问答对时保持:
+- 逻辑流畅
+- 概念之间的清晰联系
+- 适当的上下文和背景
+- 连贯的叙述结构
+4. 检查和完善问答对以确保:
+- 整体逻辑一致性
+- 清晰的因果关系
+
+################
+-实体-
+################
+{entities}
+
+################
+-关系-
+################
+{relationships}
+################
+直接输出生成的问题和答案,请不要直接复制示例问题和答案,不要输出无关内容。
+以下是你应该遵循的响应格式:
+问题: <问题1>
+答案: <答案1>
+
+问题: <问题2>
+答案: <答案2>
+
+"""
+
+VQA_GENERATION_PROMPT = {"en": TEMPLATE_EN, "zh": TEMPLATE_ZH}
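`TEMPLATE_EN` carries exactly two placeholders, `{entities}` and `{relationships}`, so plain `str.format` suffices; a sketch with made-up record payloads:

```python
from graphgen.templates import VQA_GENERATION_PROMPT

# Payloads are illustrative only; the record syntax mimics the KG extraction format.
prompt = VQA_GENERATION_PROMPT["en"].format(
    entities='("entity"<|>"Eiffel Tower"<|>"landmark"<|>"Iconic Paris structure")',
    relationships='("relationship"<|>"image-c71ef797e99af81047fbc7509609c765"<|>"Eiffel Tower"<|>"The image shows the tower")',
)
```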
graphgen/templates/kg/__init__.py
ADDED
@@ -0,0 +1,3 @@
+from .kg_extraction import KG_EXTRACTION_PROMPT
+from .kg_summarization import KG_SUMMARIZATION_PROMPT
+from .mm_kg_extraction import MMKG_EXTRACTION_PROMPT
graphgen/templates/{kg_extraction.py → kg/kg_extraction.py}
RENAMED
@@ -1,10 +1,9 @@
 # pylint: disable=C0301
-
 TEMPLATE_EN: str = """You are an NLP expert, skilled at analyzing text to extract named entities and their relationships.
 
 -Goal-
 Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.
-Use {language} as output language.
+Use English as output language.
 
 -Steps-
 1. Identify all entities. For each identified entity, extract the following information:
@@ -23,7 +22,7 @@ Format each relationship as ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_summary>)
 3. Identify high-level key words that summarize the main concepts, themes, or topics of the entire text. These should capture the overarching ideas present in the document.
 Format the content-level key words as ("content_keywords"{tuple_delimiter}<high_level_keywords>)
 
-4. Return output in {language} as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.
+4. Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.
 
 5. When finished, output {completion_delimiter}
 
@@ -85,7 +84,7 @@ TEMPLATE_ZH: str = """你是一个NLP专家,擅长分析文本提取命名实体及其关系。
 
 -目标-
 给定一个实体类型列表和可能与列表相关的文本,从文本中识别所有这些类型的实体,以及这些实体之间所有的关系。
-使用{language}作为输出语言。
+使用中文作为输出语言。
 
 -步骤-
 1. 识别所有实体。对于每个识别的实体,提取以下信息:
@@ -189,12 +188,12 @@ Answer YES | NO if there are still entities and relationships that need to be added.
 IF_LOOP_ZH: str = """看起来可能仍然遗漏了一些实体和关系。如果仍有实体和关系需要添加,请回答YES | NO。"""
 
 KG_EXTRACTION_PROMPT: dict = {
-    "English": {
+    "en": {
         "TEMPLATE": TEMPLATE_EN,
         "CONTINUE": CONTINUE_EN,
         "IF_LOOP": IF_LOOP_EN,
     },
-    "Chinese": {
+    "zh": {
         "TEMPLATE": TEMPLATE_ZH,
         "CONTINUE": CONTINUE_ZH,
         "IF_LOOP": IF_LOOP_ZH,
@@ -205,6 +204,5 @@ KG_EXTRACTION_PROMPT: dict = {
         "completion_delimiter": "<|COMPLETE|>",
         "entity_types": "concept, date, location, keyword, organization, person, event, work, nature, artificial, \
 science, technology, mission, gene",
-        "language": "English",
     },
 }
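The key rename from "English"/"Chinese" to "en"/"zh" means the prompt table can now be indexed directly with the codes returned by `detect_main_language` (updated further down); a sketch:

```python
from graphgen.templates import KG_EXTRACTION_PROMPT
from graphgen.utils.detect_lang import detect_main_language

text = "The Eiffel Tower stands in Paris."   # made-up input
lang = detect_main_language(text)            # -> "en"
template = KG_EXTRACTION_PROMPT[lang]["TEMPLATE"]
```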
graphgen/templates/{kg_summarization.py → kg/kg_summarization.py}
RENAMED
@@ -3,7 +3,7 @@ Given one entity or relationship, and a list of descriptions, all related to the same entity or relationship.
 Please concatenate all of these into a single, comprehensive description. Make sure to include information collected from all the descriptions.
 If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary.
 Make sure it is written in third person, and include the entity names so we the have full context.
-Use {language} as output language.
+Use English as output language.
 
 #######
 -Data-
@@ -18,7 +18,7 @@ TEMPLATE_ZH = """你是一个NLP专家,负责根据以下提供的数据生成
 请将所有这些描述整合成一个综合描述。确保包含所有描述中收集的信息。
 如果提供的描述是矛盾的,请解决这些矛盾并提供一个连贯的总结。
 确保以第三人称写作,并包含实体名称,以便我们有完整的上下文。
-使用{language}作为输出语言。
+使用中文作为输出语言。
 
 #######
 -数据-
@@ -30,14 +30,9 @@ TEMPLATE_ZH = """你是一个NLP专家,负责根据以下提供的数据生成
 
 
 KG_SUMMARIZATION_PROMPT = {
-    "Chinese": {
-        "TEMPLATE": TEMPLATE_ZH
-    },
-    "English": {
-        "TEMPLATE": TEMPLATE_EN
-    },
+    "zh": {"TEMPLATE": TEMPLATE_ZH},
+    "en": {"TEMPLATE": TEMPLATE_EN},
     "FORMAT": {
-        "language": "English",
         "tuple_delimiter": "<|>",
         "record_delimiter": "##",
         "completion_delimiter": "<|COMPLETE|>",
graphgen/templates/kg/mm_kg_extraction.py
ADDED
@@ -0,0 +1,131 @@
+# pylint: disable=C0301
+TEMPLATE_EN: str = """You are an expert in multi-modal data analysis and knowledge graph construction. Your task is to extract named entities and relationships from a given multi-modal data chunk and its accompanying text.
+
+-Objective-
+Given a multi-modal data chunk (e.g., image, table, formula, etc. + accompanying text), construct a knowledge graph centered around the "central multi-modal entity":
+- The central entity must be the image/table/formula itself (e.g., image-c71ef797e99af81047fbc7509609c765).
+- Related entities and relationships must be extracted from the accompanying text.
+- Only retain edges directly connected to the central entity, forming a star-shaped graph.
+Use English as the output language.
+
+-Steps-
+1. Identify the unique central multi-modal entity and recognize all text entities directly related to the central entity from the accompanying text.
+For the central entity, extract the following information:
+- entity_name: Use the unique identifier of the data chunk (e.g., image-c71ef797e99af81047fbc7509609c765).
+- entity_type: Label according to the type of data chunk (image, table, formula, etc.).
+- entity_summary: A brief description of the content of the data chunk and its role in the accompanying text.
+For each entity recognized from the accompanying text, extract the following information:
+- entity_name: The name of the entity, capitalized
+- entity_type: One of the following types: [{entity_types}]
+- entity_summary: A comprehensive summary of the entity's attributes and activities
+Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_summary>)
+
+2. From the entities identified in Step 1, recognize all (source_entity, target_entity) pairs that are *obviously related* to each other.
+For each pair of related entities, extract the following information:
+- source_entity: The name of the source entity identified in Step 1
+- target_entity: The name of the target entity identified in Step 1
+- relationship_summary: Explain why you think the source entity and target entity are related to each other
+Format each relationship as ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_summary>)
+
+3. Return the output list of all entities and relationships identified in Steps 1 and 2 in English. Use **{record_delimiter}** as the list separator.
+
+4. Upon completion, output {completion_delimiter}
+
+################
+-Example-
+################
+Multi-modal data chunk type: image
+Multi-modal data chunk unique identifier: image-c71ef797e99af81047fbc7509609c765
+Accompanying text: The Eiffel Tower is an iconic structure in Paris, France, designed by Gustave Eiffel and completed in 1889. It stands 324 meters tall and is one of the tallest structures in the world. The Eiffel Tower is located on the banks of the Seine River and attracts millions of visitors each year. It is not only an engineering marvel but also an important symbol of French culture.
+################
+Output:
+("entity"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"image"{tuple_delimiter}"This is an image showcasing the iconic structure in Paris, France, the Eiffel Tower, highlighting its full height of 324 meters along with the riverside scenery, symbolizing both engineering and cultural significance"){record_delimiter}
+("entity"{tuple_delimiter}"Eiffel Tower"{tuple_delimiter}"landmark"{tuple_delimiter}"The Eiffel Tower is an iconic structure in Paris, France, designed by Gustave Eiffel and completed in 1889, standing 324 meters tall, located on the banks of the Seine River, attracting millions of visitors each year"){record_delimiter}
+("entity"{tuple_delimiter}"Paris, France"{tuple_delimiter}"location"{tuple_delimiter}"Paris, France is the capital of France, known for its rich historical and cultural heritage and as the location of the Eiffel Tower"){record_delimiter}
+("entity"{tuple_delimiter}"Gustave Eiffel"{tuple_delimiter}"person"{tuple_delimiter}"Gustave Eiffel is a renowned French engineer who designed and built the Eiffel Tower"){record_delimiter}
+("entity"{tuple_delimiter}"Seine River"{tuple_delimiter}"location"{tuple_delimiter}"The Seine River is a major river flowing through Paris, France, with the Eiffel Tower located on its banks"){completion_delimiter}
+("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"Eiffel Tower"{tuple_delimiter}"The image showcases the iconic structure, the Eiffel Tower"){record_delimiter}
+("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"Paris, France"{tuple_delimiter}"The image's background is Paris, France, highlighting the geographical location of the Eiffel Tower"){record_delimiter}
+("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"Gustave Eiffel"{tuple_delimiter}"The Eiffel Tower in the image was designed by Gustave Eiffel"){record_delimiter}
+("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"Seine River"{tuple_delimiter}"The image showcases the scenery of the Eiffel Tower located on the banks of the Seine River"){completion_delimiter}
+################
+
+-Real Data-
+Multi-modal data chunk type: {chunk_type}
+Multi-modal data chunk unique identifier: {chunk_id}
+Accompanying text: {chunk_text}
+################
+Output:
+"""
+
+TEMPLATE_ZH: str = """你是一个多模态数据分析和知识图谱构建专家。你的任务是从给定的多模态数据块及其伴随文本中抽取命名实体与关系。
+
+-目标-
+给定一个多模态数据块(例如图像、表格、公式等 + 伴随文本),构建以「中心多模态实体」为核心的知识图:
+- 中心实体必须是图像/表格/公式本身(如 image-c71ef797e99af81047fbc7509609c765)。
+- 相关实体和关系必须从伴随文本中抽取。
+- 只保留与中心实体直接相连的边,形成星型图。
+使用中文作为输出语言。
+
+-步骤-
+1. 确定唯一的中心多模态实体,从伴随文本中识别所有与中心实体直接相关的文本实体。
+对于中心实体,提取以下信息:
+- entity_name:使用数据块的唯一标识符(如 image-c71ef797e99af81047fbc7509609c765)。
+- entity_type:根据数据块类型(图像、表格、公式等)进行标注。
+- entity_summary:简要描述数据块的内容和其在伴随文本中的作用。
+对于从伴随文本中识别的每个实体,提取以下信息:
+- entity_name:实体的名称,首字母大写
+- entity_type:以下类型之一:[{entity_types}]
+- entity_summary:实体的属性与活动的全面总结
+将每个实体格式化为("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_summary>)
+
+2. 从步骤1中识别的实体中,识别所有(源实体,目标实体)对,这些实体彼此之间*明显相关*。
+对于每对相关的实体,提取以下信息:
+- source_entity:步骤1中识别的源实体名称
+- target_entity:步骤1中识别的目标实体名称
+- relationship_summary:解释为什么你认为源实体和目标实体彼此相关
+将每个关系格式化为("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_summary>)
+
+3. 以中文返回步骤1和2中识别出的所有实体和关系的输出列表。使用**{record_delimiter}**作为列表分隔符。
+
+4. 完成后,输出{completion_delimiter}
+
+################
+-示例-
+################
+多模态数据块类型:image
+多模态数据块唯一标识符:image-c71ef797e99af81047fbc7509609c765
+伴随文本:埃菲尔铁塔是法国巴黎的标志性结构,由古斯塔夫·埃菲尔设计并于1889年建成。它高324米,是世界上最高的建筑之一。埃菲尔铁塔位于塞纳河畔,吸引了数百万游客前来参观。它不仅是工程学的奇迹,也是法国文化的重要象征。
+################
+输出:
+("entity"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"image"{tuple_delimiter}"这是一张展示法国巴黎标志性建筑的图像,主体为埃菲尔铁塔,呈现其324米高度的全貌与河畔景观,具有工程与文化双重象征意义"){record_delimiter}
+("entity"{tuple_delimiter}"埃菲尔铁塔"{tuple_delimiter}"landmark"{tuple_delimiter}"埃菲尔铁塔是法国巴黎的标志性结构,由古斯塔夫·埃菲尔设计并于1889年建成,高324米,是世界上最高的建筑之一,位于塞纳河畔,吸引了数百万游客前来参观"){record_delimiter}
+("entity"{tuple_delimiter}"法国巴黎"{tuple_delimiter}"location"{tuple_delimiter}"法国巴黎是法国的首都,以其丰富的历史文化遗产和作为埃菲尔铁塔所在地而闻名"){record_delimiter}
+("entity"{tuple_delimiter}"古斯塔夫·埃菲尔"{tuple_delimiter}"person"{tuple_delimiter}"古斯塔夫·埃菲尔是法国著名的工程师,设计并建造了埃菲尔铁塔"){record_delimiter}
+("entity"{tuple_delimiter}"塞纳河"{tuple_delimiter}"location"{tuple_delimiter}"塞纳河是流经法国巴黎的重要河流,埃菲尔铁塔位于其畔"){completion_delimiter}
+("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"埃菲尔铁塔"{tuple_delimiter}"图像展示了埃菲尔铁塔这一标志性建筑"){record_delimiter}
+("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"法国巴黎"{tuple_delimiter}"图像背景为法国巴黎,突显了埃菲尔铁塔的地理位置"){record_delimiter}
+("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"古斯塔夫·埃菲尔"{tuple_delimiter}"图像中的埃菲尔铁塔是由古斯塔夫·埃菲尔设计的"){record_delimiter}
+("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"塞纳河"{tuple_delimiter}"图像展示了埃菲尔铁塔位于塞纳河畔的景观"){completion_delimiter}
+################
+
+-真实数据-
+多模态数据块类型: {chunk_type}
+多模态数据块唯一标识符: {chunk_id}
+伴随文本: {chunk_text}
+################
+输出:
+"""
+
+
+MMKG_EXTRACTION_PROMPT: dict = {
+    "en": TEMPLATE_EN,
+    "zh": TEMPLATE_ZH,
+    "FORMAT": {
+        "tuple_delimiter": "<|>",
+        "record_delimiter": "##",
+        "completion_delimiter": "<|COMPLETE|>",
+        "entity_types": "concept, date, location, keyword, organization, person, event, work, nature, artificial, \
+science, technology, mission, gene",
+    },
+}
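A sketch of filling the new multimodal template; everything except the `chunk_*` values comes from the `FORMAT` dict at the bottom of the file:

```python
from graphgen.templates import MMKG_EXTRACTION_PROMPT

fmt = MMKG_EXTRACTION_PROMPT["FORMAT"]
prompt = MMKG_EXTRACTION_PROMPT["en"].format(
    chunk_type="image",
    chunk_id="image-c71ef797e99af81047fbc7509609c765",  # example id from the prompt itself
    chunk_text="The Eiffel Tower is an iconic structure in Paris, France.",
    **fmt,  # supplies entity_types and the three delimiters
)
```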
graphgen/utils/__init__.py
CHANGED
@@ -9,7 +9,7 @@ from .format import (
     split_string_by_multi_markers,
     write_json,
 )
-from .hash import compute_args_hash, compute_content_hash
+from .hash import compute_args_hash, compute_content_hash, compute_mm_hash
 from .help_nltk import NLTKHelper
 from .log import logger, parse_log, set_logger
 from .loop import create_event_loop
graphgen/utils/detect_lang.py
CHANGED
@@ -1,40 +1,41 @@
 def detect_main_language(text):
     """
-    识别文本的主要语言,'zh' 表示中文,'en' 表示英文
+    Detect the main language of the text, 'zh' for Chinese, 'en' for English
 
     :param text:
     :return:
     """
     assert isinstance(text, str)
+
     def is_chinese_char(char):
-        return '\u4e00' <= char <= '\u9fff'
+        return "\u4e00" <= char <= "\u9fff"
 
     def is_english_char(char):
         return char.isascii() and char.isalpha()
 
-    # 去除空白字符
-    text = ''.join(char for char in text if char.strip())
+    text = "".join(char for char in text if char.strip())
 
     chinese_count = sum(1 for char in text if is_chinese_char(char))
     english_count = sum(1 for char in text if is_english_char(char))
 
     total = chinese_count + english_count
     if total == 0:
-        return 'en'
+        return "en"
 
     chinese_ratio = chinese_count / total
 
     if chinese_ratio >= 0.5:
-        return 'zh'
-    return 'en'
+        return "zh"
+    return "en"
+
 
 def detect_if_chinese(text):
     """
-    判断文本是否包含中文
+    Detect if the text contains any Chinese characters
 
     :param text:
     :return:
     """
 
     assert isinstance(text, str)
-    return any('\u4e00' <= char <= '\u9fff' for char in text)
+    return any("\u4e00" <= char <= "\u9fff" for char in text)
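The behaviour is easy to spot-check; these assertions follow directly from the new code:

```python
from graphgen.utils.detect_lang import detect_if_chinese, detect_main_language

assert detect_main_language("Hello world") == "en"
assert detect_main_language("埃菲尔铁塔位于巴黎") == "zh"
assert detect_main_language("") == "en"   # no counted characters -> default "en"
assert detect_if_chinese("Hello 世界") is True
```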
graphgen/utils/hash.py
CHANGED
@@ -1,7 +1,23 @@
 from hashlib import md5
 
+
 def compute_args_hash(*args):
     return md5(str(args).encode()).hexdigest()
 
+
 def compute_content_hash(content, prefix: str = ""):
     return prefix + md5(content.encode()).hexdigest()
+
+
+def compute_mm_hash(item, prefix: str = ""):
+    if item.get("type") == "text" and item.get("text"):
+        content = item["text"].strip()
+    elif item.get("type") == "image" and item.get("img_path"):
+        content = f"image:{item['img_path']}"
+    elif item.get("type") == "table" and item.get("table_body"):
+        content = f"table:{item['table_body']}"
+    elif item.get("type") == "equation" and item.get("text"):
+        content = f"equation:{item['text']}"
+    else:
+        content = str(item)
+    return prefix + md5(content.encode()).hexdigest()
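`compute_mm_hash` reduces each modality to a canonical string before hashing; a sketch of the image case using only the stdlib:

```python
from hashlib import md5

item = {"type": "image", "img_path": "figs/eiffel.png"}  # hypothetical chunk record

# Per the diff, compute_mm_hash(item, prefix="image-") is equivalent to:
expected = "image-" + md5(f"image:{item['img_path']}".encode()).hexdigest()
```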
graphgen/utils/log.py
CHANGED
@@ -8,7 +8,8 @@ logger = logging.getLogger("graphgen")
 
 def set_logger(
     log_file: str,
-    log_level: int = logging.INFO,
+    file_level: int = logging.DEBUG,
+    console_level: int = logging.INFO,
     *,
     if_stream: bool = True,
     max_bytes: int = 50 * 1024 * 1024,  # 50 MB
@@ -22,14 +23,18 @@ def set_logger(
     if force:
         logger.handlers.clear()
 
-    logger.setLevel(log_level)
+    logger.setLevel(
+        min(file_level, console_level)
+    )  # Set to the lowest level to capture all logs
     logger.propagate = False
 
     if logger.handlers:
         logger.handlers.clear()
 
     if if_stream:
-        console = RichHandler(level=log_level, show_path=False, rich_tracebacks=True)
+        console = RichHandler(
+            level=console_level, show_path=False, rich_tracebacks=True
+        )
         console.setFormatter(logging.Formatter("%(message)s"))
         logger.addHandler(console)
 
@@ -39,7 +44,7 @@ def set_logger(
         backupCount=backup_count,
         encoding="utf-8",
     )
-    file_handler.setLevel(log_level)
+    file_handler.setLevel(file_level)
     file_handler.setFormatter(
         logging.Formatter(
             "[%(asctime)s] %(levelname)s [%(name)s:%(filename)s:%(lineno)d] %(message)s",
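With the split levels, a typical call (a sketch; the log path is made up) keeps the console readable while the rotating file captures everything:

```python
import logging

from graphgen.utils import set_logger

# DEBUG and below go to the rotating file; the Rich console stays at INFO.
set_logger("cache/graphgen.log", file_level=logging.DEBUG, console_level=logging.INFO)
```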