Spaces:

chenzihong
/

GraphGen

Sleeping

GraphGen / graphgen /models /kg_builder /mm_kg_builder.py

github-actions[bot]

Auto-sync from demo at Thu Oct 23 12:37:24 UTC 2025

8c66169 about 2 months ago

3.5 kB

	import re
	from collections import defaultdict
	from typing import Dict, List, Tuple

	from graphgen.bases import Chunk
	from graphgen.templates import MMKG_EXTRACTION_PROMPT
	from graphgen.utils import (
	detect_main_language,
	handle_single_entity_extraction,
	handle_single_relationship_extraction,
	logger,
	split_string_by_multi_markers,
	)

	from .light_rag_kg_builder import LightRAGKGBuilder


	class MMKGBuilder(LightRAGKGBuilder):
	    """Knowledge-graph builder for multi-modal chunks.

	    Only ``image`` chunks are extracted at the moment; every other chunk
	    type is logged as unsupported and yields an empty graph.
	    """

	    async def extract(
	        self, chunk: Chunk
	    ) -> Tuple[Dict[str, List[dict]], Dict[Tuple[str, str], List[dict]]]:
	        """
	        Extract entities and relationships from a single multi-modal chunk using the LLM client.
	        Expect to get a mini graph which contains a central multi-modal entity
	        and its related text entities and relationships.
	        Like:
	        (image: "image_of_eiffel_tower") --[located_in]--> (text: "Paris")
	        (image: "image_of_eiffel_tower") --[built_in]--> (text: "1889")
	        (text: "Eiffel Tower") --[height]--> (text: "324 meters")

	        :param chunk: the chunk to process; its ``id``, ``type`` and
	            ``metadata`` attributes are read. For image chunks the
	            ``image_caption`` metadata entry supplies the text sent to the LLM.
	        :return: a ``(nodes, edges)`` tuple of plain dicts. ``nodes`` maps an
	            entity name to the list of entity records extracted under that
	            name; ``edges`` maps a ``(src_id, tgt_id)`` pair to the list of
	            relationship records between them. Both are empty when the chunk
	            type is not supported.
	        """
	        chunk_id = chunk.id
	        chunk_type = chunk.type  # image | table | formula | ...

	        # Guard clause: anything that is not an image is currently unsupported.
	        # TODO: implement table-based entity and relationship extraction
	        # TODO: implement formula-based entity and relationship extraction
	        if chunk_type != "image":
	            logger.error("Unsupported chunk type for MMKGBuilder: %s", chunk_type)
	            # Plain dicts, consistent with the image path (no implicit key
	            # creation on lookup as a defaultdict would do).
	            return {}, {}

	        metadata = chunk.metadata or {}

	        # The caption is normally a list of lines. A bare string must be
	        # wrapped first, otherwise str.join would iterate over its characters
	        # and put a newline between every one of them.
	        caption_parts = metadata.get("image_caption") or []
	        if isinstance(caption_parts, str):
	            caption_parts = [caption_parts]
	        image_caption = "\n".join(caption_parts)

	        language = detect_main_language(image_caption)
	        prompt_format = MMKG_EXTRACTION_PROMPT["FORMAT"]
	        prompt = MMKG_EXTRACTION_PROMPT[language].format(
	            **prompt_format,
	            chunk_type=chunk_type,
	            chunk_id=chunk_id,
	            chunk_text=image_caption,
	        )
	        result = await self.llm_client.generate_answer(prompt)
	        logger.debug("Image chunk extraction result: %s", result)

	        # Split the raw LLM answer into individual records.
	        records = split_string_by_multi_markers(
	            result,
	            [
	                prompt_format["record_delimiter"],
	                prompt_format["completion_delimiter"],
	            ],
	        )
	        # Loop-invariant, so looked up once.
	        tuple_delimiter = prompt_format["tuple_delimiter"]

	        nodes = defaultdict(list)
	        edges = defaultdict(list)

	        for record in records:
	            # Keep only the text between the outermost parentheses.
	            # NOTE(review): "." does not match newlines here, so a record whose
	            # parenthesised part spans several lines is skipped; left unchanged.
	            match = re.search(r"\((.*)\)", record)
	            if not match:
	                continue

	            attributes = split_string_by_multi_markers(
	                match.group(1), [tuple_delimiter]
	            )

	            # Try the record as an entity first; fall back to a relationship.
	            entity = await handle_single_entity_extraction(attributes, chunk_id)
	            if entity is not None:
	                nodes[entity["entity_name"]].append(entity)
	                continue

	            relation = await handle_single_relationship_extraction(
	                attributes, chunk_id
	            )
	            if relation is not None:
	                edges[(relation["src_id"], relation["tgt_id"])].append(relation)

	        return dict(nodes), dict(edges)