hmnshudhmn24
/

docintel-ai-extractor

Document Question Answering

Model card Files Files and versions

docintel-ai-extractor / summarize_doc.py

hmnshudhmn24's picture

Upload 14 files

1108401 verified about 1 month ago

history blame contribute delete

1.59 kB

	"""Summarization using DONUT (naver-clova-ix/donut-base) via Hugging Face.
	This module uses Donut's processor and VisionEncoderDecoderModel for docVQA-style prompts.
	"""
	from transformers import DonutProcessor, VisionEncoderDecoderModel
	from PIL import Image
	from utils import load_config

	# Single-slot cache: the most recently loaded processor/model pair and the
	# checkpoint name they were loaded from.
	_processor = None
	_model = None
	_loaded_model_name = None

	def _init(model_name):
	    """Return a cached ``(processor, model)`` pair for *model_name*.

	    The pair is loaded on first use and kept at module level. Passing a
	    different checkpoint name than the one currently cached reloads both
	    objects instead of silently returning the stale pair. ``None`` means
	    "whatever is already loaded".

	    Args:
	        model_name: Checkpoint identifier forwarded to ``from_pretrained``.

	    Returns:
	        Tuple of ``(DonutProcessor, VisionEncoderDecoderModel)``.

	    Raises:
	        ValueError: If nothing is cached yet and no model name was given.
	    """
	    global _processor, _model, _loaded_model_name

	    cache_ready = _processor is not None and _model is not None
	    if cache_ready and (model_name is None or model_name == _loaded_model_name):
	        return _processor, _model

	    if not model_name:
	        raise ValueError(
	            "No model name given and no model is cached; pass model_name "
	            "or set model.name in the config."
	        )

	    # Load both into locals first so a failure while loading the model cannot
	    # leave the cache holding a processor and model from different checkpoints.
	    processor = DonutProcessor.from_pretrained(model_name)
	    model = VisionEncoderDecoderModel.from_pretrained(model_name)
	    _processor, _model, _loaded_model_name = processor, model, model_name
	    return _processor, _model

	def summarize_image(image_path, model_name=None, max_length=250):
	    """Generate a text summary of a document image with a Donut model.

	    Args:
	        image_path: Path (or file object) passed to ``PIL.Image.open``.
	        model_name: Checkpoint to use. When omitted, ``model.name`` from
	            ``load_config()`` is used.
	        max_length: Maximum number of newly generated tokens.

	    Returns:
	        The generated text with special tokens removed. The prompt itself is
	        not included in the result.
	    """
	    # Only read the config when the caller did not pick a checkpoint.
	    if not model_name:
	        cfg = load_config() or {}
	        model_name = (cfg.get('model') or {}).get('name')
	    processor, model = _init(model_name)

	    # Context manager closes the underlying file handle deterministically;
	    # convert() returns an independent in-memory copy.
	    with Image.open(image_path) as source:
	        image = source.convert('RGB')

	    # NOTE(review): the <s_docvqa> prompt tokens presumably only carry meaning
	    # for a DocVQA fine-tuned checkpoint - confirm the configured model.
	    task_prompt = '<s_docvqa><s_question>Summarize the document:</s_question><s_answer>'

	    # Donut takes the task prompt as decoder input ids; passing it as the
	    # processor's text argument would not prime the decoder with it.
	    pixel_values = processor(image, return_tensors='pt').pixel_values
	    decoder_input_ids = processor.tokenizer(
	        task_prompt, add_special_tokens=False, return_tensors='pt'
	    ).input_ids

	    output = model.generate(
	        pixel_values,
	        decoder_input_ids=decoder_input_ids,
	        max_new_tokens=max_length,
	    )

	    # The generated sequence starts with the prompt; keep only what follows so
	    # the result does not echo the question.
	    generated = output[:, decoder_input_ids.shape[1]:]
	    return processor.batch_decode(generated, skip_special_tokens=True)[0]

	def summarize_text(text, chunk_size=1000, model_name=None):
	    """Return a simple extractive summary of *text*.

	    The summary is the first five non-blank lines of *text*, each stripped of
	    surrounding whitespace and joined with single spaces. An input with no
	    non-blank lines yields an empty string.

	    ``chunk_size`` and ``model_name`` are accepted but not used; no model is
	    run for text input.
	    """
	    kept = []
	    for raw_line in text.split('\n'):
	        stripped = raw_line.strip()
	        if not stripped:
	            continue
	        kept.append(stripped)
	        # Only the first five non-blank lines contribute to the summary.
	        if len(kept) == 5:
	            break
	    return ' '.join(kept)