Spaces:

RameshJ
/

Invoices_to_Table

Running

App Files Files Community

Invoices_to_Table / ocr_llm_utils.py

RameshJ

Update ocr_llm_utils.py

44081bf verified 6 months ago

raw

history blame

2.51 kB

	import os
	import io
	import json
	from google.cloud import vision
	from dotenv import load_dotenv
	from groq import Groq

	load_dotenv()

	# Google client libraries locate credentials through the file named by
	# GOOGLE_APPLICATION_CREDENTIALS, so the JSON string supplied in the GCV_JSON
	# environment variable is written to disk and the variable pointed at it.
	gcv_json_str = os.environ.get("GCV_JSON")
	if gcv_json_str:
	    temp_path = "/tmp/gcv_temp.json"
	    # The file holds a service-account secret and /tmp is a shared directory:
	    # create it readable/writable by the owner only instead of relying on the
	    # process umask (which commonly yields a world-readable file).
	    fd = os.open(temp_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
	    with os.fdopen(fd, "w", encoding="utf-8") as f:
	        # The mode given to os.open only applies when the file is created;
	        # tighten a file left over from an earlier run before writing to it.
	        os.chmod(temp_path, 0o600)
	        f.write(gcv_json_str)
	    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp_path

	# Module-level Groq client; the API key is read from GROQ_API_KEY.
	client = Groq(api_key=os.getenv("GROQ_API_KEY"))

	def run_ocr_with_gcv(image_path):
	    """Run Google Cloud Vision document text detection on an image file.

	    Args:
	        image_path: Path of the image file whose bytes are sent to the API.

	    Returns:
	        The ``full_text_annotation.text`` value of the API response.

	    Raises:
	        RuntimeError: If the response carries a non-empty error message.
	    """
	    client_vision = vision.ImageAnnotatorClient()
	    with open(image_path, "rb") as image_file:
	        content = image_file.read()

	    image = vision.Image(content=content)
	    response = client_vision.document_text_detection(image=image)

	    # The Vision API reports per-image failures inside the response object
	    # rather than by raising; without this check a failed request would be
	    # indistinguishable from an image that simply contains no text.
	    if response.error.message:
	        raise RuntimeError(
	            f"Google Cloud Vision OCR failed: {response.error.message}"
	        )
	    return response.full_text_annotation.text

	def extract_table_from_text(text):
	    """Ask the Groq chat model to turn invoice text into a Markdown table.

	    The invoice text is embedded in a fixed instruction prompt and sent, with
	    a system message, through the module-level ``client``.

	    Args:
	        text: Invoice text to embed in the prompt.

	    Returns:
	        The message content of the first choice in the completion response.
	    """
	    user_prompt = f"""
	Extract a structured table of items from the invoice text below.
	- First findout what are the table column names
	- The table should include all items under column names.
	-
	If some values are missing, fill as "N/A".

	Output the table in Markdown format. Only return the table.

	Invoice Text:
	\"\"\"
	{text}
	\"\"\"
	"""
	    chat_messages = [
	        {"role": "system", "content": "You are a professional invoice data extractor."},
	        {"role": "user", "content": user_prompt},
	    ]
	    completion = client.chat.completions.create(
	        model="meta-llama/llama-4-scout-17b-16e-instruct",
	        messages=chat_messages,
	        temperature=1,
	        max_completion_tokens=4096,
	        top_p=1,
	    )
	    first_choice = completion.choices[0]
	    return first_choice.message.content


	import pandas as pd
	from io import StringIO


	def extract_markdown_table(output_text):
	    """Convert the Markdown table rows in *output_text* into a DataFrame.

	    Every line containing at least two ``|`` characters is treated as a table
	    row; all other lines (surrounding prose, blank lines) are ignored. The
	    first such line is used as the header.

	    Args:
	        output_text: Text expected to contain a pipe-delimited Markdown table.

	    Returns:
	        A ``pandas.DataFrame`` with whitespace-stripped column names and
	        string cells, and with all-empty columns removed.

	    Raises:
	        ValueError: If fewer than two table lines are found.
	    """
	    if not output_text:
	        raise ValueError("❌ No markdown table found in output.")

	    # Step 1: keep only lines that look like table rows. The separator must be
	    # the plain '|' character here; an escaped form would never match a row.
	    table_lines = [
	        line for line in output_text.strip().splitlines() if line.count("|") > 1
	    ]
	    if len(table_lines) < 2:
	        raise ValueError("❌ No markdown table found in output.")

	    # Step 2: drop the header separator (e.g. |---|:--:|) when present. It is
	    # recognised as a line made up solely of pipes, dashes, colons and
	    # whitespace, so a data row that merely contains dashes is kept.
	    separator = table_lines[1]
	    if "-" in separator and not separator.strip("|-: \t"):
	        table_lines = [table_lines[0]] + table_lines[2:]

	    # Step 3: parse as pipe-separated values. The separator is a regex (hence
	    # the escaped pipe and the python engine).
	    cleaned_md = "\n".join(table_lines)
	    df = pd.read_csv(StringIO(cleaned_md), sep=r"\|", engine="python")
	    # The leading and trailing pipes of each row yield empty edge columns.
	    df = df.dropna(axis=1, how="all")
	    df.columns = [col.strip() for col in df.columns]
	    # Column-wise Series.map is used instead of the deprecated
	    # DataFrame.applymap.
	    df = df.apply(
	        lambda column: column.map(
	            lambda cell: cell.strip() if isinstance(cell, str) else cell
	        )
	    )

	    return df