Invoices_to_Table / ocr_llm_utils.py
RameshJ's picture
Update ocr_llm_utils.py
44081bf verified
raw
history blame
2.51 kB
import os
import io
import json
from google.cloud import vision
from dotenv import load_dotenv
from groq import Groq
load_dotenv()

# The service-account JSON is supplied as a string in the GCV_JSON environment
# variable. It is written to a file whose path is then exported through
# GOOGLE_APPLICATION_CREDENTIALS for the Google client libraries to read.
gcv_json_str = os.environ.get("GCV_JSON")
if gcv_json_str:
    temp_path = "/tmp/gcv_temp.json"
    # Create the file readable/writable by the owner only (0o600): it holds a
    # secret and /tmp is shared. The mode only takes effect when the file is
    # newly created; an existing file keeps its current permissions.
    fd = os.open(temp_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w", encoding="utf-8") as f:
        f.write(gcv_json_str)
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp_path

# Shared Groq client used by extract_table_from_text().
client = Groq(api_key=os.getenv("GROQ_API_KEY"))
def run_ocr_with_gcv(image_path):
    """Run Google Cloud Vision document text detection on an image file.

    Args:
        image_path: Path of the image file to read and send to the API.

    Returns:
        The ``full_text_annotation.text`` string of the Vision response.

    Raises:
        RuntimeError: If the response carries an error message for the image.
    """
    client_vision = vision.ImageAnnotatorClient()
    with open(image_path, 'rb') as image_file:
        content = image_file.read()

    image = vision.Image(content=content)
    response = client_vision.document_text_detection(image=image)

    # A per-image failure is reported inside the response rather than raised;
    # surface it instead of silently returning empty text.
    if response.error.message:
        raise RuntimeError(
            f"Google Cloud Vision OCR failed: {response.error.message}"
        )
    return response.full_text_annotation.text
def extract_table_from_text(text):
    """Ask the Groq chat model to turn invoice text into a Markdown table.

    Args:
        text: Invoice text that is interpolated into the user prompt.

    Returns:
        The message content of the first choice in the completion response.
    """
    user_prompt = f"""
Extract a structured table of items from the invoice text below.
- First findout what are the table column names
- The table should include all items under column names.
-
If some values are missing, fill as "N/A".
Output the table in Markdown format. Only return the table.
Invoice Text:
\"\"\"
{text}
\"\"\"
"""
    # System message sets the role; user message carries the instructions
    # together with the invoice text.
    chat_messages = [
        {"role": "system", "content": "You are a professional invoice data extractor."},
        {"role": "user", "content": user_prompt},
    ]
    completion = client.chat.completions.create(
        model="meta-llama/llama-4-scout-17b-16e-instruct",
        messages=chat_messages,
        temperature=1,
        max_completion_tokens=4096,
        top_p=1,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
import pandas as pd
from io import StringIO
def _is_markdown_separator_row(line):
    """Return True if *line* is a Markdown header separator like ``|:--|---:|``."""
    stripped = line.strip()
    # Only pipes, dashes, alignment colons and whitespace, with at least one dash.
    return '-' in stripped and set(stripped) <= set('|-: \t')


def _strip_if_str(value):
    """Strip surrounding whitespace from strings; return other values unchanged."""
    return value.strip() if isinstance(value, str) else value


def extract_markdown_table(output_text):
    """Parse the first Markdown-style table found in *output_text* into a DataFrame.

    Every line containing at least two ``|`` characters is treated as a table
    row; the first such line is the header. Other lines are ignored.

    Args:
        output_text: Text that is expected to contain a pipe-delimited table.
            ``None`` or an empty string is treated as containing no table.

    Returns:
        A ``pandas.DataFrame`` with whitespace-stripped column names and string
        cells. Columns that are entirely empty are removed.

    Raises:
        ValueError: If fewer than two table-like lines are found.
    """
    # Step 1: keep only the lines that look like table rows.
    lines = (output_text or "").strip().split('\n')
    table_lines = [line for line in lines if line.count('|') > 1]
    if len(table_lines) < 2:
        raise ValueError("❌ No markdown table found in output.")

    # Step 2: drop the header separator row if present. Matching on the row's
    # character set (rather than a '---' substring) accepts short/aligned
    # separators such as |:--|--:| and avoids dropping a data row that merely
    # contains '---'.
    if _is_markdown_separator_row(table_lines[1]):
        table_lines = [table_lines[0]] + table_lines[2:]

    # Step 3: parse the remaining rows as '|'-separated values.
    cleaned_md = "\n".join(table_lines)
    df = pd.read_csv(StringIO(cleaned_md), sep='|', engine='python')
    # Leading/trailing pipes produce empty columns; remove them.
    df = df.dropna(axis=1, how='all')
    df.columns = [col.strip() for col in df.columns]
    # Per-column Series.map replaces the deprecated DataFrame.applymap.
    return df.apply(lambda column: column.map(_strip_if_str))