"""Invoice helpers: OCR via Google Cloud Vision, table extraction via Groq, Markdown parsing."""

import io
import json
import os
from io import StringIO

import pandas as pd
from dotenv import load_dotenv
from google.cloud import vision
from groq import Groq

load_dotenv()

# Load credentials from env variable.
# The Google client reads its key from the file named by
# GOOGLE_APPLICATION_CREDENTIALS, so the JSON string held in GCV_JSON is
# written to a temporary file first.
gcv_json_str = os.environ.get("GCV_JSON")
if gcv_json_str:
    temp_path = "/tmp/gcv_temp.json"
    # The file contains a service-account secret: request owner-only
    # permissions (0o600) when it is created instead of the umask default.
    # NOTE: the mode argument only applies when the file does not exist yet.
    _credentials_fd = os.open(
        temp_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600
    )
    with os.fdopen(_credentials_fd, "w") as f:
        f.write(gcv_json_str)
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp_path

client = Groq(api_key=os.getenv("GROQ_API_KEY"))


def run_ocr_with_gcv(image_path):
    """Run Google Cloud Vision document text detection on an image file.

    Args:
        image_path: Path of the image file; it is read in binary mode.

    Returns:
        The ``full_text_annotation.text`` field of the Vision response.

    Raises:
        RuntimeError: If the response carries a non-empty ``error.message``.
    """
    client_vision = vision.ImageAnnotatorClient()
    with open(image_path, "rb") as image_file:
        content = image_file.read()

    image = vision.Image(content=content)
    response = client_vision.document_text_detection(image=image)

    # The API reports per-request failures inside the response object; without
    # this check a failed request would be returned as empty text.
    if response.error.message:
        raise RuntimeError(
            f"Google Cloud Vision OCR failed: {response.error.message}"
        )
    return response.full_text_annotation.text


def extract_table_from_text(
    text,
    max_tokens=4096,
    model="meta-llama/llama-4-scout-17b-16e-instruct",
):
    """Ask the Groq chat model to turn invoice text into a Markdown table.

    Args:
        text: Invoice text to embed in the prompt.
        max_tokens: Forwarded to the API as ``max_completion_tokens``.
        model: Model identifier forwarded to the API.

    Returns:
        The ``message.content`` of the first choice in the completion.
    """
    prompt = f"""
Extract a structured table of items from the invoice text below.
- First findout what are the table column names
- The table should include all items under column names.
- If some values are missing, fill as "N/A".
Output the table in Markdown format. Only return the table.

Invoice Text:
\"\"\"
{text}
\"\"\"
"""
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You are a professional invoice data extractor.",
            },
            {"role": "user", "content": prompt},
        ],
        temperature=1,
        max_completion_tokens=max_tokens,
        top_p=1,
    )
    return response.choices[0].message.content


def _is_separator_row(line):
    """Return True if *line* is a Markdown header separator like ``|---|:-:|``."""
    cells = [cell.strip() for cell in line.strip().strip("|").split("|")]
    cells = [cell for cell in cells if cell]
    # Every non-empty cell must consist only of dashes and alignment colons,
    # and contain at least one dash.
    return bool(cells) and all(
        "-" in cell and set(cell) <= {"-", ":"} for cell in cells
    )


def extract_markdown_table(output_text):
    """Parse the Markdown table contained in *output_text* into a DataFrame.

    Lines containing more than one ``|`` are treated as table rows; all other
    lines are ignored. The rows are parsed by ``pandas.read_csv`` with ``|``
    as the delimiter, columns that are entirely empty are dropped, and column
    names and string cells are stripped of surrounding whitespace.

    Args:
        output_text: Text that contains a Markdown table.

    Returns:
        A ``pandas.DataFrame`` built from the table rows.

    Raises:
        ValueError: If fewer than two table lines are found.
    """
    # Step 1: Keep only the lines that look like table rows.
    lines = output_text.strip().split("\n")
    table_lines = [line for line in lines if line.count("|") > 1]

    if len(table_lines) < 2:
        raise ValueError("❌ No markdown table found in output.")

    # Step 2: Remove the markdown header separator if present. The check is
    # structural so that a data row which merely contains '---' in one cell
    # is not discarded, and separators such as '|:-:|' are recognised.
    if _is_separator_row(table_lines[1]):
        table_lines = [table_lines[0]] + table_lines[2:]

    # Step 3: Parse the remaining rows as '|'-delimited CSV and clean up.
    cleaned_md = "\n".join(table_lines)
    df = pd.read_csv(StringIO(cleaned_md), sep="|", engine="python")
    df = df.dropna(axis=1, how="all")  # remove empty columns
    df.columns = [col.strip() for col in df.columns]

    # DataFrame.applymap is deprecated in favour of DataFrame.map on newer
    # pandas. Look the method up on the class so that a column that happens
    # to be named "map" cannot shadow it on older versions.
    elementwise = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap
    df = elementwise(df, lambda x: x.strip() if isinstance(x, str) else x)
    return df