Spaces:
Running
Running
| import os | |
| import io | |
| import json | |
| from google.cloud import vision | |
| from dotenv import load_dotenv | |
| from groq import Groq | |
load_dotenv()

# Google client libraries read credentials from the file named by
# GOOGLE_APPLICATION_CREDENTIALS, so the JSON held in the GCV_JSON
# environment variable is materialised to a file and that variable is
# pointed at it.
gcv_json_str = os.environ.get("GCV_JSON")
if gcv_json_str:
    temp_path = "/tmp/gcv_temp.json"
    # Create the file owner-read/write only: it holds a service-account
    # secret and /tmp is shared with other local users.
    fd = os.open(temp_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w") as f:
        # The mode given to os.open only applies when the file is created;
        # tighten a pre-existing file before the secret is written to it.
        os.chmod(temp_path, 0o600)
        f.write(gcv_json_str)
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp_path

# Shared Groq client used by extract_table_from_text.
client = Groq(api_key=os.getenv("GROQ_API_KEY"))
def run_ocr_with_gcv(image_path):
    """Run Google Cloud Vision document text detection on an image file.

    Args:
        image_path: Path of the image file whose bytes are sent to the API.

    Returns:
        The ``full_text_annotation.text`` string of the API response.

    Raises:
        RuntimeError: If the response carries an error message. Without this
            check a failed request would be returned as empty text.
    """
    client_vision = vision.ImageAnnotatorClient()
    with open(image_path, "rb") as image_file:
        content = image_file.read()

    image = vision.Image(content=content)
    response = client_vision.document_text_detection(image=image)

    # The API reports per-request failures inside the response object rather
    # than by raising, so surface them explicitly.
    if response.error.message:
        raise RuntimeError(
            f"Cloud Vision OCR failed for {image_path!r}: {response.error.message}"
        )
    return response.full_text_annotation.text
def extract_table_from_text(text):
    """Ask the Groq chat model to extract an invoice's item table as Markdown.

    Args:
        text: Raw invoice text that is embedded verbatim in the prompt.

    Returns:
        The content of the first completion choice, or an empty string when
        the model returned no content.
    """
    prompt = f"""
Extract a structured table of items from the invoice text below.
- First find out what the table column names are.
- The table should include all items under column names.
- If some values are missing, fill as "N/A".
Output the table in Markdown format. Only return the table.
Invoice Text:
\"\"\"
{text}
\"\"\"
"""
    # NOTE(review): temperature=1 makes the extraction non-deterministic;
    # left unchanged here, but a lower value may suit data extraction better.
    response = client.chat.completions.create(
        model="meta-llama/llama-4-scout-17b-16e-instruct",
        messages=[
            {"role": "system", "content": "You are a professional invoice data extractor."},
            {"role": "user", "content": prompt},
        ],
        temperature=1,
        max_completion_tokens=4096,
        top_p=1,
    )
    # The message content can be None; return "" so string-processing callers
    # do not fail with AttributeError.
    return response.choices[0].message.content or ""
| import pandas as pd | |
| from io import StringIO | |
def extract_markdown_table(output_text):
    """Parse the Markdown table contained in ``output_text`` into a DataFrame.

    Every line with at least two ``|`` characters is treated as a table row;
    any other text is ignored. The first such line is the header. If the
    second one is a header separator (made only of ``|``, ``-``, ``:`` and
    whitespace) it is discarded. The remaining rows are parsed by
    ``pandas.read_csv`` with ``|`` as the delimiter, so column dtypes are
    whatever pandas infers.

    Args:
        output_text: Text that contains a Markdown table, e.g. an LLM reply.

    Returns:
        A DataFrame with whitespace-stripped column names and string cells.
        Columns that are entirely empty are dropped.

    Raises:
        ValueError: If fewer than two table-like lines are found.
    """
    # Strip each line so indentation or a trailing "\r" cannot turn into a
    # spurious leading/trailing column.
    table_lines = [
        stripped
        for stripped in (line.strip() for line in output_text.strip().splitlines())
        if stripped.count("|") > 1
    ]
    if len(table_lines) < 2:
        raise ValueError("❌ No markdown table found in output.")

    # Drop the header separator row (e.g. "|---|---|" or "|:--|--:|").
    # Requiring the row to consist solely of separator characters keeps a
    # data row that merely contains dashes from being discarded.
    separator = table_lines[1]
    if "-" in separator and set(separator) <= set("|:- \t"):
        del table_lines[1]

    cleaned_md = "\n".join(table_lines)
    df = pd.read_csv(StringIO(cleaned_md), sep="|", engine="python")
    # The outer pipes of each row yield empty first/last columns; remove them.
    df = df.dropna(axis=1, how="all")
    df.columns = [col.strip() for col in df.columns]

    def _strip_cell(value):
        return value.strip() if isinstance(value, str) else value

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map. Check the class, not the instance, so a column that
    # happens to be named "map" cannot be mistaken for the method.
    if hasattr(pd.DataFrame, "map"):
        return df.map(_strip_cell)
    return df.applymap(_strip_cell)