# Hugging Face Spaces page capture (Space status at capture time: Sleeping).
import shutil

import fitz  # PyMuPDF for PDF processing
import gradio as gr
import pytesseract
from PIL import Image
# Point pytesseract at the Tesseract executable. Prefer whatever is on PATH so
# the script also works where the binary is installed elsewhere; fall back to
# the default location on Linux-based HF Spaces when PATH lookup finds nothing.
# Marathi OCR additionally needs the Marathi language data:
#   sudo apt-get install tesseract-ocr-mar
pytesseract.pytesseract.tesseract_cmd = (
    shutil.which("tesseract") or "/usr/bin/tesseract"
)
def extract_images_from_pdf(pdf_path):
    """Render every page of a PDF to a PIL image using PyMuPDF.

    Despite the name, this does not pull embedded images out of the PDF:
    each page is rasterized as a whole at 300 DPI so it can be fed to OCR.

    Args:
        pdf_path: Path of the document, passed straight to ``fitz.open``.

    Returns:
        A list of RGB ``PIL.Image.Image`` objects, one per page, in page
        order.
    """
    images = []
    # The context manager closes the document even if rendering fails
    # part-way through, so the file handle is never leaked.
    with fitz.open(pdf_path) as document:
        for page in document:
            pix = page.get_pixmap(dpi=300)  # Render the page at 300 DPI
            # frombytes copies the pixel buffer, so the images remain valid
            # after the document has been closed.
            images.append(
                Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            )
    return images
def perform_ocr_on_images(images, lang="mar"):
    """Run Tesseract OCR on each image and return the combined text.

    Args:
        images: Iterable of images accepted by
            ``pytesseract.image_to_string`` (here: PIL images, one per page).
        lang: Tesseract language code passed to ``image_to_string``.
            Defaults to ``"mar"`` (Marathi); the matching language data must
            be installed for Tesseract.

    Returns:
        The recognized text of all images joined with newlines, in input
        order. An empty input yields an empty string.
    """
    return "\n".join(
        pytesseract.image_to_string(img, lang=lang) for img in images
    )
def ocr_marathi_from_pdf(pdf_file_path):
    """Extract Marathi text from a PDF by rendering its pages and running OCR.

    Args:
        pdf_file_path: Filesystem path of the uploaded PDF. ``None`` or an
            empty path (presumably what the UI passes when nothing was
            uploaded) is treated as "no document".

    Returns:
        The OCR text of all pages as one string, or ``""`` when no file
        path was supplied.
    """
    # Guard explicitly instead of relying on how PyMuPDF reacts to a missing
    # filename.
    if not pdf_file_path:
        return ""
    page_images = extract_images_from_pdf(pdf_file_path)
    return perform_ocr_on_images(page_images)
# Build the Gradio UI: one PDF upload in, one text box out.
# type="filepath" hands the handler a path string for the uploaded file,
# which is what ocr_marathi_from_pdf expects.
pdf_upload = gr.File(type="filepath", label="Upload Marathi PDF")
ocr_output = gr.Textbox(label="Extracted Marathi Text")

interface = gr.Interface(
    fn=ocr_marathi_from_pdf,
    inputs=pdf_upload,
    outputs=ocr_output,
    title="Marathi PDF OCR",
    description="Upload a PDF containing Marathi text. The app will extract the text using OCR.",
)

# Start the web app only when run as a script, not when imported.
if __name__ == "__main__":
    interface.launch()