Spaces:

mocktestgen
/

HritwikrudraGupta

Runtime error

App Files Files Community

mocktestgen committed on May 6

Commit

cf6fac4

verified ·

1 Parent(s): facb671

Update app.py

Browse files

Files changed (1) hide show

app.py +5 -18

app.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import gradio as gr
 import pdfplumber
 from PIL import Image
-import pytesseract
 import io
 import re
 import random
@@ -23,28 +22,14 @@ def extract_text_from_pdf(file_bytes):
                 page_text = page.extract_text()
                 if page_text:
                     text += page_text + "\n"
-        # If extracted text is empty, fallback to OCR per page
-        if not text.strip():
-            text = ocr_pdf(file_bytes)
         return text
     except Exception as e:
         return ""
-def ocr_pdf(file_bytes):
-    text = ""
-    with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
-        for page in pdf.pages:
-            # Convert page to image
-            pil_image = page.to_image(resolution=300).original
-            # OCR
-            page_text = pytesseract.image_to_string(pil_image)
-            text += page_text + "\n"
-    return text
 def extract_text_from_image(file_bytes):
-    image = Image.open(io.BytesIO(file_bytes))
-    text = pytesseract.image_to_string(image)
-    return text
 def extract_text_from_txt(file_bytes):
     try:
@@ -216,6 +201,7 @@ def main_process(file, question_type, num_questions):
     if fname.endswith(".pdf"):
         extracted_text = extract_text_from_pdf(file_bytes)
     elif fname.endswith((".png", ".jpg", ".jpeg", ".bmp", ".tiff")):
         extracted_text = extract_text_from_image(file_bytes)
     elif fname.endswith(".txt"):
         extracted_text = extract_text_from_txt(file_bytes)
@@ -285,3 +271,4 @@ with gr.Blocks(css="""
 if __name__ == "__main__":
     demo.launch()

 import gradio as gr
 import pdfplumber
 from PIL import Image
 import io
 import re
 import random
                 page_text = page.extract_text()
                 if page_text:
                     text += page_text + "\n"
+        # Do not fall back to OCR: pytesseract depends on a system-level Tesseract installation
         return text
     except Exception as e:
         return ""
 def extract_text_from_image(file_bytes):
+    # OCR disabled due to system dependencies on Tesseract
+    return "OCR not supported in this environment. Please upload a PDF or TXT file containing selectable text."
 def extract_text_from_txt(file_bytes):
     try:
     if fname.endswith(".pdf"):
         extracted_text = extract_text_from_pdf(file_bytes)
     elif fname.endswith((".png", ".jpg", ".jpeg", ".bmp", ".tiff")):
+        # OCR is unsupported here; this call only returns an explanatory message
         extracted_text = extract_text_from_image(file_bytes)
     elif fname.endswith(".txt"):
         extracted_text = extract_text_from_txt(file_bytes)
 if __name__ == "__main__":
     demo.launch()