Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import pdfplumber
|
| 3 |
from PIL import Image
|
| 4 |
-
import pytesseract
|
| 5 |
import io
|
| 6 |
import re
|
| 7 |
import random
|
|
@@ -23,28 +22,14 @@ def extract_text_from_pdf(file_bytes):
|
|
| 23 |
page_text = page.extract_text()
|
| 24 |
if page_text:
|
| 25 |
text += page_text + "\n"
|
| 26 |
-
#
|
| 27 |
-
if not text.strip():
|
| 28 |
-
text = ocr_pdf(file_bytes)
|
| 29 |
return text
|
| 30 |
except Exception as e:
|
| 31 |
return ""
|
| 32 |
|
| 33 |
-
def ocr_pdf(file_bytes):
|
| 34 |
-
text = ""
|
| 35 |
-
with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
|
| 36 |
-
for page in pdf.pages:
|
| 37 |
-
# Convert page to image
|
| 38 |
-
pil_image = page.to_image(resolution=300).original
|
| 39 |
-
# OCR
|
| 40 |
-
page_text = pytesseract.image_to_string(pil_image)
|
| 41 |
-
text += page_text + "\n"
|
| 42 |
-
return text
|
| 43 |
-
|
| 44 |
def extract_text_from_image(file_bytes):
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
return text
|
| 48 |
|
| 49 |
def extract_text_from_txt(file_bytes):
|
| 50 |
try:
|
|
@@ -216,6 +201,7 @@ def main_process(file, question_type, num_questions):
|
|
| 216 |
if fname.endswith(".pdf"):
|
| 217 |
extracted_text = extract_text_from_pdf(file_bytes)
|
| 218 |
elif fname.endswith((".png", ".jpg", ".jpeg", ".bmp", ".tiff")):
|
|
|
|
| 219 |
extracted_text = extract_text_from_image(file_bytes)
|
| 220 |
elif fname.endswith(".txt"):
|
| 221 |
extracted_text = extract_text_from_txt(file_bytes)
|
|
@@ -285,3 +271,4 @@ with gr.Blocks(css="""
|
|
| 285 |
|
| 286 |
if __name__ == "__main__":
|
| 287 |
demo.launch()
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import pdfplumber
|
| 3 |
from PIL import Image
|
|
|
|
| 4 |
import io
|
| 5 |
import re
|
| 6 |
import random
|
|
|
|
| 22 |
page_text = page.extract_text()
|
| 23 |
if page_text:
|
| 24 |
text += page_text + "\n"
|
| 25 |
+
# Do not fall back to OCR: pytesseract requires the Tesseract binary to be installed on the system
|
|
|
|
|
|
|
| 26 |
return text
|
| 27 |
except Exception as e:
|
| 28 |
return ""
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
def extract_text_from_image(file_bytes):
|
| 31 |
+
# OCR disabled due to system dependencies on Tesseract
|
| 32 |
+
return "OCR not supported in this environment. Please upload a PDF or TXT file containing selectable text."
|
|
|
|
| 33 |
|
| 34 |
def extract_text_from_txt(file_bytes):
|
| 35 |
try:
|
|
|
|
| 201 |
if fname.endswith(".pdf"):
|
| 202 |
extracted_text = extract_text_from_pdf(file_bytes)
|
| 203 |
elif fname.endswith((".png", ".jpg", ".jpeg", ".bmp", ".tiff")):
|
| 204 |
+
# OCR is unsupported here; extract_text_from_image returns an explanatory message instead of extracted text
|
| 205 |
extracted_text = extract_text_from_image(file_bytes)
|
| 206 |
elif fname.endswith(".txt"):
|
| 207 |
extracted_text = extract_text_from_txt(file_bytes)
|
|
|
|
| 271 |
|
| 272 |
if __name__ == "__main__":
|
| 273 |
demo.launch()
|
| 274 |
+
|