Yaz Hobooti
committed on
Commit
·
2a98843
1
Parent(s):
0f0c5dd
Fix poppler PDF processing: add comprehensive packages and multiple path attempts
Browse files
- apt.txt +3 -1
- pdf_comparator.py +17 -12
apt.txt
CHANGED
|
@@ -1,4 +1,6 @@
|
|
| 1 |
poppler-utils
|
|
|
|
|
|
|
|
|
|
| 2 |
tesseract-ocr
|
| 3 |
libzbar0
|
| 4 |
-
libpoppler-cpp-dev
|
|
|
|
| 1 |
poppler-utils
|
| 2 |
+
poppler-data
|
| 3 |
+
libpoppler-dev
|
| 4 |
+
libpoppler-cpp-dev
|
| 5 |
tesseract-ocr
|
| 6 |
libzbar0
|
|
|
pdf_comparator.py
CHANGED
|
@@ -49,21 +49,26 @@ def _is_pdf(path: str) -> bool:
|
|
| 49 |
|
| 50 |
def load_first_page(path: str, dpi: int = 300) -> Image.Image:
|
| 51 |
if _is_pdf(path):
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
raise ValueError(f"No pages in PDF: {path}")
|
| 57 |
-
return imgs[0].convert("RGB")
|
| 58 |
-
except Exception as e1:
|
| 59 |
try:
|
| 60 |
-
|
| 61 |
-
|
|
|
|
|
|
|
|
|
|
| 62 |
if not imgs:
|
| 63 |
-
|
|
|
|
| 64 |
return imgs[0].convert("RGB")
|
| 65 |
-
except Exception as
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
return Image.open(path).convert("RGB")
|
| 68 |
|
| 69 |
def match_sizes(a: Image.Image, b: Image.Image) -> Tuple[Image.Image, Image.Image]:
|
|
|
|
| 49 |
|
| 50 |
def load_first_page(path: str, dpi: int = 300) -> Image.Image:
|
| 51 |
if _is_pdf(path):
|
| 52 |
+
# Try multiple poppler paths
|
| 53 |
+
poppler_paths = ["/usr/bin", "/usr/local/bin", "/bin", None]
|
| 54 |
+
|
| 55 |
+
for poppler_path in poppler_paths:
|
|
|
|
|
|
|
|
|
|
| 56 |
try:
|
| 57 |
+
if poppler_path:
|
| 58 |
+
imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=1, poppler_path=poppler_path)
|
| 59 |
+
else:
|
| 60 |
+
imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=1)
|
| 61 |
+
|
| 62 |
if not imgs:
|
| 63 |
+
continue
|
| 64 |
+
|
| 65 |
return imgs[0].convert("RGB")
|
| 66 |
+
except Exception as e:
|
| 67 |
+
if poppler_path is None: # Last attempt failed
|
| 68 |
+
raise ValueError(f"Failed to convert PDF to image with all poppler paths. Last error: {str(e)}. Make sure poppler-utils is installed.")
|
| 69 |
+
continue # Try next path
|
| 70 |
+
|
| 71 |
+
raise ValueError(f"No pages in PDF: {path}")
|
| 72 |
return Image.open(path).convert("RGB")
|
| 73 |
|
| 74 |
def match_sizes(a: Image.Image, b: Image.Image) -> Tuple[Image.Image, Image.Image]:
|