Yaz Hobooti committed on
Commit
2a98843
·
1 Parent(s): 0f0c5dd

Fix poppler PDF processing: add comprehensive packages and multiple path attempts

Browse files
Files changed (2) hide show
  1. apt.txt +3 -1
  2. pdf_comparator.py +17 -12
apt.txt CHANGED
@@ -1,4 +1,6 @@
1
  poppler-utils
 
 
 
2
  tesseract-ocr
3
  libzbar0
4
- libpoppler-cpp-dev
 
1
  poppler-utils
2
+ poppler-data
3
+ libpoppler-dev
4
+ libpoppler-cpp-dev
5
  tesseract-ocr
6
  libzbar0
 
pdf_comparator.py CHANGED
@@ -49,21 +49,26 @@ def _is_pdf(path: str) -> bool:
49
 
50
  def load_first_page(path: str, dpi: int = 300) -> Image.Image:
51
  if _is_pdf(path):
52
- try:
53
- # Try with poppler_path explicitly set
54
- imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=1, poppler_path="/usr/bin")
55
- if not imgs:
56
- raise ValueError(f"No pages in PDF: {path}")
57
- return imgs[0].convert("RGB")
58
- except Exception as e1:
59
  try:
60
- # Fallback: try without explicit poppler_path
61
- imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=1)
 
 
 
62
  if not imgs:
63
- raise ValueError(f"No pages in PDF: {path}")
 
64
  return imgs[0].convert("RGB")
65
- except Exception as e2:
66
- raise ValueError(f"Failed to convert PDF to image. Error 1: {str(e1)}. Error 2: {str(e2)}. Make sure poppler-utils is installed.")
 
 
 
 
67
  return Image.open(path).convert("RGB")
68
 
69
  def match_sizes(a: Image.Image, b: Image.Image) -> Tuple[Image.Image, Image.Image]:
 
49
 
50
  def load_first_page(path: str, dpi: int = 300) -> Image.Image:
51
  if _is_pdf(path):
52
+ # Try multiple poppler paths
53
+ poppler_paths = ["/usr/bin", "/usr/local/bin", "/bin", None]
54
+
55
+ for poppler_path in poppler_paths:
 
 
 
56
  try:
57
+ if poppler_path:
58
+ imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=1, poppler_path=poppler_path)
59
+ else:
60
+ imgs = convert_from_path(path, dpi=dpi, first_page=1, last_page=1)
61
+
62
  if not imgs:
63
+ continue
64
+
65
  return imgs[0].convert("RGB")
66
+ except Exception as e:
67
+ if poppler_path is None: # Last attempt failed
68
+ raise ValueError(f"Failed to convert PDF to image with all poppler paths. Last error: {str(e)}. Make sure poppler-utils is installed.")
69
+ continue # Try next path
70
+
71
+ raise ValueError(f"No pages in PDF: {path}")
72
  return Image.open(path).convert("RGB")
73
 
74
  def match_sizes(a: Image.Image, b: Image.Image) -> Tuple[Image.Image, Image.Image]: