app.py CHANGED
@@ -45,6 +45,29 @@ def encode_image_bytes(image_bytes: bytes) -> str:
     """Encodes image bytes to a base64 string."""
     return base64.b64encode(image_bytes).decode('utf-8')
 
+def extract_images_from_markdown(markdown_text: str) -> Dict[str, str]:
+    """
+    Extracts base64 image data URIs from markdown and maps them to reference IDs.
+    Returns a dictionary mapping reference IDs to base64 data URIs.
+    """
+    image_map = {}
+    img_refs = re.findall(r"!\[.*?\]\((data:image/[a-zA-Z+]+;base64,[A-Za-z0-9+/=]+)\)", markdown_text)
+    for idx, img_uri in enumerate(img_refs):
+        ref_id = f"img_ref_{idx+1}"
+        image_map[ref_id] = img_uri
+    return image_map
+
+def replace_image_references(markdown_text: str, image_map: Dict[str, str]) -> str:
+    """
+    Replaces base64 image data URIs in markdown with reference IDs (e.g., img_ref_1).
+    """
+    updated_markdown = markdown_text
+    for ref_id, img_uri in image_map.items():
+        escaped_uri = re.escape(img_uri)
+        pattern = r"(!\[.*?\]\()" + escaped_uri + r"(\))"
+        updated_markdown = re.sub(pattern, f"\\1{ref_id}\\2", updated_markdown)
+    return updated_markdown
+
 def get_combined_markdown(ocr_response: Any) -> Tuple[str, str, Dict[str, str]]:
     """Combines markdown from OCR pages, replacing image IDs with base64 data URIs."""
     processed_markdowns = []
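The two helpers added in this hunk are inverses: extract_images_from_markdown pulls every base64 data URI out of markdown image syntax and keys it by a synthetic img_ref_N ID, and replace_image_references swaps those URIs back out for the IDs. A minimal round-trip sketch, assuming both functions are importable from app.py; the tiny data URI is a fabricated sample:

# Round-trip sketch for the two helpers above. Assumes app.py exposes
# them as module-level functions; the data URI is a made-up sample.
from app import extract_images_from_markdown, replace_image_references

md = "Intro text ![fig1](data:image/png;base64,iVBORw0KGgo=) outro."

image_map = extract_images_from_markdown(md)
# {'img_ref_1': 'data:image/png;base64,iVBORw0KGgo='}

slim_md = replace_image_references(md, image_map)
# 'Intro text ![fig1](img_ref_1) outro.'

# Substituting each reference ID back restores the original text.
restored = slim_md
for ref_id, uri in image_map.items():
    restored = restored.replace(f"]({ref_id})", f"]({uri})")
assert restored == md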
@@ -58,12 +81,16 @@ def get_combined_markdown(ocr_response: Any) -> Tuple[str, str, Dict[str, str]]:
     try:
         for page_idx, page in enumerate(ocr_response.pages):
             if hasattr(page, 'images') and page.images:
+                logger.info(f"Page {page_idx}: Found {len(page.images)} images.")
                 for img in page.images:
                     if hasattr(img, 'id') and hasattr(img, 'image_base64') and img.image_base64:
                         image_data_map[img.id] = img.image_base64
+                        logger.debug(f"Page {page_idx}: Image ID {img.id} added to image_data_map.")
                     else:
-                        logger.warning(f"Page {page_idx}: Image object lacks 'id' or valid 'image_base64'.")
-
+                        logger.warning(f"Page {page_idx}: Image object lacks 'id' or valid 'image_base64'. Image: {img}")
+            else:
+                logger.info(f"Page {page_idx}: No images found.")
+
             if not hasattr(page, 'markdown'):
                 logger.warning(f"Page {page_idx} lacks 'markdown' attribute. Skipping.")
                 continue
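get_combined_markdown only trusts page images that carry both an id and a non-empty image_base64, and the new logging makes each branch visible. A sketch of exercising it with stand-in objects; only the attributes the function actually reads (pages, images, id, image_base64, markdown) are modeled, and all values are fabricated:

# Stand-in OCR response for get_combined_markdown; every value below
# is invented purely to drive the function's branches.
from types import SimpleNamespace

page = SimpleNamespace(
    images=[SimpleNamespace(id="img-0.jpeg",
                            image_base64="data:image/jpeg;base64,/9j/4AAQ")],
    markdown="# Title\n\n![img-0.jpeg](img-0.jpeg)",
)
fake_response = SimpleNamespace(pages=[page])

# processed embeds the data URI; raw keeps the bare image ID.
processed, raw, image_map = get_combined_markdown(fake_response)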
@@ -73,6 +100,7 @@ def get_combined_markdown(ocr_response: Any) -> Tuple[str, str, Dict[str, str]]:
             current_processed_markdown = current_raw_markdown
 
             img_refs = re.findall(r"!\[.*?\]\((.*?)\)", current_processed_markdown)
+            logger.debug(f"Page {page_idx}: Found {len(img_refs)} image references in markdown.")
             for img_id in img_refs:
                 if img_id in image_data_map:
                     base64_data_uri = image_data_map[img_id]
@@ -84,11 +112,13 @@ def get_combined_markdown(ocr_response: Any) -> Tuple[str, str, Dict[str, str]]:
                         r"\1" + base64_data_uri + r"\2",
                         current_processed_markdown
                     )
+                    logger.debug(f"Page {page_idx}: Replaced image ID {img_id} with base64 data URI.")
                 elif not img_id.startswith(('http:', 'https:', 'data:')):
                     logger.warning(f"Page {page_idx}: Image ID '{img_id}' not in image data.")
 
             processed_markdowns.append(current_processed_markdown)
 
+        logger.info(f"Processed {len(processed_markdowns)} pages with {len(image_data_map)} images.")
         return "\n\n".join(processed_markdowns), "\n\n".join(raw_markdowns), image_data_map
 
     except Exception as e:
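The substitution in this hunk wraps the markdown delimiters in capture groups and splices the data URI between them. One subtlety: re.sub interprets backslash escapes in the replacement string, which is safe here only because base64 data URIs never contain backslashes. The pattern in isolation, with illustrative names and values (the escaped-ID pattern mirrors the one used in replace_image_references):

import re

img_id = "img-0.jpeg"
data_uri = "data:image/jpeg;base64,/9j/4AAQ"  # sample value
markdown = f"![figure]({img_id})"

# Group 1 keeps '![...](' and group 2 keeps ')'; the URI lands between.
pattern = r"(!\[.*?\]\()" + re.escape(img_id) + r"(\))"
result = re.sub(pattern, r"\1" + data_uri + r"\2", markdown)
# '![figure](data:image/jpeg;base64,/9j/4AAQ)'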
@@ -114,10 +144,9 @@ def perform_ocr_file(file_obj: Any) -> Tuple[str, str, Dict[str, str]]:
         if file_ext == '.pdf':
             try:
                 with open(file_path, "rb") as f:
-                    file_content = f.read()
+                    file_content = f.read()
 
                 logger.info(f"Uploading PDF {file_name} to Mistral...")
-                # Use dictionary format as per documentation
                 uploaded_pdf = client.files.upload(
                     file={
                         "file_name": file_name,
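For PDFs the flow is upload, then signed URL, then OCR, with cleanup handled in the finally block further down. A condensed sketch of that sequence with the mistralai v1 client; the purpose value and model name are assumptions of this sketch (only the calls visible in this diff are certain), and the API key and path are placeholders:

from mistralai import Mistral

client = Mistral(api_key="...")  # placeholder key

with open("report.pdf", "rb") as f:  # placeholder path
    file_content = f.read()

uploaded_pdf = client.files.upload(
    file={"file_name": "report.pdf", "content": file_content},
    purpose="ocr",  # assumption, not shown in this diff
)
signed_url_response = client.files.get_signed_url(file_id=uploaded_pdf.id)

ocr_response = client.ocr.process(
    model="mistral-ocr-latest",  # assumption, not shown in this diff
    document={"type": "document_url", "document_url": signed_url_response.url},
    include_image_base64=True,
)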
@@ -134,6 +163,7 @@ def perform_ocr_file(file_obj: Any) -> Tuple[str, str, Dict[str, str]]:
                     document={"type": "document_url", "document_url": signed_url_response.url},
                     include_image_base64=True
                 )
+                logger.info(f"OCR response received: {ocr_response}")
             finally:
                 if uploaded_file_id:
                     try:
@@ -155,12 +185,15 @@ def perform_ocr_file(file_obj: Any) -> Tuple[str, str, Dict[str, str]]:
                 document={"type": "image_url", "image_url": data_uri},
                 include_image_base64=True
             )
+            logger.info(f"OCR response received: {ocr_response}")
 
         else:
             return f"Unsupported file type: '{file_name}'.", "", {}
 
         if ocr_response:
-            return get_combined_markdown(ocr_response)
+            processed_md, raw_md, img_map = get_combined_markdown(ocr_response)
+            logger.info(f"Processed markdown length: {len(processed_md)}")
+            return processed_md, raw_md, img_map
         return f"Error: OCR failed for '{file_name}'.", "", {}
 
     except Exception as e:
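The image branch skips the upload and sends a data URI directly. A sketch of assembling that URI with encode_image_bytes from the top of this file; guessing the MIME type from the extension is an assumption of the sketch, as is reusing the client from the PDF sketch above:

import mimetypes

# Build the data URI the image branch passes to OCR. encode_image_bytes
# is the helper defined at the top of app.py; the path is a placeholder.
mime_type, _ = mimetypes.guess_type("figure.png")  # e.g. 'image/png'
with open("figure.png", "rb") as f:
    data_uri = f"data:{mime_type};base64,{encode_image_bytes(f.read())}"

ocr_response = client.ocr.process(
    model="mistral-ocr-latest",  # assumption, as above
    document={"type": "image_url", "image_url": data_uri},
    include_image_base64=True,
)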
@@ -173,11 +206,16 @@ def chunk_markdown(
    chunk_overlap: int = 200,
    strip_headers: bool = True
) -> List[Document]:
-    """Chunks markdown text, preserving headers in metadata and extracting
+    """Chunks markdown text, preserving headers in metadata and extracting images."""
    if not markdown_text_with_images or not markdown_text_with_images.strip():
        logger.warning("chunk_markdown received empty input.")
        return []
 
+    # Extract images and replace with reference IDs
+    image_map = extract_images_from_markdown(markdown_text_with_images)
+    updated_markdown = replace_image_references(markdown_text_with_images, image_map)
+    logger.info(f"Extracted {len(image_map)} images from markdown.")
+
    headers_to_split_on = [
        ("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3"),
        ("####", "Header 4"), ("#####", "Header 5"), ("######", "Header 6"),
@@ -185,47 +223,43 @@ def chunk_markdown(
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on, strip_headers=strip_headers
    )
-    header_chunks = markdown_splitter.split_text(markdown_text_with_images)
+    header_chunks = markdown_splitter.split_text(updated_markdown)
 
    if not header_chunks:
-        logger.warning("No chunks created
-        return []
+        logger.warning("No header chunks created. Treating entire text as one chunk.")
+        return [Document(page_content=updated_markdown, metadata={"images_base64": list(image_map.values())})]
 
    final_chunks = []
    if chunk_size > 0:
        text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=chunk_size,
-            chunk_overlap=chunk_overlap,
-            length_function=len,
+            chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len,
            separators=["\n\n", "\n", "(?<=\. )", "(?<=\? )", "(?<=! )", ", ", "; ", " ", ""],
            add_start_index=True
        )
        for i, header_chunk in enumerate(header_chunks):
-            if header_chunk.page_content
+            if header_chunk.page_content:
                sub_chunks = text_splitter.split_documents([header_chunk])
                final_chunks.extend(sub_chunks)
-
-
+                logger.debug(f"Header chunk {i}: Split into {len(sub_chunks)} sub-chunks.")
+            else:
+                logger.debug(f"Header chunk {i}: Empty, skipping.")
    else:
        final_chunks = [chunk for chunk in header_chunks if chunk.page_content]
 
-    #
+    # Add image references to metadata for each chunk
    for chunk in final_chunks:
        if not hasattr(chunk, 'metadata'):
            chunk.metadata = {}
-
-
-
-
-
-            )
-
-
-
-        logger.info(f"Created {len(final_chunks)} chunks with base64 metadata")
+        # Find image references in this chunk
+        chunk_img_refs = re.findall(r"!\[.*?\]\((img_ref_\d+)\)", chunk.page_content)
+        chunk_images = [image_map[ref_id] for ref_id in chunk_img_refs if ref_id in image_map]
+        chunk.metadata["images_base64"] = chunk_images
+        chunk.metadata["image_references"] = chunk_img_refs
+        logger.debug(f"Chunk {chunk.metadata.get('start_index', 'unknown')}: Found {len(chunk_images)} images.")
+
+    logger.info(f"Created {len(final_chunks)} final chunks.")
    return final_chunks
 
-
def get_hf_token(explicit_token: str = None) -> str:
    """Retrieve Hugging Face token with fallback mechanisms."""
    global hf_token_global
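End to end, chunk_markdown now strips the heavy base64 payloads out of the text before header splitting, so chunk sizes reflect prose rather than encoded images, then re-attaches each chunk's images via metadata. A usage sketch, assuming chunk_markdown is importable from app.py; the markdown and data URI are fabricated:

from app import chunk_markdown

md = (
    "# Results\n\n"
    "Accuracy improved.\n\n"
    "![plot](data:image/png;base64,iVBORw0KGgo=)\n\n"
    "## Details\n\nMore text here."
)

chunks = chunk_markdown(md, chunk_size=500, chunk_overlap=50)
for c in chunks:
    # Header metadata comes from MarkdownHeaderTextSplitter; the image
    # keys are the ones this diff adds.
    print(c.metadata.get("Header 1"),
          c.metadata.get("image_references"),
          len(c.metadata.get("images_base64", [])))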
@@ -280,7 +314,7 @@ def process_file_and_save(
        source_filename = getattr(file_obj, 'orig_name', os.path.basename(file_obj.name))
        logger.info(f"--- Starting processing for file: {source_filename} ---")
 
-        processed_markdown,
+        processed_markdown, raw_markdown, img_map = perform_ocr_file(file_obj)
        if not processed_markdown or processed_markdown.startswith("Error:"):
            return processed_markdown
 
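perform_ocr_file signals failure in-band, returning an error string as the first tuple element, which is why process_file_and_save checks startswith("Error:"). A minimal caller honoring that convention; the Gradio-style file object is a stub carrying just the attributes this diff reads (name, orig_name), and the path is a placeholder:

from types import SimpleNamespace

# Stub file object; real callers receive this from the Gradio upload.
file_obj = SimpleNamespace(name="/tmp/report.pdf", orig_name="report.pdf")

processed_markdown, raw_markdown, img_map = perform_ocr_file(file_obj)
if not processed_markdown or processed_markdown.startswith("Error:"):
    raise RuntimeError(processed_markdown or "OCR returned empty output")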