Spaces:

Svngoku
/

PDF2Dataset

Running

App Files Community

Svngoku committed on Mar 27, 2025

Commit

84661cc

verified ·

1 Parent(s): 8a5a9ab

Update app.py

Browse files

Files changed (1) hide show

app.py +4 -6

app.py CHANGED Viewed

@@ -17,7 +17,7 @@ from mistralai import Mistral
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-# Mistral OCR setup (ensure you have your API key set)
 api_key = os.environ.get("MISTRAL_API_KEY")
 if not api_key:
     raise ValueError("MISTRAL_API_KEY environment variable not set")
@@ -109,7 +109,7 @@ def chunk_markdown(
         document = Document(page_content=markdown_text, metadata={"source": "ocr_output"})
         separators = (
-            ["\n\d+\.\s+", "\n\n", "\n", ".", " ", ""]
             if preserve_numbering
             else ["\n\n", "\n", ".", " ", ""]
         )
@@ -118,7 +118,7 @@ def chunk_markdown(
             chunk_size=chunk_size,
             chunk_overlap=chunk_overlap,
             length_function=len,
-            on=separators,
             keep_separator=True,
             add_start_index=True,
             is_separator_regex=preserve_numbering
@@ -154,7 +154,7 @@ def chunk_markdown(
         logger.error(f"Error processing markdown: {str(e)}")
         raise
-# Placeholder image generation (for chunks without images)
 def text_to_base64_dummy(text: str, chunk_index: int):
     img = Image.new('RGB', (200, 200), color='white')
     buffer = io.BytesIO()
@@ -184,10 +184,8 @@ def process_file_and_save(file, chunk_size, chunk_overlap, preserve_numbering, h
             data["chunk_id"].append(i)
             data["content"].append(chunk.page_content)
             data["metadata"].append(chunk.metadata)
-            # Extract base64 images from markdown if present, else use placeholder
             img_base64 = None
             if "![image" in chunk.page_content:
-                # Simple extraction (assumes one image per chunk for simplicity)
                 start = chunk.page_content.find("data:image")
                 if start != -1:
                     end = chunk.page_content.find(")", start)

 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+# Mistral OCR setup
 api_key = os.environ.get("MISTRAL_API_KEY")
 if not api_key:
     raise ValueError("MISTRAL_API_KEY environment variable not set")
         document = Document(page_content=markdown_text, metadata={"source": "ocr_output"})
         separators = (
+            ["\n\d+\.\s+", "\n\n", "\n", ".", " ", ""]
             if preserve_numbering
             else ["\n\n", "\n", ".", " ", ""]
         )
             chunk_size=chunk_size,
             chunk_overlap=chunk_overlap,
             length_function=len,
+            separators=separators,  # Fixed parameter name
             keep_separator=True,
             add_start_index=True,
             is_separator_regex=preserve_numbering
         logger.error(f"Error processing markdown: {str(e)}")
         raise
+# Placeholder image generation
 def text_to_base64_dummy(text: str, chunk_index: int):
     img = Image.new('RGB', (200, 200), color='white')
     buffer = io.BytesIO()
             data["chunk_id"].append(i)
             data["content"].append(chunk.page_content)
             data["metadata"].append(chunk.metadata)
             img_base64 = None
             if "![image" in chunk.page_content:
                 start = chunk.page_content.find("data:image")
                 if start != -1:
                     end = chunk.page_content.find(")", start)