Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -17,7 +17,7 @@ from mistralai import Mistral
|
|
| 17 |
logging.basicConfig(level=logging.INFO)
|
| 18 |
logger = logging.getLogger(__name__)
|
| 19 |
|
| 20 |
-
# Mistral OCR setup
|
| 21 |
api_key = os.environ.get("MISTRAL_API_KEY")
|
| 22 |
if not api_key:
|
| 23 |
raise ValueError("MISTRAL_API_KEY environment variable not set")
|
|
@@ -109,7 +109,7 @@ def chunk_markdown(
|
|
| 109 |
document = Document(page_content=markdown_text, metadata={"source": "ocr_output"})
|
| 110 |
|
| 111 |
separators = (
|
| 112 |
-
["\n\d+\.\s+", "\n\n", "\n", ".", " ", ""]
|
| 113 |
if preserve_numbering
|
| 114 |
else ["\n\n", "\n", ".", " ", ""]
|
| 115 |
)
|
|
@@ -118,7 +118,7 @@ def chunk_markdown(
|
|
| 118 |
chunk_size=chunk_size,
|
| 119 |
chunk_overlap=chunk_overlap,
|
| 120 |
length_function=len,
|
| 121 |
-
|
| 122 |
keep_separator=True,
|
| 123 |
add_start_index=True,
|
| 124 |
is_separator_regex=preserve_numbering
|
|
@@ -154,7 +154,7 @@ def chunk_markdown(
|
|
| 154 |
logger.error(f"Error processing markdown: {str(e)}")
|
| 155 |
raise
|
| 156 |
|
| 157 |
-
# Placeholder image generation
|
| 158 |
def text_to_base64_dummy(text: str, chunk_index: int):
|
| 159 |
img = Image.new('RGB', (200, 200), color='white')
|
| 160 |
buffer = io.BytesIO()
|
|
@@ -184,10 +184,8 @@ def process_file_and_save(file, chunk_size, chunk_overlap, preserve_numbering, h
|
|
| 184 |
data["chunk_id"].append(i)
|
| 185 |
data["content"].append(chunk.page_content)
|
| 186 |
data["metadata"].append(chunk.metadata)
|
| 187 |
-
# Extract base64 images from markdown if present, else use placeholder
|
| 188 |
img_base64 = None
|
| 189 |
if "![image" in chunk.page_content:
|
| 190 |
-
# Simple extraction (assumes one image per chunk for simplicity)
|
| 191 |
start = chunk.page_content.find("data:image")
|
| 192 |
if start != -1:
|
| 193 |
end = chunk.page_content.find(")", start)
|
|
|
|
| 17 |
logging.basicConfig(level=logging.INFO)
|
| 18 |
logger = logging.getLogger(__name__)
|
| 19 |
|
| 20 |
+
# Mistral OCR setup
|
| 21 |
api_key = os.environ.get("MISTRAL_API_KEY")
|
| 22 |
if not api_key:
|
| 23 |
raise ValueError("MISTRAL_API_KEY environment variable not set")
|
|
|
|
| 109 |
document = Document(page_content=markdown_text, metadata={"source": "ocr_output"})
|
| 110 |
|
| 111 |
separators = (
|
| 112 |
+
["\n\d+\.\s+", "\n\n", "\n", ".", " ", ""]
|
| 113 |
if preserve_numbering
|
| 114 |
else ["\n\n", "\n", ".", " ", ""]
|
| 115 |
)
|
|
|
|
| 118 |
chunk_size=chunk_size,
|
| 119 |
chunk_overlap=chunk_overlap,
|
| 120 |
length_function=len,
|
| 121 |
+
separators=separators, # Fixed parameter name
|
| 122 |
keep_separator=True,
|
| 123 |
add_start_index=True,
|
| 124 |
is_separator_regex=preserve_numbering
|
|
|
|
| 154 |
logger.error(f"Error processing markdown: {str(e)}")
|
| 155 |
raise
|
| 156 |
|
| 157 |
+
# Placeholder image generation
|
| 158 |
def text_to_base64_dummy(text: str, chunk_index: int):
|
| 159 |
img = Image.new('RGB', (200, 200), color='white')
|
| 160 |
buffer = io.BytesIO()
|
|
|
|
| 184 |
data["chunk_id"].append(i)
|
| 185 |
data["content"].append(chunk.page_content)
|
| 186 |
data["metadata"].append(chunk.metadata)
|
|
|
|
| 187 |
img_base64 = None
|
| 188 |
if "![image" in chunk.page_content:
|
|
|
|
| 189 |
start = chunk.page_content.find("data:image")
|
| 190 |
if start != -1:
|
| 191 |
end = chunk.page_content.find(")", start)
|