semmyk committed on
Commit
c6fb648
·
1 Parent(s): f9088c5

baseline08_beta0.4.0_06Oct25: Refactored. Now runs without ProcessPoolExecutor; Marker inherently handles ThreadPoolExecutor and ProcessPoolExecutor. Gradio UI separated from Gradio processing logic.

Browse files
converters/extraction_converter.py CHANGED
@@ -2,7 +2,7 @@ import os
2
  from pathlib import Path
3
  import traceback
4
  #import time
5
- from typing import Dict, Any, Type, Optional, Union #, BaseModel
6
  from pydantic import BaseModel
7
 
8
  from marker.models import create_model_dict
@@ -10,13 +10,9 @@ from marker.models import create_model_dict
10
  from marker.converters.pdf import PdfConverter as MarkerConverter ## full document conversion/extraction
11
  from marker.config.parser import ConfigParser ## Process custom configuration
12
  from marker.services.openai import OpenAIService as MarkerOpenAIService
 
13
  #from sympy import Union
14
 
15
- #from llm.hf_client import HFChatClient
16
- from llm.openai_client import OpenAIChatClient
17
- from file_handler.file_utils import collect_pdf_paths, collect_html_paths, collect_markdown_paths, create_outputdir
18
- from utils.lib_loader import load_library
19
-
20
  from utils.logger import get_logger
21
 
22
  logger = get_logger(__name__)
@@ -48,13 +44,17 @@ class DocumentConverter:
48
  api_token: str,
49
  openai_base_url: str = "https://router.huggingface.co/v1",
50
  openai_image_format: Optional[str] = "webp",
51
- max_workers: Optional[str] =1, #4, for config_dict["pdftext_workers"]
52
  max_retries: Optional[int] = 2,
53
- output_format: str = "markdown",
 
 
54
  output_dir: Optional[Union[str, Path]] = "output_dir",
55
  use_llm: Optional[bool] = None, #bool = False, #Optional[bool] = False, #True,
56
  force_ocr: Optional[bool] = None, #bool = False,
57
- page_range: Optional[str] = None, #str = None #Optional[str] = None,
 
 
58
  ):
59
 
60
  #self.converter = None #MarkerConverter
@@ -65,20 +65,21 @@ class DocumentConverter:
65
  self.top_p = top_p # self.client.top_p,
66
  self.llm_service = MarkerOpenAIService
67
  self.openai_image_format = openai_image_format #"png" #better compatibility
68
- self.max_workers = max_workers ## pass to config_dict["pdftext_workers"]
69
  self.max_retries = max_retries ## pass to __call__
70
- self.output_dir = output_dir ## "output_dir": settings.DEBUG_DATA_FOLDER if debug else output_dir,
71
- self.use_llm = use_llm if use_llm else False #use_llm[0] if isinstance(use_llm, tuple) else use_llm, #False, #True,
 
 
 
72
  self.force_ocr = force_ocr if force_ocr else False
 
 
73
  #self.page_range = page_range[0] if isinstance(page_range, tuple) else page_range ##SMY: iterating twice because self.page casting as hint type tuple!
74
  self.page_range = page_range if page_range else None
75
  # self.page_range = page_range[0] if isinstance(page_range, tuple) else page_range if isinstance(page_range, str) else None, ##Example: "0,4-8,16" ##Marker parses as List[int] #]debug #len(pdf_file)
76
- '''
77
- if isinstance(page_range, tuple | str):
78
- self.page_range = page_range[0] if isinstance(page_range, tuple) else page_range
79
- else:
80
- self.page_range = None
81
- '''
82
 
83
  # 0) Instantiate the LLM Client (OPENAIChatClient): Get a provider‐agnostic chat function
84
  ##SMY: #future. Plan to integrate into Marker: uses its own LLM services (clients). As at 1.9.2, there's no huggingface client service.
@@ -102,12 +103,14 @@ class DocumentConverter:
102
  # 1) # Define the custom configuration for the Hugging Face LLM.
103
  # Use typing.Dict and typing.Any for flexible dictionary type hints
104
  try:
105
- self.config_dict: Dict[str, Any] = self.get_config_dict(model_id=model_id, llm_service=str(self.llm_service), output_format=output_format)
 
106
 
107
  ##SMY: execute if page_range is none. `else None` ensures valid syntactic expression
108
  ##SMY: if falsely empty tuple () or None, pop the "page_range" key-value pair, else do nothing if truthy tuple value (i.e. keep as-is)
109
  self.config_dict.pop("page_range", None) if not self.config_dict.get("page_range") else None
110
- self.config_dict.pop("use_llm", None) if not self.config_dict.get("use_llm") or self.config_dict.get("use_llm") is False or self.config_dict.get("use_llm") == 'False' else None
 
111
  self.config_dict.pop("force_ocr", None) if not self.config_dict.get("force_ocr") or self.config_dict.get("force_ocr") is False or self.config_dict.get("force_ocr") == 'False' else None
112
 
113
  logger.log(level=20, msg="✔️ config_dict custom configured:", extra={"service": "openai"}) #, "config": str(self.config_dict)})
@@ -150,7 +153,8 @@ class DocumentConverter:
150
 
151
  # 4) Instantiate Marker's MarkerConverter (PdfConverter) with config managed by config_parser
152
  try: # Assign llm_service if api_token. ##SMY: split and slicing ##Gets the string value
153
- llm_service_str = None if api_token == '' or api_token is None or self.use_llm is False else str(self.llm_service).split("'")[1] #
 
154
 
155
  # sets api_key required by Marker ## to handle Marker's assertion test on OpenAI
156
  if llm_service_str:
@@ -174,13 +178,15 @@ class DocumentConverter:
174
 
175
  logger.log(level=20, msg="✔️ MarkerConverter instantiated successfully:", extra={"converter.config": str(self.converter.config.get("openai_base_url")), "use_llm":self.converter.use_llm})
176
  #return self.converter ##SMY: to query why did I comment out?. Bingo: "__init__() should return None, not 'PdfConverter'"
 
177
  except Exception as exc:
178
  tb = traceback.format_exc
179
  logger.exception(f"✗ Error initialising MarkerExtractor: {exc}\n{tb}")
180
  raise RuntimeError(f"✗ Error initialising MarkerExtractor: {exc}\n{tb}")
181
 
182
  # Define the custom configuration for HF LLM.
183
- def get_config_dict(self, model_id: str, llm_service=MarkerOpenAIService, output_format: Optional[str] = "markdown" ) -> Dict[str, Any]:
 
184
  """ Define the custom configuration for the Hugging Face LLM: combining Markers cli_options and LLM. """
185
 
186
  try:
@@ -191,8 +197,28 @@ class DocumentConverter:
191
  self.page_range = self.page_range[0] if isinstance(self.page_range, tuple) else self.page_range #if isinstance(self.page_range, str) else None, ##SMY: passing as hint type tuple!
192
 
193
  ##SMY: TODO: convert to {inputs} and called from gradio_ui
194
- config_dict = {
195
- "output_format" : output_format, #"markdown",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
  "openai_model" : self.model_id, #self.client.model_id, #"model_name"
197
  "openai_api_key" : self.openai_api_key, #self.client.openai_api_key, #self.api_token,
198
  "openai_base_url": self.openai_base_url, #self.client.base_url, #self.base_url,
@@ -200,94 +226,19 @@ class DocumentConverter:
200
  "top_p" : self.top_p, #self.client.top_p,
201
  "openai_image_format": self.openai_image_format, #"webp", #"png" #better compatibility
202
  "pdftext_workers": self.max_workers, ## number of workers to use for pdftext."
203
- "max_retries" : self.max_retries, #3, ## pass to __call__
 
204
  "output_dir" : self.output_dir,
205
- "use_llm" : self.use_llm, #False, #True,
206
- "force_ocr" : self.force_ocr, #False,
 
 
207
  "page_range" : self.page_range, ##debug #len(pdf_file)
208
- }
 
209
  return config_dict
210
  except Exception as exc:
211
  tb = traceback.format_exc() #exc.__traceback__
212
  logger.exception(f"✗ Error configuring custom config_dict: {exc}\n{tb}")
213
  raise RuntimeError(f"✗ Error configuring custom config_dict: {exc}\n{tb}") #").with_traceback(tb)
214
- #raise
215
-
216
- ##SMY: flagged for deprecation
217
- ##SMY: marker prefer default artifact dictionary (marker.models.create_model_dict) instead of overridding
218
- #def get_extraction_converter(self, chat_fn):
219
- def get_create_model_dict(self):
220
- """
221
- Wraps the LLM chat_fn into marker’s artifact_dict
222
- and returns an ExtractionConverter for PDFs & HTML.
223
- """
224
- return create_model_dict()
225
- #artifact_dict = create_model_dict(inhouse_chat_model=chat_fn)
226
- #return artifact_dict
227
-
228
- ## SMY: Kept for future implementation (and historic reasoning). Keeping the classes separate to avoid confusion with the original implementation
229
- '''
230
- class DocumentExtractor:
231
- """
232
- Business logic wrapper using HFChatClient and Marker to
233
- convert documents (PDF, HTML files) into markdowns + assets
234
- Wrapper around the Marker extraction converter for PDFs & HTML.
235
- """
236
-
237
- def __init__(self,
238
- provider: str,
239
- model_id: str,
240
- hf_provider: str,
241
- endpoint_url: str,
242
- backend_choice: str,
243
- system_message: str,
244
- max_tokens: int,
245
- temperature: float,
246
- top_p: float,
247
- stream: bool,
248
- api_token: str,
249
- ):
250
- # 1) Instantiate the LLM Client (HFChatClient): Get a provider‐agnostic chat function
251
- try:
252
- self.client = HFChatClient(
253
- provider=provider,
254
- model_id=model_id,
255
- hf_provider=hf_provider,
256
- endpoint_url=endpoint_url,
257
- backend_choice=backend_choice, #choices=["model-id", "provider", "endpoint"]
258
- system_message=system_message,
259
- max_tokens=max_tokens,
260
- temperature=temperature,
261
- top_p=top_p,
262
- stream=stream,
263
- api_token=api_token,
264
- )
265
- logger.log(level=20, msg="✔️ HFChatClient instantiated:", extra={"model_id": model_id, "chatclient": str(self.client)})
266
-
267
- except Exception as exc:
268
- tb = traceback.format_exc() #exc.__traceback__
269
- logger.exception(f"✗ Error initialising HFChatClient: {exc}")
270
- raise RuntimeError(f"✗ Error initialising HFChatClient: {exc}").with_traceback(tb)
271
- #raise
272
-
273
- # 2) Build Marker's artifact dict using the client's chat method
274
- self.artifact_dict = self.get_extraction_converter(self.client)
275
-
276
- # 3) Instantiate Marker's ExtractionConverter (ExtractionConverter)
277
- try:
278
- self.extractor = MarkerExtractor(artifact_dict=self.artifact_dict)
279
- except Exception as exc:
280
- logger.exception(f"✗ Error initialising MarkerExtractor: {exc}")
281
- raise RuntimeError(f"✗ Error initialising MarkerExtractor: {exc}")
282
-
283
- ##SMY: marker prefer default artifact dictionary (marker.models.create_model_dict) instead of overridding
284
- def get_extraction_converter(self, chat_fn):
285
- """
286
- Wraps the LLM chat_fn into marker’s artifact_dict
287
- and returns an ExtractionConverter for PDFs & HTML.
288
- """
289
-
290
- artifact_dict = create_model_dict(inhouse_chat_model=chat_fn)
291
- return artifact_dict
292
- '''
293
-
 
2
  from pathlib import Path
3
  import traceback
4
  #import time
5
+ from typing import Dict, Any, Type, Optional, Union, Literal #, BaseModel
6
  from pydantic import BaseModel
7
 
8
  from marker.models import create_model_dict
 
10
  from marker.converters.pdf import PdfConverter as MarkerConverter ## full document conversion/extraction
11
  from marker.config.parser import ConfigParser ## Process custom configuration
12
  from marker.services.openai import OpenAIService as MarkerOpenAIService
13
+ from marker.settings import settings
14
  #from sympy import Union
15
 
 
 
 
 
 
16
  from utils.logger import get_logger
17
 
18
  logger = get_logger(__name__)
 
44
  api_token: str,
45
  openai_base_url: str = "https://router.huggingface.co/v1",
46
  openai_image_format: Optional[str] = "webp",
47
+ max_workers: Optional[str] = 1, #4, for config_dict["pdftext_workers"]
48
  max_retries: Optional[int] = 2,
49
+ debug: Optional[bool] = None, #bool = False,
50
+ #output_format: str = "markdown",
51
+ output_format: Literal["markdown", "json", "html"] = "markdown",
52
  output_dir: Optional[Union[str, Path]] = "output_dir",
53
  use_llm: Optional[bool] = None, #bool = False, #Optional[bool] = False, #True,
54
  force_ocr: Optional[bool] = None, #bool = False,
55
+ strip_existing_ocr: Optional[bool] = None, #bool = False,
56
+ disable_ocr_math: Optional[bool] = None, #bool = False,
57
+ page_range: Optional[str] = None, #str = None #Optional[str] = None,
58
  ):
59
 
60
  #self.converter = None #MarkerConverter
 
65
  self.top_p = top_p # self.client.top_p,
66
  self.llm_service = MarkerOpenAIService
67
  self.openai_image_format = openai_image_format #"png" #better compatibility
68
+ self.max_workers = max_workers #int(1) ## pass to config_dict["pdftext_workers"]
69
  self.max_retries = max_retries ## pass to __call__
70
+ self.debug = debug
71
+ #self.output_format = output_format
72
+ self.output_format = output_format
73
+ self.output_dir = settings.DEBUG_DATA_FOLDER if debug else output_dir,
74
+ self.use_llm = use_llm if use_llm else False #use_llm[0] if isinstance(use_llm, tuple) else use_llm, #False, #True,
75
  self.force_ocr = force_ocr if force_ocr else False
76
+ self.strip_existing_ocr = strip_existing_ocr #if strip_existing_ocr else False
77
+ self.disable_ocr_math = disable_ocr_math #if disable_ocr else False
78
  #self.page_range = page_range[0] if isinstance(page_range, tuple) else page_range ##SMY: iterating twice because self.page casting as hint type tuple!
79
  self.page_range = page_range if page_range else None
80
  # self.page_range = page_range[0] if isinstance(page_range, tuple) else page_range if isinstance(page_range, str) else None, ##Example: "0,4-8,16" ##Marker parses as List[int] #]debug #len(pdf_file)
81
+
82
+ self.converter = None
 
 
 
 
83
 
84
  # 0) Instantiate the LLM Client (OPENAIChatClient): Get a provider‐agnostic chat function
85
  ##SMY: #future. Plan to integrate into Marker: uses its own LLM services (clients). As at 1.9.2, there's no huggingface client service.
 
103
  # 1) # Define the custom configuration for the Hugging Face LLM.
104
  # Use typing.Dict and typing.Any for flexible dictionary type hints
105
  try:
106
+ #self.config_dict: Dict[str, Any] = self.get_config_dict(model_id=model_id, llm_service=str(self.llm_service), output_format=output_format)
107
+ self.config_dict: Dict[str, Any] = self.get_config_dict()
108
 
109
  ##SMY: execute if page_range is none. `else None` ensures valid syntactic expression
110
  ##SMY: if falsely empty tuple () or None, pop the "page_range" key-value pair, else do nothing if truthy tuple value (i.e. keep as-is)
111
  self.config_dict.pop("page_range", None) if not self.config_dict.get("page_range") else None
112
+ # use_llm test moved to config_dict
113
+ #self.config_dict.pop("use_llm", None) if not self.config_dict.get("use_llm") or self.config_dict.get("use_llm") is False or self.config_dict.get("use_llm") == 'False' else None
114
  self.config_dict.pop("force_ocr", None) if not self.config_dict.get("force_ocr") or self.config_dict.get("force_ocr") is False or self.config_dict.get("force_ocr") == 'False' else None
115
 
116
  logger.log(level=20, msg="✔️ config_dict custom configured:", extra={"service": "openai"}) #, "config": str(self.config_dict)})
 
153
 
154
  # 4) Instantiate Marker's MarkerConverter (PdfConverter) with config managed by config_parser
155
  try: # Assign llm_service if api_token. ##SMY: split and slicing ##Gets the string value
156
+ #llm_service_str = None if api_token == '' or api_token is None or self.use_llm is False else str(self.llm_service).split("'")[1] #
157
+ llm_service_str = None if not self.use_llm or self.use_llm == "False" or self.use_llm is False else str(self.llm_service).split("'")[1] #
158
 
159
  # sets api_key required by Marker ## to handle Marker's assertion test on OpenAI
160
  if llm_service_str:
 
178
 
179
  logger.log(level=20, msg="✔️ MarkerConverter instantiated successfully:", extra={"converter.config": str(self.converter.config.get("openai_base_url")), "use_llm":self.converter.use_llm})
180
  #return self.converter ##SMY: to query why did I comment out?. Bingo: "__init__() should return None, not 'PdfConverter'"
181
+
182
  except Exception as exc:
183
  tb = traceback.format_exc
184
  logger.exception(f"✗ Error initialising MarkerExtractor: {exc}\n{tb}")
185
  raise RuntimeError(f"✗ Error initialising MarkerExtractor: {exc}\n{tb}")
186
 
187
  # Define the custom configuration for HF LLM.
188
+ #def get_config_dict(self, model_id: str, llm_service=MarkerOpenAIService, output_format: Optional[str] = "markdown" ) -> Dict[str, Any]:
189
+ def get_config_dict(self, ) -> Dict[str, Any]:
190
  """ Define the custom configuration for the Hugging Face LLM: combining Markers cli_options and LLM. """
191
 
192
  try:
 
197
  self.page_range = self.page_range[0] if isinstance(self.page_range, tuple) else self.page_range #if isinstance(self.page_range, str) else None, ##SMY: passing as hint type tuple!
198
 
199
  ##SMY: TODO: convert to {inputs} and called from gradio_ui
200
+ if not self.use_llm or self.use_llm == 'False':
201
+ config_dict = {
202
+ "output_format" : self.output_format, #"markdown",
203
+ #"openai_model" : self.model_id, #self.client.model_id, #"model_name"
204
+ #"openai_api_key" : self.openai_api_key, #self.client.openai_api_key, #self.api_token,
205
+ #"openai_base_url": self.openai_base_url, #self.client.base_url, #self.base_url,
206
+ #"temperature" : self.temperature, #self.client.temperature,
207
+ #"top_p" : self.top_p, #self.client.top_p,
208
+ #"openai_image_format": self.openai_image_format, #"webp", #"png" #better compatibility
209
+ "pdftext_workers": self.max_workers, ## number of workers to use for pdftext."
210
+ #"max_retries" : self.max_retries, #3, ## pass to __call__
211
+ "debug" : self.debug,
212
+ "output_dir" : self.output_dir,
213
+ #"use_llm" : self.use_llm, #False, #True,
214
+ "force_ocr" : self.force_ocr, #False,
215
+ "strip_existing_ocr": self.strip_existing_ocr, #False
216
+ "disable_ocr_math": self.disable_ocr_math,
217
+ "page_range" : self.page_range, ##debug #len(pdf_file)
218
+ }
219
+ else:
220
+ config_dict = {
221
+ "output_format" : self.output_format, #"markdown",
222
  "openai_model" : self.model_id, #self.client.model_id, #"model_name"
223
  "openai_api_key" : self.openai_api_key, #self.client.openai_api_key, #self.api_token,
224
  "openai_base_url": self.openai_base_url, #self.client.base_url, #self.base_url,
 
226
  "top_p" : self.top_p, #self.client.top_p,
227
  "openai_image_format": self.openai_image_format, #"webp", #"png" #better compatibility
228
  "pdftext_workers": self.max_workers, ## number of workers to use for pdftext."
229
+ #"max_retries" : self.max_retries, #3, ## pass to __call__
230
+ "debug" : self.debug,
231
  "output_dir" : self.output_dir,
232
+ "use_llm" : self.use_llm, #False, #True,
233
+ "force_ocr" : self.force_ocr, #False,
234
+ "strip_existing_ocr": self.strip_existing_ocr, #False
235
+ "disable_ocr_math": self.disable_ocr_math,
236
  "page_range" : self.page_range, ##debug #len(pdf_file)
237
+ }
238
+
239
  return config_dict
240
  except Exception as exc:
241
  tb = traceback.format_exc() #exc.__traceback__
242
  logger.exception(f"✗ Error configuring custom config_dict: {exc}\n{tb}")
243
  raise RuntimeError(f"✗ Error configuring custom config_dict: {exc}\n{tb}") #").with_traceback(tb)
244
+ #raise
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
converters/pdf_to_md.py CHANGED
@@ -5,97 +5,27 @@ from typing import List, Dict, Union, Optional
5
  import traceback ## Extract, format and print information about Python stack traces.
6
  import time
7
 
 
8
  import spaces
9
- from globals import config_load_models
10
 
11
  from converters.extraction_converter import DocumentConverter #, DocumentExtractor #as docextractor #ExtractionConverter #get_extraction_converter ## SMY: should disuse
12
- from file_handler.file_utils import write_markdown, dump_images, collect_pdf_paths, collect_html_paths, collect_markdown_paths, create_outputdir
13
 
14
- from utils import config
15
  from utils.lib_loader import set_weasyprint_library
16
  from utils.logger import get_logger
17
 
18
  logger = get_logger(__name__)
19
 
20
  # Define global variables ##SMY: TODO: consider moving to Globals sigleton constructor
21
- docconverter: DocumentConverter = None
22
- converter = None #DocumentConverter
23
- #converter:DocumentConverter.converter = None
24
-
25
- #@spaces.GPU
26
- duration = 60*config_load_models.pdf_files_count if config_load_models.pdf_files_count>=10 else 360 ## sec
27
- @spaces.GPU(duration=duration) ## HF Spaces GPU support
28
  # Define docextractor in the pool as serialised object and passed to each worker process.
29
  # Note: DocumentConverter must be "picklable".
30
- def init_worker(#self,
31
- provider: str,
32
- model_id: str,
33
- #base_url,
34
- hf_provider: str,
35
- endpoint_url: str,
36
- backend_choice: str,
37
- system_message: str,
38
- max_tokens: int,
39
- temperature: float,
40
- top_p: float,
41
- stream: bool,
42
- api_token: str,
43
- openai_base_url: str, #: str = "https://router.huggingface.co/v1",
44
- openai_image_format: str, #: str | None = "webp",
45
- max_workers: int,
46
- max_retries: int, #: int | None = 2,
47
- output_format: str, #: str = "markdown",
48
- output_dir: str, #: Union | None = "output_dir",
49
- use_llm: bool, #: bool | None = False,
50
- force_ocr: bool,
51
- page_range: str, #: str | None = None
52
- ):
53
- #'''
54
- """
55
- instantiate DocumentConverter/DocumentExtractor for use in each pool worker
56
- Args:
57
-
58
- """
59
 
60
- ## moved to class
61
- # Initialise the global `converter` in each worker
62
- # Define global variables
63
- global docconverter
64
- global converter
65
-
66
- #'''
67
- # 1) Instantiate the DocumentConverter
68
- logger.log(level=20, msg="initialising docconverter:", extra={"model_id": model_id, "hf_provider": hf_provider}) ##debug
69
-
70
- try:
71
- docconverter = DocumentConverter(
72
- model_id, #: str,
73
- hf_provider, #: str,
74
- temperature, #: float,
75
- top_p, #: float,
76
- api_token, #: str,
77
- openai_base_url, #: str = "https://router.huggingface.co/v1",
78
- openai_image_format, #: str | None = "webp",
79
- max_workers, #: int | None = 1,
80
- max_retries, #: int | None = 2,
81
- output_format, #: str = "markdown",
82
- output_dir, #: Union | None = "output_dir",
83
- use_llm, #: bool | None = False,
84
- force_ocr,
85
- page_range, #: str | None = None
86
- )
87
- logger.log(level=20, msg="✔️ docextractor initialised:", extra={"docconverter model_id": docconverter.converter.config.get("openai_model"), "docconverter use_llm": docconverter.converter.use_llm, "docconverter output_dir": docconverter.output_dir})
88
- except Exception as exc:
89
- #logger.error(f"Failed to initialise DocumentConverter: {exc}") #debug
90
- tb = traceback.format_exc()
91
- logger.exception(f"init_worker: Error initialising DocumentConverter → {exc}\n{tb}", exc_info=True)
92
- return f"✗ init_worker: error initialising DocumentConverter → {exc}\n{tb}"
93
-
94
- #docconverter = docconverter
95
- converter = docconverter.converter
96
- #self.llm_service = docconverter.llm_service ##duplicate?
97
- #self.model_id = model_id ##duplicate?
98
- #'''
99
 
100
  class PdfToMarkdownConverter:
101
  """
@@ -106,22 +36,90 @@ class PdfToMarkdownConverter:
106
  def __init__(self, options: Dict | None = None): #extractor: DocumentExtractor, options: Dict | None = None):
107
  self.options = options or {} ##SMY: TOBE implemented - bring all Marker's options
108
  self.output_dir_string = ''
109
- self.output_dir = self.output_dir_string ## placeholder
110
- #self.OUTPUT_DIR = config.OUTPUT_DIR ##flag unused
111
- #self.MAX_RETRIES = config.MAX_RETRIES ##flag unused
112
- #self.docconverter = None #DocumentConverter
113
- #self.converter = self.docconverter.converter #None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
 
115
- # This global will be set (re-initialised) in each worker after init_worker runs
 
 
 
 
116
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
- #duration = 5.75 * pdf_files_count if pdf_files_count>=2 else 7
119
- #duration = 10
120
- #@spaces.GPU(duration=duration) ## HF Spaces GPU support
121
- #@spaces.GPU
122
- ## moved from extraction_converter ( to standalone extract_to_md)
123
  #def extract(self, src_path: str, output_dir: str) -> Dict[str, int, Union[str, Path]]:
124
- def extract(self, src_path: str, output_dir: str): #Dict:
125
  #def extract(src_path: str, output_dir: str) -> Dict[str, int]: #, extractor: DocumentExtractor) -> Dict[str, int]:
126
  """
127
  Convert one file (PDF/HTML) to Markdown + images.
@@ -140,13 +138,29 @@ class PdfToMarkdownConverter:
140
  logger.exception(f"Error loading weasyprint backend dependency → {exc}\n{tb}", exc_info=True) # Log the full traceback
141
  raise RuntimeWarning(f"✗ error during loading weasyprint backend dependency → {exc}\n{tb}")
142
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
  # Run Marker conversion with LLM if use_llm is true
145
  try:
146
- #rendered = self.docconverter.converter(src_path, use_llm=True)
 
 
147
  #rendered = self.docconverter.converter(src_path)
148
- rendered = converter(src_path)
 
149
  logger.log(level=20, msg=f"✓ File extraction successful for {Path(src_path).name}")
 
 
150
  except Exception as exc:
151
  tb = traceback.format_exc()
152
  logger.exception(f"Error during file extraction → {exc}\n{tb}", exc_info=True) # Log the full traceback
@@ -154,15 +168,8 @@ class PdfToMarkdownConverter:
154
  return f"✗ error during extraction → {exc}\n{tb}"
155
 
156
  # Write Markdown file
157
- '''
158
- base = Path(str_path).stem ## Get filename without extension
159
- md_path = output_dir / f"{base}.md" # Join output dir and new markdown file with the slash operator
160
-
161
- with open(md_path, "w", encoding="utf-8") as f:
162
- f.write(rendered.markdown)
163
- '''
164
  try:
165
- md_file = write_markdown(src_path=src_path, output_dir=output_dir, rendered=rendered)
166
  #debug md_file = "debug_md_file dummy name" ##debug
167
  except Exception as exc:
168
  tb = traceback.format_exc()
@@ -181,9 +188,9 @@ class PdfToMarkdownConverter:
181
  #return {"images": len(rendered.images), "file": md_file} ##debug
182
  return {"file": md_file.name, "images": images_count, "filepath": md_file, "image_path": image_path} ####SMY should be Dict[str, int, str]. Dicts are not necessarily ordered.
183
 
184
- #duration = 5.75 * pdf_files_count if pdf_files_count>=2 else 7
185
- duration = 60*config_load_models.pdf_files_count if config_load_models.pdf_files_count>=10 else 360 ## sec
186
- @spaces.GPU(duration=duration) ## HF Spaces GPU support
187
  #def convert_files(src_path: str, output_dir: str, max_retries: int = 2) -> str:
188
  #def convert_files(self, src_path: str, output_dir_string: str = None, max_retries: int = 2, progress = gr.Progress()) -> Union[Dict, str]: #str:
189
  def convert_files(self, src_path: str, max_retries: int = 2) -> Union[Dict, str]:
@@ -200,13 +207,15 @@ class PdfToMarkdownConverter:
200
  tb = traceback.format_exc()
201
  logger.exception("✗ error creating output_dir → {exc}\n{tb}", exc_info=True)
202
  return f"✗ error creating output_dir → {exc}\n{tb}"'''
203
- output_dir = Path(self.output_dir) ## takes the value from gradio_ui
 
 
204
 
205
  try:
206
  #if Path(src_path).suffix.lower() not in {".pdf", ".html", ".htm"}:
207
  #if not Path(src_path).name.endswith(tuple({".pdf", ".html"})): #,".docx", ".doc", ".pptx", ".ppt", ".xlsx", ".xls"})):
208
  #if not Path(src_path).name.endswith((".pdf", ".html", ".docx", ".doc")): #,".docx", ".doc", ".pptx", ".ppt", ".xlsx", ".xls"})):
209
- if not Path(src_path).name.endswith(config.file_types_tuple): #,".docx", ".doc", ".pptx", ".ppt", ".xlsx", ".xls"})):
210
  logger.log(level=20, msg=f"skipped {Path(src_path).name}", exc_info=True)
211
  return f"skipped {Path(src_path).name}"
212
  except Exception as exc:
 
5
  import traceback ## Extract, format and print information about Python stack traces.
6
  import time
7
 
8
+ from gradio import Progress as grP
9
  import spaces
10
+ from globals import config_load_models, config_load
11
 
12
  from converters.extraction_converter import DocumentConverter #, DocumentExtractor #as docextractor #ExtractionConverter #get_extraction_converter ## SMY: should disuse
13
+ from utils.file_utils import write_markdown, dump_images, collect_pdf_paths, collect_html_paths, collect_markdown_paths, create_outputdir
14
 
15
+ #from utils import config
16
  from utils.lib_loader import set_weasyprint_library
17
  from utils.logger import get_logger
18
 
19
  logger = get_logger(__name__)
20
 
21
  # Define global variables ##SMY: TODO: consider moving to Globals sigleton constructor
22
+ ## moved to class
23
+ #docconverter: DocumentConverter = None
24
+ #converter = None #DocumentConverter
 
 
 
 
25
  # Define docextractor in the pool as serialised object and passed to each worker process.
26
  # Note: DocumentConverter must be "picklable".
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
+ #def init_worker(#self, ...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  class PdfToMarkdownConverter:
31
  """
 
36
  def __init__(self, options: Dict | None = None): #extractor: DocumentExtractor, options: Dict | None = None):
37
  self.options = options or {} ##SMY: TOBE implemented - bring all Marker's options
38
  self.output_dir_string = ''
39
+ self.output_dir = '' #self.output_dir_string ## placeholder
40
+ self.docconverter = None #DocumentConverter
41
+ self.converter = None #self.docconverter.converter #None
42
+
43
+ def init_docconverter(self, output_dir: Union[str, Path] = config_load.output_dir, progress3=grP(track_tqdm=True)):
44
+ #'''
45
+ """
46
+ instantiate DocumentConverter/DocumentExtractor for use
47
+ Args:
48
+ ##TODO
49
+ """
50
+
51
+ provider: str = config_load.provider
52
+ model_id: str = config_load.model_id
53
+ #base_url,
54
+ hf_provider: str = config_load.hf_provider
55
+ endpoint_url: str = config_load.endpoint
56
+ backend_choice: str = config_load.backend_choice
57
+ system_message: str = config_load.system_message
58
+ max_tokens: int = config_load.max_tokens
59
+ temperature: float = config_load.temperature
60
+ top_p: float = config_load.top_p
61
+ stream: bool = config_load.stream
62
+ api_token: str = config_load.api_token
63
+ openai_base_url: str = config_load.openai_base_url
64
+ openai_image_format: str = config_load.openai_image_format
65
+ max_workers: int = config_load.max_workers
66
+ max_retries: int = config_load.max_retries
67
+ debug: bool = config_load.debug
68
+ output_format: str = config_load.output_format
69
+ output_dir: Union[str, Path] = config_load.output_dir_string #output_dir #
70
+ use_llm: bool = config_load.use_llm
71
+ force_ocr: bool = config_load.force_ocr
72
+ strip_existing_ocr: bool = config_load.strip_existing_ocr
73
+ disable_ocr_math: bool = config_load.disable_ocr_math
74
+ page_range: str = config_load.page_range
75
+
76
 
77
+ # 1) Instantiate the DocumentConverter
78
+ logger.log(level=20, msg="initialising docconverter:", extra={"model_id": model_id, "hf_provider": hf_provider}) ##debug
79
+ progress3((0,1), desc=f"initialising docconverter: ...")
80
+ #progress2((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
81
+ time.sleep(0.75) #.sleep(0.25)
82
 
83
+ try:
84
+ docconverter = DocumentConverter(
85
+ model_id, #: str,
86
+ hf_provider, #: str,
87
+ temperature, #: float,
88
+ top_p, #: float,
89
+ api_token, #: str,
90
+ openai_base_url, #: str = "https://router.huggingface.co/v1",
91
+ openai_image_format, #: str | None = "webp",
92
+ max_workers, #: int | None = 1,
93
+ max_retries, #: int | None = 2,
94
+ debug, #: bool = False
95
+ output_format, #: str = "markdown",
96
+ output_dir, #: Union | None = "output_dir",
97
+ use_llm, #: bool | None = False,
98
+ force_ocr, #: bool | None = False,
99
+ strip_existing_ocr, #bool = False,
100
+ disable_ocr_math, #bool = False,
101
+ page_range, #: str | None = None
102
+ )
103
+ logger.log(level=20, msg="✔️ docextractor initialised:", extra={"docconverter model_id": docconverter.converter.config.get("openai_model"), "docconverter use_llm": docconverter.converter.use_llm, "docconverter output_dir": docconverter.output_dir})
104
+ progress3((1,1), desc=f"✔️ docextractor initialised:")
105
+ time.sleep(0.75) #.sleep(0.25)
106
+ except Exception as exc:
107
+ #logger.error(f"Failed to initialise DocumentConverter: {exc}") #debug
108
+ tb = traceback.format_exc()
109
+ logger.exception(f"init_worker: Error initialising DocumentConverter → {exc}\n{tb}", exc_info=True)
110
+ return f"✗ init_worker: error initialising DocumentConverter → {exc}\n{tb}"
111
+
112
+ converter = docconverter.converter
113
+ self.docconverter = docconverter
114
+ self.converter = converter
115
+
116
+ #return converter
117
 
118
+ #duration = 60*config_load_models.pdf_files_count if config_load_models.pdf_files_count>=10 else 360 ## sec
119
+ duration = 60*config_load_models.pdf_files_count if config_load_models.use_llm else 90 ## sec
120
+ @spaces.GPU(duration=duration) ## HF Spaces GPU support
 
 
121
  #def extract(self, src_path: str, output_dir: str) -> Dict[str, int, Union[str, Path]]:
122
+ def extract(self, src_path: str, output_dir: str, progress4=grP()): #Dict:
123
  #def extract(src_path: str, output_dir: str) -> Dict[str, int]: #, extractor: DocumentExtractor) -> Dict[str, int]:
124
  """
125
  Convert one file (PDF/HTML) to Markdown + images.
 
138
  logger.exception(f"Error loading weasyprint backend dependency → {exc}\n{tb}", exc_info=True) # Log the full traceback
139
  raise RuntimeWarning(f"✗ error during loading weasyprint backend dependency → {exc}\n{tb}")
140
 
141
+ # Initialise Marker Converter
142
+ try:
143
+ if not self.converter:
144
+ self.init_docconverter(output_dir)
145
+
146
+ logger.log(level=20, msg=f"✓ Initialised Marker Converter")
147
+ except Exception as exc:
148
+ tb = traceback.format_exc()
149
+ logger.exception(f"Error during Marker Converter initialisation → {exc}\n{tb}", exc_info=True) # Log the full traceback
150
+
151
+ return f"✗ error during extraction → {exc}\n{tb}"
152
 
153
  # Run Marker conversion with LLM if use_llm is true
154
  try:
155
+ progress4((0,1), desc=f"Extracting File: {Path(src_path).name}")
156
+ time.sleep(0.75) #.sleep(0.25)
157
+
158
  #rendered = self.docconverter.converter(src_path)
159
+ rendered = self.converter(src_path)
160
+
161
  logger.log(level=20, msg=f"✓ File extraction successful for {Path(src_path).name}")
162
+ progress4((1,1), desc=f"✓ File extraction successful for {Path(src_path).name}")
163
+ time.sleep(0.75) #.sleep(0.25)
164
  except Exception as exc:
165
  tb = traceback.format_exc()
166
  logger.exception(f"Error during file extraction → {exc}\n{tb}", exc_info=True) # Log the full traceback
 
168
  return f"✗ error during extraction → {exc}\n{tb}"
169
 
170
  # Write Markdown file
 
 
 
 
 
 
 
171
  try:
172
+ md_file = write_markdown(src_path=src_path, output_dir=output_dir, rendered=rendered, output_format=config_load.output_format)
173
  #debug md_file = "debug_md_file dummy name" ##debug
174
  except Exception as exc:
175
  tb = traceback.format_exc()
 
188
  #return {"images": len(rendered.images), "file": md_file} ##debug
189
  return {"file": md_file.name, "images": images_count, "filepath": md_file, "image_path": image_path} ####SMY should be Dict[str, int, str]. Dicts are not necessarily ordered.
190
 
191
+ #duration = 60*config_load_models.pdf_files_count if config_load_models.pdf_files_count>=10 else 360 ## sec
192
+ #@spaces.GPU(duration=duration) ## HF Spaces GPU support
193
+
194
  #def convert_files(src_path: str, output_dir: str, max_retries: int = 2) -> str:
195
  #def convert_files(self, src_path: str, output_dir_string: str = None, max_retries: int = 2, progress = gr.Progress()) -> Union[Dict, str]: #str:
196
  def convert_files(self, src_path: str, max_retries: int = 2) -> Union[Dict, str]:
 
207
  tb = traceback.format_exc()
208
  logger.exception("✗ error creating output_dir → {exc}\n{tb}", exc_info=True)
209
  return f"✗ error creating output_dir → {exc}\n{tb}"'''
210
+ #output_dir = Path(self.output_dir) ## takes the value from gradio_ui
211
+ output_dir = Path(config_load.output_dir) # Takes the value when output_dir is created in gradio_process
212
+ self.output_dir = output_dir
213
 
214
  try:
215
  #if Path(src_path).suffix.lower() not in {".pdf", ".html", ".htm"}:
216
  #if not Path(src_path).name.endswith(tuple({".pdf", ".html"})): #,".docx", ".doc", ".pptx", ".ppt", ".xlsx", ".xls"})):
217
  #if not Path(src_path).name.endswith((".pdf", ".html", ".docx", ".doc")): #,".docx", ".doc", ".pptx", ".ppt", ".xlsx", ".xls"})):
218
+ if not Path(src_path).name.endswith(config_load.file_types_tuple): #,".docx", ".doc", ".pptx", ".ppt", ".xlsx", ".xls"})):
219
  logger.log(level=20, msg=f"skipped {Path(src_path).name}", exc_info=True)
220
  return f"skipped {Path(src_path).name}"
221
  except Exception as exc:
globals.py CHANGED
@@ -9,8 +9,50 @@ class Config:
9
  self.weasyprint_libpath = ""
10
  self.config_ini = "utils\\config.ini"
11
  self.pdf_files_count = 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  # Create a single, shared instance of the Config class
14
  # Other modules will import and use this instance.
15
  config_load_models = Config()
 
16
 
 
 
 
9
  self.weasyprint_libpath = ""
10
  self.config_ini = "utils\\config.ini"
11
  self.pdf_files_count = 0
12
+ self.output_dir = ""
13
+
14
+ # File types
15
+ self.file_types_list = []
16
+ self.file_types_tuple = (".pdf", ".html", ".docx", ".doc")
17
+
18
+ # all other variables shared across the app
19
+ #self.pdf_files: list[str] = []
20
+ #self.pdf_files_count: int = 0
21
+ self.provider: str = ""
22
+ self.model_id: str = ""
23
+ #base_url: str
24
+ self.hf_provider: str = ""
25
+ self.endpoint: str = ""
26
+ self.backend_choice: str = ""
27
+ self.system_message: str = ""
28
+ self.max_tokens: int = 8192
29
+ self.temperature: float = 1.0
30
+ self.top_p: float = 1.0
31
+ self.stream: bool = False
32
+ self.api_token: str = ""
33
+ self.openai_base_url: str = "https://router.huggingface.co/v1"
34
+ self.openai_image_format: str = "webp"
35
+ self.max_workers: int = 1
36
+ self.max_retries: int = 2
37
+ self.debug: bool = False
38
+ #output_format: str = "markdown",
39
+ self.output_format: str = "markdown"
40
+ self.output_dir_string: str = "output_dir_default"
41
+ self.use_llm: bool = False
42
+ self.force_ocr: bool = True #False,
43
+ self.strip_existing_ocr: bool = False #bool = False,
44
+ self.disable_ocr_math: bool = None #bool = False,
45
+ self.page_range: str = None
46
+ #self.weasyprint_dll_directories: str = None,
47
+ self.tz_hours: str = None
48
+ #oauth_token: gr.OAuthToken | None=None,
49
+ #progress: gr.Progress = gr.Progress(track_tqdm=True), #Progress tracker to keep tab on pool queue executor
50
+
51
 
52
  # Create a single, shared instance of the Config class
53
  # Other modules will import and use this instance.
54
  config_load_models = Config()
55
+ config_load = Config()
56
 
57
+ #if __name__ == "__main__":
58
+
tests/test_file_handler.py CHANGED
@@ -6,7 +6,7 @@ from pathlib import Path
6
  import tempfile
7
  from unittest.mock import patch
8
 
9
- from file_handler.file_utils import (
10
  collect_pdf_paths, collect_html_paths, collect_markdown_paths,
11
  process_dicts_data, create_outputdir
12
  )
 
6
  import tempfile
7
  from unittest.mock import patch
8
 
9
+ from utils.file_utils import (
10
  collect_pdf_paths, collect_html_paths, collect_markdown_paths,
11
  process_dicts_data, create_outputdir
12
  )
ui/gradio_process.py ADDED
@@ -0,0 +1,455 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ui/gradio_process.py
2
+
3
+ import gradio as gr
4
+ from concurrent.futures import ProcessPoolExecutor, as_completed
5
+ from tqdm import tqdm
6
+
7
+ import time
8
+
9
+ from pathlib import Path, WindowsPath
10
+ from typing import Optional, Union, Literal #, Dict, List, Any, Tuple
11
+
12
+ from huggingface_hub import get_token
13
+ import spaces ##HuggingFace spaces to accelerate GPU support on HF Spaces
14
+
15
+ #import utilities, helpers
16
+ #import utils.file_utils
17
+ from utils.file_utils import zip_processed_files, process_dicts_data, create_temp_folder #, collect_pdf_paths, collect_html_paths, collect_markdown_paths, create_outputdir ## should move to handling file
18
+ from utils.config import TITLE, DESCRIPTION, DESCRIPTION_PDF_HTML, DESCRIPTION_PDF, DESCRIPTION_HTML, DESCRIPTION_MD #, file_types_list, file_types_tuple
19
+ from utils.utils import is_dict, is_list_of_dicts
20
+ from utils.get_config import get_config_value
21
+
22
+ from llm.llm_login import get_login_token, is_loggedin_huggingface, login_huggingface
23
+ from converters.extraction_converter import DocumentConverter as docconverter #DocumentExtractor #as docextractor
24
+ from converters.pdf_to_md import PdfToMarkdownConverter #, init_worker
25
+ #from converters.md_to_pdf import MarkdownToPdfConverter ##SMY: PENDING: implementation
26
+
27
+ import traceback ## Extract, format and print information about Python stack traces.
28
+ from utils.logger import get_logger
29
+
30
+ logger = get_logger(__name__) ##NB: setup_logging() ## set logging
31
+
32
+ # Instantiate converters class once – they are stateless
33
+ pdf2md_converter = PdfToMarkdownConverter()
34
+ #md2pdf_converter = MarkdownToPdfConverter()
35
+
36
+
37
# User eXperience: Load Marker models ahead of time if not already loaded in reload mode
## SMY: 29Sept2025 - Came across https://github.com/xiaoyao9184/docker-marker/tree/master/gradio
from converters.extraction_converter import load_models
from globals import config_load_models

try:
    # getattr guard: `model_dict` is not visibly declared on Config, so a plain
    # attribute access could raise AttributeError and be mis-reported below as
    # a model-loading failure. Only load once per process (reload mode reuses).
    if not getattr(config_load_models, "model_dict", None):
        config_load_models.model_dict = load_models()
    logger.log(level=30, msg="Config_load_model: ",
               extra={"model_dict": str(config_load_models.model_dict)})
except Exception as exc:
    logger.exception(f"✗ Error loading models (reload): {exc}")
    # Re-raise with the original exception chained for debuggability.
    raise RuntimeError(f"✗ Error loading models (reload): {exc}") from exc
53
+
54
+ #def get_login_token( api_token_arg, oauth_token: gr.OAuthToken | None=None,): ##moved to llm_login
55
+
56
+
57
+ #duration = 5.75 * pdf_files_count if pdf_files_count>=2 else 7
58
+ #@spaces.GPU(duration=duration) ## HF Spaces GPU support
59
def get_results_files_conversion(pdf_files, pdf_files_count, progress2=gr.Progress(track_tqdm=True)):
    """
    Convert each uploaded file to markdown sequentially, reporting per-file
    progress to the Gradio UI.

    Marker manages its own internal pooling, so files are processed one at a
    time here; `progress2.tqdm` tracks the loop and an explicit `progress2`
    call refreshes the description after each conversion.

    Returns the list of per-file conversion results (dicts or error strings).
    """
    converted = []
    tracked_files = progress2.tqdm(
        iterable=pdf_files,
        desc=f"Processing file conversion ... pool.map",
        total=pdf_files_count,
    )
    for index, source_file in enumerate(tracked_files, start=1):
        outcome = pdf2md_converter.convert_files(source_file)

        # Refresh the progress bar with this file's (truncated) outcome so the
        # user sees per-file feedback; the short sleep lets the UI repaint.
        progress2(
            (index, pdf_files_count),
            desc=f"Processing file conversion result: {index}: {str(source_file)} : [{str(outcome)[:20]}]",
        )
        time.sleep(0.75)

        converted.append(outcome)

    return converted
81
+
82
+ ##SMY: TODO: future: refactor to gradio_process.py and
83
+ ## pull options to cli-options{"output_format":, "output_dir_string":, "use_llm":, "page_range":, "force_ocr":, "debug":, "strip_existing_ocr":, "disable_ocr_math""}
84
+ #@spaces.GPU
85
def convert_batch(
    pdf_files,  #: list[str] — paths of the uploaded files from gr.Files
    pdf_files_count: int,
    provider: str,
    model_id: str,
    #base_url: str
    hf_provider: str,
    endpoint: str,
    backend_choice: str,
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
    stream: bool,
    api_token_gr: str,
    openai_base_url: str = "https://router.huggingface.co/v1",
    openai_image_format: Optional[str] = "webp",
    max_workers: Optional[int] = 1,  #4,
    max_retries: Optional[int] = 2,
    debug: bool = False,
    output_format: Literal["markdown", "json", "html"] = "markdown",
    output_dir_string: str = "output_dir_default",
    use_llm: bool = False,
    force_ocr: bool = True,
    strip_existing_ocr: Optional[bool] = None,
    disable_ocr_math: Optional[bool] = None,
    page_range: str = None,
    weasyprint_dll_directories: str = None,  # weasyprint_libpath
    tz_hours: str = None,
    oauth_token: gr.OAuthToken | None = None,
    progress: gr.Progress = gr.Progress(track_tqdm=True),  # batch-level step tracker
    progress1: gr.Progress = gr.Progress(),  # per-result log tracker
):
    """
    Handle batch conversion of uploaded files to markdown.

    Generator driven by Gradio: every ``yield`` emits a 4-tuple matching
    ``outputs=[process_button, log_output, files_individual_JSON,
    files_individual_downloads]``.  Because this is a generator, a
    ``return <value>`` is invisible to Gradio — each terminal path therefore
    yields its final UI state first and then returns bare.

    Steps: resolve login → fill config defaults → publish shared globals →
    create output dir → convert files → collect logs → zip outputs → emit
    formatted results.
    """
    # 0) Commencing
    yield gr.update(interactive=False), f"Commencing Processing ... Getting login", {"process": "Commencing Processing"}, f"dummy_log.log"
    progress((0, 16), f"Commencing Processing ...")
    time.sleep(0.25)

    # Token precedence: explicit UI token wins, else the OAuth session token.
    # SMY: never log the token value here.
    api_token = get_login_token(api_token_arg=api_token_gr, oauth_token=oauth_token)

    progress((1, 16), desc=f"Log in: {is_loggedin_huggingface(api_token)}")
    time.sleep(0.25)

    if not pdf_files:  # None or empty list: nothing to convert
        logger.log(level=30, msg="Initialising ProcessPool: No files uploaded.", extra={"pdf_files": pdf_files, "files_len": pdf_files_count})
        # Fix: the original bare `return [...]` in a generator discarded the
        # UI update, so the user never saw the message — yield it first.
        yield gr.update(interactive=True), "Initialising ProcessPool: No files uploaded.", {"Upload": "No files uploaded"}, f"dummy_log.log"
        return

    progress((2, 16), desc=f"Getting configuration values")
    time.sleep(0.25)

    # Fall back to config.ini for any value the UI left empty.
    config_file = Path("utils") / "config.ini"  ##SMY: fixed path, sacrifices flexibility for speed
    model_id = model_id if model_id else get_config_value(config_file, "MARKER_CAP", "MODEL_ID")
    openai_base_url = openai_base_url if openai_base_url else get_config_value(config_file, "MARKER_CAP", "OPENAI_BASE_URL")
    openai_image_format = openai_image_format if openai_image_format else get_config_value(config_file, "MARKER_CAP", "OPENAI_IMAGE_FORMAT")
    max_workers = max_workers if max_workers else get_config_value(config_file, "MARKER_CAP", "MAX_WORKERS")
    max_retries = max_retries if max_retries else get_config_value(config_file, "MARKER_CAP", "MAX_RETRIES")
    output_format = output_format if output_format else get_config_value(config_file, "MARKER_CAP", "OUTPUT_FORMAT")
    output_dir_string = output_dir_string if output_dir_string else str(get_config_value(config_file, "MARKER_CAP", "OUTPUT_DIR"))
    use_llm = use_llm if use_llm else get_config_value(config_file, "MARKER_CAP", "USE_LLM")
    page_range = page_range if page_range else get_config_value(config_file, "MARKER_CAP", "PAGE_RANGE")
    weasyprint_dll_directories = weasyprint_dll_directories if weasyprint_dll_directories else None
    config_load_models.weasyprint_libpath = weasyprint_dll_directories  # share user's weasyprint path globally
    config_load_models.pdf_files_count = pdf_files_count

    progress((3, 16), desc=f"Retrieved configuration values")
    time.sleep(0.25)

    yield gr.update(interactive=False), f"Setting global variables : Initialising init_args", {"process": "Processing files ..."}, f"dummy_log.log"
    progress((4, 16), desc=f"Setting global variables : Initialiasing init_args")
    time.sleep(0.25)

    # Publish run parameters on the shared Config instance; the converter
    # classes read them from `globals.config_load` (replaces pool initargs).
    from globals import config_load
    config_load.provider = provider
    config_load.model_id = model_id
    config_load.hf_provider = hf_provider
    config_load.endpoint = endpoint
    config_load.backend_choice = backend_choice
    config_load.system_message = system_message
    config_load.max_tokens = max_tokens
    config_load.temperature = temperature
    config_load.top_p = top_p
    config_load.stream = stream
    config_load.api_token = api_token
    config_load.openai_base_url = openai_base_url
    config_load.openai_image_format = openai_image_format
    config_load.max_workers = max_workers
    config_load.max_retries = max_retries
    config_load.debug = debug
    config_load.output_format = output_format
    config_load.output_dir_string = output_dir_string
    config_load.use_llm = use_llm
    config_load.force_ocr = force_ocr
    config_load.strip_existing_ocr = strip_existing_ocr
    config_load.disable_ocr_math = disable_ocr_math
    config_load.page_range = page_range
    config_load.tz_hours = tz_hours

    # 1. Create output_dir (in a temp folder so Gradio can serve the files)
    try:
        yield gr.update(interactive=False), f"Creating output_dir ...", {"process": "Processing files ..."}, f"dummy_log.log"
        progress((5, 16), desc=f"ProcessPoolExecutor: Creating output_dir")
        time.sleep(0.25)

        output_dir = create_temp_folder(output_dir_string)
        config_load.output_dir = output_dir

        logger.info("✓ output_dir created: ", extra={"output_dir": config_load.output_dir.name, "in": str(config_load.output_dir.parent)})
        yield gr.update(interactive=False), f"Created output_dir ...", {"process": "Processing files ..."}, f"dummy_log.log"
        progress((6, 16), desc=f"✓ Created output_dir.")
        time.sleep(0.25)
    except Exception as exc:
        tb = traceback.format_exc()
        # Fix: original message lacked the f-prefix and logged literal "{exc}".
        logger.exception(f"✗ error creating output_dir → {exc}\n{tb}", exc_info=True)
        yield gr.update(interactive=True), f"✗ An error occurred creating output_dir: {str(exc)}", {"Error": f"Error: {exc}"}, f"dummy_log.log"
        return

    # 2. Convert the files. ProcessPoolExecutor was dropped deliberately:
    #    Marker already leverages ThreadPoolExecutor/ProcessPoolExecutor itself.
    try:
        results = []
        logger.log(level=30, msg="Initialising Processing Files ...", extra={"pdf_files": pdf_files, "files_len": len(pdf_files), "model_id": model_id, "output_dir": output_dir_string})
        yield gr.update(interactive=False), f"Initialising Processing Files ...", {"process": "Processing files ..."}, f"dummy_log.log"
        progress((7, 16), desc=f"Initialising Processing Files ...")
        time.sleep(0.25)

        try:
            progress((9, 16), desc=f"ProcessPoolExecutor: Pooling file conversion ...")
            time.sleep(0.25)
            yield gr.update(interactive=False), f"ProcessPoolExecutor: Pooling file conversion ...", {"process": "Processing files ..."}, f"dummy_log.log"

            results = get_results_files_conversion(pdf_files, pdf_files_count, progress)

            logger.log(level=30, msg="Got Results from files conversion: ", extra={"results": str(results)[:20]})
            yield gr.update(interactive=True), f"Got Results from files conversion: [{str(results)[:20]}]", {"process": "Processing files ..."}, f"dummy_log.log"
            progress((11, 16), desc=f"Got Results from files conversion")
            time.sleep(0.25)
        except Exception as exc:
            # Fix: was `tbp = traceback.print_exc()`, which returns None and
            # put the string "None" into the user-facing error payload.
            tb = traceback.format_exc()
            logger.exception("Error during pooling file conversion", exc_info=True)
            yield gr.update(interactive=True), f"An error occurred during pool.map: {str(exc)}", {"Error": f"Error: {exc}\n{tb}"}, f"dummy_log.log"
            return

    except Exception as exc:
        tb = traceback.format_exc()
        logger.exception(f"✗ Error during Files processing → {exc}\n{tb}", exc_info=True)
        yield gr.update(interactive=True), f"✗ An error occurred during Files Processing → {exc}", {"Error": f"Error: {exc}"}, f"dummy_log.log"
        return

    # 3. Collect per-file logs and output paths
    try:
        logger.log(level=20, msg="ProcessPoolExecutor pool result:", extra={"results": str(results)})
        progress((12, 16), desc="Processing results from files conversion")
        time.sleep(0.25)

        # logs entries are expected as {"file", "images", "filepath", "image_path"}
        # dicts, or error strings from failed conversions.
        logs = list(results)
        logs_files_images = []
        logs_count = 0
        for i, log in enumerate(logs):
            # Markdown path first, then each image path, for the downloads widget.
            logs_files_images.append(log.get("filepath") if is_dict(log) or is_list_of_dicts(logs) else "Error or no file_path")
            logs_files_images.extend(list(image for image in log.get("image_path", "Error or no image_path")))
            i_image_count = log.get("images", 0)
            progress1(0.7, desc=f"Processing result log {i}: {str(log)}")
            logs_count = i + i_image_count
    except Exception as exc:
        # Fix: `tb` was previously undefined in this handler (only print_exc()
        # had been called), so the error path itself raised NameError.
        tb = traceback.format_exc()
        logger.exception(f"Error during processing results logs → {exc}\n{tb}", exc_info=True)
        yield gr.update(interactive=True), f"An error occurred during processing results logs: {str(exc)}\n{tb}", {"Error": f"Error: {exc}"}, f"dummy_log.log"
        return

    # 4. Zip processed files and images; the archive goes first in the list
    try:
        progress((13, 16), desc="Zipping processed files and images")
        time.sleep(0.25)
        zipped_processed_files = zip_processed_files(root_dir=f"{output_dir}", file_paths=logs_files_images, tz_hours=tz_hours, date_format='%d%b%Y_%H-%M-%S')
        logs_files_images.insert(0, zipped_processed_files)

        progress((14, 16), desc="Zipped processed files and images")
        time.sleep(0.25)
    except Exception as exc:
        tb = traceback.format_exc()
        logger.exception(f"✗ Error during zipping processed files → {exc}\n{tb}", exc_info=True)
        yield gr.update(interactive=True), f"✗ An error occurred during zipping files → {exc}\n{tb}", {"Error": f"Error: {exc}"}, f"dummy_log.log"
        return

    # 5. Format and emit the final results
    try:
        progress((15, 16), desc="Formatting processed log results")
        time.sleep(0.25)

        # gr.JSON renders the structure directly — no json.dumps needed.
        logs_return_formatted_json_string = process_dicts_data(logs)
        # gr.Files needs plain strings, not Path objects.
        logs_files_images_return = list(str(path) if isinstance(path, Path) else path for path in logs_files_images)
        logger.log(level=20, msg="File conversion complete. Sending outcome to Gradio:", extra={"logs_files_image_return": str(logs_files_images_return)})

        progress((16, 16), desc="Complete processing and formatting file processing results")
        time.sleep(0.25)

        # outputs=[process_button, log_output, files_individual_JSON, files_individual_downloads]
        yield gr.update(interactive=True), gr.update(value=logs_return_formatted_json_string), gr.update(value=logs_return_formatted_json_string, visible=True), gr.update(value=logs_files_images_return, visible=True)
        return
    except Exception as exc:
        tb = traceback.format_exc()
        logger.exception(f"✗ Error during returning result logs → {exc}\n{tb}", exc_info=True)
        yield gr.update(interactive=True), f"✗ An error occurred during returning result logs→ {exc}\n{tb}", {"Error": f"Error: {exc}"}, f"dummy_log.log"
        return
405
+
406
+ #return "\n".join(log for log in logs), "\n".join(str(path) for path in logs_files_images)
407
+ #print(f'logs_files_images: {"\n".join(str(path) for path in logs_files_images)}')
408
+
409
+ ## SMY: to be implemented/refactored AND moved to logic file
410
+ '''
411
+ def convert_md_to_pdf(file: gr.File | None, folder: str | None) -> list[gr.File]:
412
+ """
413
+ Gradio callback for Markdown → PDF.
414
+ Returns a list of generated PDF files (as Gradio File objects).
415
+ """
416
+ if not file and not folder:
417
+ return []
418
+
419
+ md_paths = []
420
+
421
+ # Single file
422
+ if file:
423
+ md_path = Path(file.name)
424
+ md_paths.append(md_path)
425
+
426
+ # Folder
427
+ if folder:
428
+ try:
429
+ md_paths.extend(collect_markdown_paths(folder))
430
+ except Exception as exc:
431
+ logger.exception("Folder traversal failed.")
432
+ return []
433
+
434
+ if not md_paths:
435
+ return []
436
+
437
+ output_dir = Path("./generated_pdfs")
438
+ output_dir.mkdir(exist_ok=True)
439
+
440
+ pdf_files = md2pdf_converter.batch_convert(md_paths, output_dir)
441
+ # Convert to Gradio File objects
442
+ gr_files = [gr.File(path=str(p)) for p in pdf_files]
443
+ return gr_files
444
+ '''
445
+
446
+
447
+ ##====================
448
+ #Gradio interface moved to gradio_ui.py
449
+ #def build_interface() -> gr.Blocks:
450
+ # """
451
+ # Assemble the Gradio Blocks UI.
452
+ # """
453
+
454
if __name__ == "__main__":
    # Fix: the original guard compared against the literal string "__name__",
    # which can never equal the module's __name__, so the branch was dead.
    # NOTE(review): convert_batch() requires many arguments and is driven from
    # the Gradio UI; calling it bare here would raise TypeError, so this module
    # intentionally does nothing when executed directly.
    pass
ui/gradio_ui.py CHANGED
@@ -1,497 +1,20 @@
1
  # ui/gradio_ui.py
2
- from ast import Interactive
3
- import gradio as gr
4
- from concurrent.futures import ProcessPoolExecutor, as_completed
5
- import tqdm
6
- import asyncio ##future
7
- import time
8
-
9
- from pathlib import Path, WindowsPath
10
- from typing import Optional, Union #, Dict, List, Any, Tuple
11
 
12
- from huggingface_hub import get_token
13
- import spaces ##HuggingFace spaces to accelerate GPU support on HF Spaces
14
-
15
- #import file_handler
16
- from file_handler import file_utils
17
- import file_handler.file_utils
18
- from utils.config import TITLE, DESCRIPTION, DESCRIPTION_PDF_HTML, DESCRIPTION_PDF, DESCRIPTION_HTML, DESCRIPTION_MD, file_types_list, file_types_tuple
19
- from utils.utils import is_dict, is_list_of_dicts
20
- from file_handler.file_utils import zip_processed_files, process_dicts_data, collect_pdf_paths, collect_html_paths, collect_markdown_paths, create_outputdir ## should move to handling file
21
- from file_handler.file_utils import find_file
22
- from utils.get_config import get_config_value
23
 
24
  from llm.provider_validator import is_valid_provider, suggest_providers
25
- from llm.llm_login import get_login_token, is_loggedin_huggingface, login_huggingface
26
  from converters.extraction_converter import DocumentConverter as docconverter #DocumentExtractor #as docextractor
27
- from converters.pdf_to_md import PdfToMarkdownConverter, init_worker
28
- #from converters.md_to_pdf import MarkdownToPdfConverter ##SMY: PENDING: implementation
 
29
 
30
  import traceback ## Extract, format and print information about Python stack traces.
31
  from utils.logger import get_logger
32
 
33
  logger = get_logger(__name__) ##NB: setup_logging() ## set logging
34
 
35
- # Instantiate converters class once – they are stateless
36
- pdf2md_converter = PdfToMarkdownConverter()
37
- #md2pdf_converter = MarkdownToPdfConverter()
38
-
39
-
40
- # User eXperience: Load Marker models ahead of time if not already loaded in reload mode
41
- ## SMY: 29Sept2025 - Came across https://github.com/xiaoyao9184/docker-marker/tree/master/gradio
42
- from converters.extraction_converter import load_models
43
- from globals import config_load_models
44
- try:
45
- if not config_load_models.model_dict:
46
- model_dict = load_models()
47
- config_load_models.model_dict = model_dict
48
- '''if 'model_dict' not in globals():
49
- global model_dict
50
- model_dict = load_models()'''
51
- logger.log(level=30, msg="Config_load_model: ", extra={"model_dict": str(model_dict)})
52
- except Exception as exc:
53
- #tb = traceback.format_exc() #exc.__traceback__
54
- logger.exception(f"✗ Error loading models (reload): {exc}") #\n{tb}")
55
- raise RuntimeError(f"✗ Error loading models (reload): {exc}") #\n{tb}")
56
-
57
- #def get_login_token( api_token_arg, oauth_token: gr.OAuthToken | None=None,): ##moved to llm_login
58
-
59
- # pool executor to convert files called by Gradio
60
- ##SMY: TODO: future: refactor to gradio_process.py and
61
- ## pull options to cli-options{"output_format":, "output_dir_string":, "use_llm":, "page_range":, "force_ocr":, "debug":, "strip_existing_ocr":, "disable_ocr_math""}
62
- #@spaces.GPU
63
- def convert_batch(
64
- pdf_files, #: list[str],
65
- pdf_files_count: int,
66
- provider: str,
67
- model_id: str,
68
- #base_url: str
69
- hf_provider: str,
70
- endpoint: str,
71
- backend_choice: str,
72
- system_message: str,
73
- max_tokens: int,
74
- temperature: float,
75
- top_p: float,
76
- stream: bool,
77
- api_token_gr: str,
78
- #max_workers: int,
79
- #max_retries: int,
80
- openai_base_url: str = "https://router.huggingface.co/v1",
81
- openai_image_format: Optional[str] = "webp",
82
- max_workers: Optional[int] = 4,
83
- max_retries: Optional[int] = 2,
84
- output_format: str = "markdown",
85
- #output_dir: Optional[Union[str, Path]] = "output_dir",
86
- output_dir_string: str = "output_dir_default",
87
- use_llm: bool = False, #Optional[bool] = False, #True,
88
- force_ocr: bool = True, #Optional[bool] = False,
89
- page_range: str = None, #Optional[str] = None,
90
- weasyprint_dll_directories: str = None,
91
- tz_hours: str = None,
92
- oauth_token: gr.OAuthToken | None=None,
93
- progress: gr.Progress = gr.Progress(track_tqdm=True), #Progress tracker to keep tab on pool queue executor
94
- progress1: gr.Progress = gr.Progress(),
95
- #progress2: gr.Progress = gr.Progress(track_tqdm=True),
96
- ): #-> str:
97
- """
98
- Handles the conversion process using multiprocessing.
99
- Spins up a pool and converts all uploaded files in parallel.
100
- Aggregates per-file logs into one string.
101
- Receives Gradio component values, starting with the list of uploaded file paths
102
- """
103
-
104
- # login: Update the Gradio UI to improve user-friendly eXperience - commencing
105
- # [template]: #outputs=[process_button, log_output, files_individual_JSON, files_individual_downloads],
106
- yield gr.update(interactive=False), f"Commencing Processing ... Getting login", {"process": "Commencing Processing"}, f"dummy_log.log"
107
- progress((0,16), f"Commencing Processing ...")
108
- time.sleep(0.25)
109
-
110
- # get token from logged-in user:
111
- api_token = get_login_token(api_token_arg=api_token_gr, oauth_token=oauth_token)
112
- ##SMY: Strictly debug. Must not be live
113
- #logger.log(level=30, msg="Commencing: get_login_token", extra={"api_token": api_token, "api_token_gr": api_token_gr})
114
-
115
- '''try:
116
- ##SMY: might deprecate. To replace with oauth login from Gradio ui or integrate cleanly.
117
- #login_huggingface(api_token) ## attempt login if not already logged in. NB: HF CLI login prompt would not display in Process Worker.
118
-
119
- if is_loggedin_huggingface() and (api_token is None or api_token == ""):
120
- api_token = get_token() ##SMY: might be redundant
121
-
122
- elif is_loggedin_huggingface() is False and api_token:
123
- login_huggingface(api_token)
124
- # login: Update the Gradio UI to improve user-friendly eXperience
125
- #yield gr.update(interactive=False), f"login to HF: Processing files...", {"process": "Processing files"}, f"dummy_log.log"
126
- else:
127
- pass
128
- # login: Update the Gradio UI to improve user-friendly eXperience
129
- #yield gr.update(interactive=False), f"Not logged in to HF: Processing files...", {"process": "Processing files"}, f"dummy_log.log"
130
-
131
- except Exception as exc: # Catch all exceptions
132
- tb = traceback.format_exc()
133
- logger.exception(f"✗ Error during login_huggingface → {exc}\n{tb}", exc_info=True) # Log the full traceback
134
- return [gr.update(interactive=True), f"✗ An error occurred during login_huggingface → {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"dummy_log.log"] # return the exception message
135
- '''
136
- progress((1,16), desc=f"Log in: {is_loggedin_huggingface(api_token)}")
137
- time.sleep(0.25)
138
- ## debug
139
- #logger.log(level=30, msg="pdf_files_inputs", extra={"input_arg[0]:": pdf_files[0]})
140
-
141
- #if not files:
142
- if not pdf_files or pdf_files is None: ## Check if files is None. This handles the case where no files are uploaded.
143
- logger.log(level=30, msg="Initialising ProcessPool: No files uploaded.", extra={"pdf_files": pdf_files, "files_len": pdf_files_count})
144
- #outputs=[log_output, files_individual_JSON, files_individual_downloads],
145
- return [gr.update(interactive=True), "Initialising ProcessPool: No files uploaded.", {"Upload":"No files uploaded"}, f"dummy_log.log"]
146
-
147
- progress((2,16), desc=f"Getting configuration values")
148
- time.sleep(0.25)
149
- # Get config values if not provided
150
- #config_file = find_file("config.ini") ##from file_handler.file_utils ##takes a bit of time to process. #NeedOptimise
151
-
152
- config_file = Path("utils") / "config.ini" ##SMY: speed up sacrificing flexibility
153
- model_id = model_id if model_id else get_config_value(config_file, "MARKER_CAP", "MODEL_ID")
154
- openai_base_url = openai_base_url if openai_base_url else get_config_value(config_file, "MARKER_CAP", "OPENAI_BASE_URL")
155
- openai_image_format = openai_image_format if openai_image_format else get_config_value(config_file, "MARKER_CAP", "OPENAI_IMAGE_FORMAT")
156
- max_workers = max_workers if max_workers else get_config_value(config_file, "MARKER_CAP", "MAX_WORKERS")
157
- max_retries = max_retries if max_retries else get_config_value(config_file, "MARKER_CAP", "MAX_RETRIES")
158
- output_format = output_format if output_format else get_config_value(config_file, "MARKER_CAP", "OUTPUT_FORMAT")
159
- output_dir_string = output_dir_string if output_dir_string else str(get_config_value(config_file, "MARKER_CAP", "OUTPUT_DIR"))
160
- use_llm = use_llm if use_llm else get_config_value(config_file, "MARKER_CAP", "USE_LLM")
161
- page_range = page_range if page_range else get_config_value(config_file,"MARKER_CAP", "PAGE_RANGE")
162
- weasyprint_dll_directories= weasyprint_dll_directories if weasyprint_dll_directories else None
163
- config_load_models.weasyprint_libpath = weasyprint_dll_directories ## Assign user's weasyprint path to Global var
164
- config_load_models.pdf_files_count = pdf_files_count
165
-
166
- progress((3,16), desc=f"Retrieved configuration values")
167
- time.sleep(0.25)
168
-
169
- # Create the initargs tuple from the Gradio inputs: # 'files' is an iterable, and handled separately.
170
- yield gr.update(interactive=False), f"Initialising init_args", {"process": "Processing files ..."}, f"dummy_log.log"
171
- progress((4,16), desc=f"Initialiasing init_args")
172
- time.sleep(0.25)
173
- init_args = (
174
- provider,
175
- model_id,
176
- #base_url,
177
- hf_provider,
178
- endpoint,
179
- backend_choice,
180
- system_message,
181
- max_tokens,
182
- temperature,
183
- top_p,
184
- stream,
185
- api_token,
186
- openai_base_url,
187
- openai_image_format,
188
- max_workers,
189
- max_retries,
190
- output_format,
191
- output_dir_string,
192
- use_llm,
193
- force_ocr,
194
- page_range,
195
- #progress,
196
- )
197
-
198
- # create output_dir
199
- try:
200
- yield gr.update(interactive=False), f"Creating output_dir ...", {"process": "Processing files ..."}, f"dummy_log.log"
201
- progress((5,16), desc=f"ProcessPoolExecutor: Creating output_dir")
202
- time.sleep(0.25)
203
-
204
- #pdf2md_converter.output_dir_string = output_dir_string ##SMY: attempt setting directly to resolve pool.map iterable
205
-
206
- # Create Marker output_dir in temporary directory where Gradio can access it.
207
- output_dir = file_utils.create_temp_folder(output_dir_string)
208
- pdf2md_converter.output_dir = output_dir
209
-
210
- logger.info(f"✓ output_dir created: ", extra={"output_dir": pdf2md_converter.output_dir.name, "in": str(pdf2md_converter.output_dir.parent)})
211
- yield gr.update(interactive=False), f"Created output_dir ...", {"process": "Processing files ..."}, f"dummy_log.log"
212
- progress((6,16), desc=f"✓ Created output_dir.")
213
- time.sleep(0.25)
214
- except Exception as exc:
215
- tb = traceback.format_exc()
216
- tbp = traceback.print_exc() # Print the exception traceback
217
- logger.exception("✗ error creating output_dir → {exc}\n{tb}", exc_info=True) # Log the full traceback
218
-
219
- # Update the Gradio UI to improve user-friendly eXperience
220
- yield gr.update(interactive=True), f"✗ An error occurred creating output_dir: {str(exc)}", {"Error":f"Error: {exc}"}, f"dummy_log.log" ## return the exception message
221
- return f"An error occurred creating output_dir: {str(exc)}", f"Error: {exc}", f"Error: {exc}" ## return the exception message
222
-
223
- # Process file conversion leveraging ProcessPoolExecutor for efficiency
224
- try:
225
- results = [] ## initialised pool result holder
226
- logger.log(level=30, msg="Initialising ProcessPoolExecutor: pool:", extra={"pdf_files": pdf_files, "files_len": len(pdf_files), "model_id": model_id, "output_dir": output_dir_string}) #pdf_files_count
227
- yield gr.update(interactive=False), f"Initialising ProcessPoolExecutor: Processing Files ...", {"process": "Processing files ..."}, f"dummy_log.log"
228
- progress((7,16), desc=f"Initialising ProcessPoolExecutor: Processing Files ...")
229
- time.sleep(0.25)
230
-
231
- # Create a pool with init_worker initialiser
232
- with ProcessPoolExecutor(
233
- max_workers=max_workers,
234
- initializer=init_worker,
235
- initargs=init_args
236
- ) as pool:
237
- logger.log(level=30, msg="Initialising ProcessPoolExecutor: pool:", extra={"pdf_files": pdf_files, "files_len": len(pdf_files), "model_id": model_id, "output_dir": output_dir_string}) #pdf_files_count
238
- progress((8,16), desc=f"Starting ProcessPool queue: Processing Files ...")
239
- time.sleep(0.25)
240
-
241
- # Map the files (pdf_files) to the conversion function (pdf2md_converter.convert_file)
242
- # The 'docconverter' argument is implicitly handled by the initialiser
243
- #futures = [pool.map(pdf2md_converter.convert_files, f) for f in pdf_files]
244
- #logs = [f.result() for f in as_completed(futures)]
245
- #futures = [pool.submit(pdf2md_converter.convert_files, file) for file in pdf_files]
246
- #logs = [f.result() for f in futures]
247
- try:
248
- #yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion ...", {"process": "Processing files ..."}, f"dummy_log.log"
249
- progress((9,16), desc=f"ProcessPoolExecutor: Pooling file conversion ...")
250
- time.sleep(0.25)
251
- yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion ...", {"process": "Processing files ..."}, f"dummy_log.log"
252
-
253
- '''# Use progress.tqdm to integrate with the executor map
254
- #results = pool.map(pdf2md_converter.convert_files, pdf_files) ##SMY iterables #max_retries #output_dir_string)
255
- for result_interim in progress.tqdm(
256
- iterable=pool.map(pdf2md_converter.convert_files, pdf_files), #, max_retries), total=len(pdf_files)
257
- desc="ProcessPoolExecutor: Pooling file conversion ..."):
258
- results.append(result_interim)
259
-
260
- # Update the Gradio UI to improve user-friendly eXperience
261
- #yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]", {"process": "Processing files ..."}, f"dummy_log.log"
262
- #progress((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
263
- #progress2((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
264
- #time.sleep(0.25)'''
265
- #duration = 5.75 * pdf_files_count if pdf_files_count>=2 else 7
266
- #@spaces.GPU(duration=duration) ## HF Spaces GPU support
267
- def get_results_pool_map(pdf_files, pdf_files_count, progress2=gr.Progress()):
268
- #Use progress.tqdm to integrate with the executor map
269
- #results = pool.map(pdf2md_converter.convert_files, pdf_files) ##SMY iterables #max_retries #output_dir_string)
270
- for result_interim in progress2.tqdm(
271
- iterable=pool.map(pdf2md_converter.convert_files, pdf_files), #, max_retries), total=len(pdf_files)
272
- desc=f"ProcessPoolExecutor: Pooling file conversion ... pool.map",
273
- total=pdf_files_count):
274
- results.append(result_interim)
275
-
276
- # Update the Gradio UI to improve user-friendly eXperience
277
- #yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]", {"process": "Processing files ..."}, f"dummy_log.log"
278
- progress2((0,len(pdf_files)), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
279
- #progress2((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
280
- time.sleep(0.75) #.sleep(0.25)
281
-
282
- return results
283
- results = get_results_pool_map(pdf_files, pdf_files_count)
284
- yield gr.update(interactive=True), f"ProcessPoolExecutor: Got Results from files conversion: [{str(results)}[:20]]", {"process": "Processing files ..."}, f"dummy_log.log"
285
- progress((11,16), desc=f"ProcessPoolExecutor: Got Results from files conversion")
286
- time.sleep(0.25)
287
- except Exception as exc:
288
- # Raise the exception to stop the Gradio app: exception to halt execution
289
- logger.exception("Error during pooling file conversion", exc_info=True) # Log the full traceback
290
- tbp = traceback.print_exc() # Print the exception traceback
291
- # Update the Gradio UI to improve user-friendly eXperience
292
- yield gr.update(interactive=True), f"An error occurred during pool.map: {str(exc)}", {"Error":f"Error: {exc}\n{tbp}"}, f"dummy_log.log" ## return the exception message
293
- return [gr.update(interactive=True), f"An error occurred during pool.map: {str(exc)}", {"Error":f"Error: {exc}\n{tbp}"}, f"dummy_log.log"] ## return the exception message
294
-
295
- # Process file conversion results
296
- try:
297
- logger.log(level=20, msg="ProcessPoolExecutor pool result:", extra={"results": str(results)})
298
- progress((12,16), desc="Processing results from files conversion") ##rekickin
299
- time.sleep(0.25)
300
-
301
- logs = []
302
- logs_files_images = []
303
-
304
- #logs.extend(results) ## performant pythonic
305
- #logs = list[results] ##
306
- logs = [result for result in results] ## pythonic list comprehension
307
- # [template] ## logs : [file , images , filepath, image_path]
308
-
309
- #logs_files_images = logs_files.extend(logs_images) #zip(logs_files, logs_images) ##SMY: in progress
310
- logs_count = 0
311
- #for log in logs:
312
- for i, log in enumerate(logs):
313
- logs_files_images.append(log.get("filepath") if is_dict(log) or is_list_of_dicts(logs) else "Error or no file_path") # isinstance(log, (dict, str))
314
- logs_files_images.extend(list(image for image in log.get("image_path", "Error or no image_path")))
315
- i_image_count = log.get("images", 0)
316
- # Update the Gradio UI to improve user-friendly eXperience
317
- #yield gr.update(interactive=False), f"Processing files: {logs_files_images[logs_count]}", {"process": "Processing files"}, f"dummy_log.log"
318
- progress1(0.7, desc=f"Processing result log {i}: {str(log)}")
319
- logs_count = i+i_image_count
320
- except Exception as exc:
321
- tbp = traceback.print_exc() # Print the exception traceback
322
- logger.exception("Error during processing results logs → {exc}\n{tbp}", exc_info=True) # Log the full traceback
323
- return [gr.update(interactive=True), f"An error occurred during processing results logs: {str(exc)}\n{tb}", {"Error":f"Error: {exc}"}, f"dummy_log.log"] ## return the exception message
324
- #yield gr.update(interactive=True), f"An error occurred during processing results logs: {str(exc)}\n{tb}", {"Error":f"Error: {exc}"}, f"dummy_log.log" ## return the exception message
325
- except Exception as exc:
326
- tb = traceback.format_exc()
327
- logger.exception(f"✗ Error during ProcessPoolExecutor → {exc}\n{tb}" , exc_info=True) # Log the full traceback
328
- #traceback.print_exc() # Print the exception traceback
329
- yield gr.update(interactive=True), f"✗ An error occurred during ProcessPoolExecutor→ {exc}", {"Error":f"Error: {exc}"}, f"dummy_log.log" # return the exception message
330
-
331
- # Zip Processed Files and images. Insert to first index
332
- try: ##from file_handler.file_utils
333
- progress((13,16), desc="Zipping processed files and images")
334
- time.sleep(0.25)
335
- zipped_processed_files = zip_processed_files(root_dir=f"{output_dir}", file_paths=logs_files_images, tz_hours=tz_hours, date_format='%d%b%Y_%H-%M-%S') #date_format='%d%b%Y'
336
- logs_files_images.insert(0, zipped_processed_files)
337
-
338
-
339
- #yield gr.update(interactive=False), f"Processing zip and files: {logs_files_images}", {"process": "Processing files"}, f"dummy_log.log"
340
- progress((14,16), desc="Zipped processed files and images")
341
- time.sleep(0.25)
342
-
343
- except Exception as exc:
344
- tb = traceback.format_exc()
345
- logger.exception(f"✗ Error during zipping processed files → {exc}\n{tb}" , exc_info=True) # Log the full traceback
346
- #traceback.print_exc() # Print the exception traceback
347
- yield gr.update(interactive=True), f"✗ An error occurred during zipping files → {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"dummy_log.log" # return the exception message
348
- return gr.update(interactive=True), f"✗ An error occurred during zipping files → {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"dummy_log.log" # return the exception message
349
-
350
-
351
- # Return processed files log
352
- try:
353
- progress((15,16), desc="Formatting processed log results")
354
- time.sleep(0.25)
355
-
356
- ## # Convert logs list of dicts to formatted json string
357
- logs_return_formatted_json_string = file_handler.file_utils.process_dicts_data(logs) #"\n".join(log for log in logs) ##SMY outputs to gr.JSON component with no need for json.dumps(data, indent=)
358
- #logs_files_images_return = "\n".join(path for path in logs_files_images) ##TypeError: sequence item 0: expected str instance, WindowsPath found
359
-
360
- ## # Convert any Path objects to strings, but leave strings as-is
361
- logs_files_images_return = list(str(path) if isinstance(path, Path) else path for path in logs_files_images)
362
- logger.log(level=20, msg="File conversion complete. Sending outcome to Gradio:", extra={"logs_files_image_return": str(logs_files_images_return)}) ## debug: FileNotFoundError: [WinError 2] The system cannot find the file specified: 'Error or no image_path'
363
-
364
- progress((16,16), desc="Complete processing and formatting file processing results")
365
- time.sleep(0.25)
366
- # [templates]
367
- #outputs=[process_button, log_output, files_individual_JSON, files_individual_downloads],
368
- #return "\n".join(logs), "\n".join(logs_files_images) #"\n".join(logs_files)
369
-
370
- yield gr.update(interactive=True), gr.update(value=logs_return_formatted_json_string), gr.update(value=logs_return_formatted_json_string, visible=True), gr.update(value=logs_files_images_return, visible=True) ##SMY: redundant
371
- return [gr.update(interactive=True), gr.update(value=logs_return_formatted_json_string), gr.update(value=logs_return_formatted_json_string, visible=True), gr.update(value=logs_files_images_return, visible=True)]
372
- #yield gr.update(interactive=True), logs_return_formatted_json_string, logs_return_formatted_json_string, logs_files_images_return
373
- #return [gr.update(interactive=True), logs_return_formatted_json_string, logs_return_formatted_json_string, logs_files_images_return]
374
-
375
- except Exception as exc:
376
- tb = traceback.format_exc()
377
- logger.exception(f"✗ Error during returning result logs → {exc}\n{tb}" , exc_info=True) # Log the full traceback
378
- #traceback.print_exc() # Print the exception traceback
379
- yield gr.update(interactive=True), f"✗ An error occurred during returning result logs→ {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"dummy_log.log" # return the exception message
380
- return [gr.update(interactive=True), f"✗ An error occurred during returning result logs→ {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"dummy_log.log"] # return the exception message
381
-
382
- #return "\n".join(log for log in logs), "\n".join(str(path) for path in logs_files_images)
383
- #print(f'logs_files_images: {"\n".join(str(path) for path in logs_files_images)}')
384
-
385
- # files wrapping into list ##SMY: Flagged for deprecation
386
- def pdf_files_wrap(files: list[str]):
387
- # explicitly wrap file object in a list
388
- return [files] if not isinstance(files, list) else files
389
- #return [files]
390
-
391
- ##====================
392
- ## SMY: moved to logic file: See pdf_to_md.py. Currently unused
393
- def convert_pdfs_to_md(file: gr.File | None, folder: str | None) -> dict:
394
- """
395
- Gradio callback for PDF → Markdown.
396
- Accepts either a single file or a folder path (recursively).
397
- Leverages Marker, a pipeline of deep learning models, for conversion
398
- Returns a dictionary of filename → Markdown string.
399
- """
400
- if not file and not folder:
401
- return {"error": "Please provide a PDF file or a folder."}
402
-
403
- pdf_paths = []
404
-
405
- # Single file
406
- if file:
407
- pdf_path = Path(file.name)
408
- pdf_paths.append(pdf_path)
409
-
410
- # Folder (recursively)
411
- if folder:
412
- try:
413
- pdf_paths.extend(collect_pdf_paths(folder))
414
- except Exception as exc:
415
- logger.exception("Folder traversal failed.")
416
- return {"error": str(exc)}
417
-
418
- if not pdf_paths:
419
- return {"error": "No PDF files found."}
420
-
421
- results = pdf2md_converter.batch_convert(pdf_paths)
422
- # Gradio expects a dict of {filename: content}
423
- return results
424
-
425
- ## SMY: to be implemented AND to refactor and moved to logic file
426
- def convert_md_to_pdf(file: gr.File | None, folder: str | None) -> list[gr.File]:
427
- """
428
- Gradio callback for Markdown → PDF.
429
- Returns a list of generated PDF files (as Gradio File objects).
430
- """
431
- if not file and not folder:
432
- return []
433
-
434
- md_paths = []
435
-
436
- # Single file
437
- if file:
438
- md_path = Path(file.name)
439
- md_paths.append(md_path)
440
-
441
- # Folder
442
- if folder:
443
- try:
444
- md_paths.extend(collect_markdown_paths(folder))
445
- except Exception as exc:
446
- logger.exception("Folder traversal failed.")
447
- return []
448
-
449
- if not md_paths:
450
- return []
451
-
452
- output_dir = Path("./generated_pdfs")
453
- output_dir.mkdir(exist_ok=True)
454
-
455
- pdf_files = md2pdf_converter.batch_convert(md_paths, output_dir)
456
- # Convert to Gradio File objects
457
- gr_files = [gr.File(path=str(p)) for p in pdf_files]
458
- return gr_files
459
-
460
-
461
- ## SMY: to refactor and moved to logic file. Currently unused
462
- '''
463
- def convert_htmls_to_md(file: gr.File | None, folder: str | None) -> dict:
464
- """
465
- Gradio callback for HTML → Markdown.
466
- Accepts either a single file or a folder path (recursively).
467
- Returns a dictionary of filename → Markdown string.
468
- """
469
- if not file and not folder:
470
- return {"error": "Please provide a HTML file or a folder."}
471
-
472
- html_paths = []
473
-
474
- # Single file
475
- if file:
476
- html_path = Path(file.name)
477
- html_paths.append(html_path)
478
-
479
- # Folder (recursively)
480
- if folder:
481
- try:
482
- html_paths.extend(collect_html_paths(folder))
483
- except Exception as exc:
484
- logger.exception("Folder traversal failed.")
485
- return {"error": str(exc)}
486
-
487
- if not html_paths:
488
- return {"error": "No HTML files found."}
489
-
490
- results = html2md_converter.batch_convert(html_paths)
491
- # Gradio expects a dict of {filename: content}
492
- return results
493
- '''
494
-
495
  ##====================
496
 
497
  def build_interface() -> gr.Blocks:
@@ -520,45 +43,8 @@ def build_interface() -> gr.Blocks:
520
  }
521
  """
522
 
523
- ##SMY: flagged; to move to file_handler.file_utils
524
- def is_file_with_extension(path_obj: Path) -> bool:
525
- """
526
- Checks if a pathlib.Path object is a file and has a non-empty extension.
527
- """
528
- path_obj = path_obj if isinstance(path_obj, Path) else Path(path_obj) if isinstance(path_obj, str) else None
529
- return path_obj.is_file() and bool(path_obj.suffix)
530
-
531
- ##SMY: flagged; to move to file_handler.file_utils
532
- def accumulate_files(uploaded_files, current_state):
533
- """
534
- Accumulates newly uploaded files with the existing state.
535
- """
536
- # Initialize state if it's the first run
537
- if current_state is None:
538
- current_state = []
539
-
540
- # If no files were uploaded in this interaction, return the current state unchanged
541
- if not uploaded_files:
542
- return current_state, f"No new files uploaded. Still tracking {len(current_state)} file(s)."
543
-
544
- # Get the temporary paths of the newly uploaded files
545
- # call is_file_with_extension to check if pathlib.Path object is a file and has a non-empty extension
546
- new_file_paths = [f.name for f in uploaded_files if is_file_with_extension(Path(f.name))] #Path(f.name) and Path(f.name).is_file() and bool(Path(f.name).suffix)] #Path(f.name).suffix.lower() !=""]
547
-
548
- # Concatenate the new files with the existing ones in the state
549
- updated_files = current_state + new_file_paths
550
- updated_filenames = [Path(f).name for f in updated_files]
551
-
552
- updated_files_count = len(updated_files)
553
-
554
- # Return the updated state and a message to the user
555
- #file_info = "\n".join(updated_files)
556
- filename_info = "\n".join(updated_filenames)
557
- #message = f"Accumulated {len(updated_files)} file(s) total.\n\nAll file paths:\n{file_info}"
558
- message = f"Accumulated {len(updated_files)} file(s) total: \n{filename_info}"
559
-
560
- return updated_files, updated_files_count, message, gr.update(interactive=True), gr.update(interactive=True)
561
-
562
  # with gr.Blocks(title=TITLE) as demo
563
  with gr.Blocks(title=TITLE, css=custom_css) as demo:
564
  gr.Markdown(f"## {DESCRIPTION}")
@@ -653,18 +139,12 @@ def build_interface() -> gr.Blocks:
653
  label="Output Format",
654
  value="markdown",
655
  )
656
- output_dir_tb = gr.Textbox(
657
- label="Output Directory",
658
- value="output_dir", #"output_md",
659
- lines=1,
660
- max_lines=1,
661
- )
662
  with gr.Row():
663
  max_workers_sl = gr.Slider(
664
  label="Max Worker",
665
  minimum=1,
666
- maximum=7,
667
- value=4,
668
  step=1
669
  )
670
  max_retries_sl = gr.Slider(
@@ -674,14 +154,34 @@ def build_interface() -> gr.Blocks:
674
  value=2,
675
  step=1 #0.01
676
  )
 
 
 
 
 
 
 
677
  with gr.Column():
 
 
 
 
678
  use_llm_cb = gr.Checkbox(
679
  label="Use LLM for Marker conversion",
680
  value=False
681
  )
682
  force_ocr_cb = gr.Checkbox(
683
- label="Force OCR on all pages",
684
- value=True,
 
 
 
 
 
 
 
 
 
685
  )
686
  with gr.Column():
687
  page_range_tb = gr.Textbox(
@@ -729,14 +229,14 @@ def build_interface() -> gr.Blocks:
729
  btn_pdf_convert = gr.Button("Convert PDF(s)")
730
  '''
731
 
732
- file_types_list.extend(file_types_tuple)
733
  with gr.Column(elem_classes=["file-or-directory-area"]):
734
  with gr.Row():
735
  file_btn = gr.UploadButton(
736
  #file_btn = gr.File(
737
  label="Upload Multiple Files",
738
  file_count="multiple",
739
- file_types= file_types_list, #["file"], ##config.file_types_list
740
  #height=25, #"sm",
741
  size="sm",
742
  elem_classes=["gradio-upload-btn"]
@@ -745,7 +245,8 @@ def build_interface() -> gr.Blocks:
745
  #dir_btn = gr.File(
746
  label="Upload a Directory",
747
  file_count="directory",
748
- file_types= file_types_list, #["file"], #Warning: The `file_types` parameter is ignored when `file_count` is 'directory'
 
749
  #height=25, #"0.5",
750
  size="sm",
751
  elem_classes=["gradio-upload-btn"]
@@ -851,7 +352,7 @@ def build_interface() -> gr.Blocks:
851
  uploaded_file_list = gr.State([]) ##NB: initial value of `gr.State` must be able to be deepcopied
852
  uploaded_files_count = gr.State(0) ## initial files count
853
 
854
- state_max_workers = gr.State(4) #max_workers_sl,
855
  state_max_retries = gr.State(2) #max_retries_sl,
856
  state_tz_hours = gr.State(value=None)
857
  state_api_token = gr.State(None)
@@ -953,10 +454,6 @@ def build_interface() -> gr.Blocks:
953
  yield [], msg, None, None
954
  return [], 0, f"Files list cleared.", None, None
955
 
956
- #hf_login_logout_btn.click(fn=custom_do_logout, inputs=None, outputs=hf_login_logout_btn)
957
- ##unused
958
- ###hf_login_logout_btn.click(fn=custom_do_logout, inputs=[hf_login_logout_btn, state_api_token], outputs=[hf_login_logout_btn, api_token_tb, logout_status_md, state_api_token])
959
- ###logout_btn.click(fn=do_logout, inputs=None, outputs=[api_token_tb, logout_status_md, hf_login_logout_btn, logout_btn])
960
  #logout_btn.click(fn=clear_state, inputs=None, outputs=[uploaded_file_list, output_textbox, log_output, api_token_tb])
961
  hf_login_logout_btn.click(fn=custom_do_logout, inputs=None, outputs=[hf_login_logout_btn, api_token_tb, logout_status_md]) #, state_api_token])
962
 
@@ -1009,21 +506,22 @@ def build_interface() -> gr.Blocks:
1009
  top_p_sl,
1010
  stream_cb,
1011
  api_token_tb, #state_api_token, #api_token_tb,
1012
- #gr.State(4), # max_workers
1013
- #gr.State(3), # max_retries
1014
  openai_base_url_tb,
1015
  openai_image_format_dd,
1016
- state_max_workers, #gr.State(4), #max_workers_sl,
1017
  state_max_retries, #gr.State(2), #max_retries_sl,
 
1018
  output_format_dd,
1019
  output_dir_tb,
1020
  use_llm_cb,
1021
  force_ocr_cb,
 
 
1022
  page_range_tb,
1023
  weasyprint_dll_directories_tb,
1024
  tz_hours_num, #state_tz_hours
1025
  ]
1026
-
1027
  ## debug
1028
  #logger.log(level=30, msg="About to execute btn_pdf_convert.click", extra={"files_len": pdf_files_count, "pdf_files": pdf_files})
1029
 
@@ -1097,22 +595,7 @@ def build_interface() -> gr.Blocks:
1097
  fn=get_file_count,
1098
  inputs=[files_upload_html],
1099
  outputs=[html_files_count, log_output]
1100
- )
1101
-
1102
- # Validate files upload on change; warn but allow continue
1103
- def on_pdf_files_change(pdf_files_value: list[str]):
1104
- # explicitly wrap file object in a list
1105
- pdf_files_value = pdf_files_wrap(pdf_files_value)
1106
- #if not isinstance(pdf_files_value, list):
1107
- # pdf_files_value = [pdf_files_value]
1108
-
1109
- pdf_files_path = [file.name for file in pdf_files_value]
1110
- pdf_files_len = len(pdf_files_value) #len(pdf_files_path)
1111
- if pdf_files_value:
1112
- #return
1113
- return pdf_files_path, pdf_files_len
1114
- #pdf_files.change(on_pdf_files_change, inputs=pdf_files, outputs=[log_output, pdf_files_count]) #, postprocess=False) ##debug
1115
-
1116
 
1117
  return demo
1118
 
 
1
  # ui/gradio_ui.py
 
 
 
 
 
 
 
 
 
2
 
3
+ import gradio as gr
4
+ from ui.gradio_process import convert_batch
5
+ from globals import config_load
 
 
 
 
 
 
 
 
6
 
7
  from llm.provider_validator import is_valid_provider, suggest_providers
 
8
  from converters.extraction_converter import DocumentConverter as docconverter #DocumentExtractor #as docextractor
9
+
10
+ from utils.config import TITLE, DESCRIPTION, DESCRIPTION_PDF_HTML, DESCRIPTION_PDF, DESCRIPTION_HTML, DESCRIPTION_MD
11
+ from utils.file_utils import accumulate_files, is_file_with_extension
12
 
13
  import traceback ## Extract, format and print information about Python stack traces.
14
  from utils.logger import get_logger
15
 
16
  logger = get_logger(__name__) ##NB: setup_logging() ## set logging
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  ##====================
19
 
20
  def build_interface() -> gr.Blocks:
 
43
  }
44
  """
45
 
46
+ ##SMY: flagged; to move to file_handler.file_utils #accumulate_files()
47
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  # with gr.Blocks(title=TITLE) as demo
49
  with gr.Blocks(title=TITLE, css=custom_css) as demo:
50
  gr.Markdown(f"## {DESCRIPTION}")
 
139
  label="Output Format",
140
  value="markdown",
141
  )
 
 
 
 
 
 
142
  with gr.Row():
143
  max_workers_sl = gr.Slider(
144
  label="Max Worker",
145
  minimum=1,
146
+ maximum=4,
147
+ value=1,
148
  step=1
149
  )
150
  max_retries_sl = gr.Slider(
 
154
  value=2,
155
  step=1 #0.01
156
  )
157
+ output_dir_tb = gr.Textbox(
158
+ label="Output Directory",
159
+ value="output_dir", #"output_md",
160
+ lines=1,
161
+ max_lines=1,
162
+ )
163
+ with gr.Row():
164
  with gr.Column():
165
+ debug_cb = gr.Checkbox(
166
+ label="Run in debug mode. Not recommended",
167
+ value=False, #True,
168
+ )
169
  use_llm_cb = gr.Checkbox(
170
  label="Use LLM for Marker conversion",
171
  value=False
172
  )
173
  force_ocr_cb = gr.Checkbox(
174
+ label="Force OCR on all pages. (Beware: extended processing time)",
175
+ value=False, #True,
176
+ )
177
+ #with gr.Column():
178
+ strip_existing_ocr_cb = gr.Checkbox(
179
+ label="strip embedded OCR text, re-run OCR",
180
+ value=False
181
+ )
182
+ disable_ocr_math_cb = gr.Checkbox(
183
+ label="OCR: disable math - no inline math",
184
+ value=False,
185
  )
186
  with gr.Column():
187
  page_range_tb = gr.Textbox(
 
229
  btn_pdf_convert = gr.Button("Convert PDF(s)")
230
  '''
231
 
232
+ config_load.file_types_list.extend(config_load.file_types_tuple) ##allowed file types in global
233
  with gr.Column(elem_classes=["file-or-directory-area"]):
234
  with gr.Row():
235
  file_btn = gr.UploadButton(
236
  #file_btn = gr.File(
237
  label="Upload Multiple Files",
238
  file_count="multiple",
239
+ file_types= config_load.file_types_list, #["file"], ##config.file_types_list
240
  #height=25, #"sm",
241
  size="sm",
242
  elem_classes=["gradio-upload-btn"]
 
245
  #dir_btn = gr.File(
246
  label="Upload a Directory",
247
  file_count="directory",
248
+ #file_types= config_load.file_types_list, #["file"], #Warning: The `file_types` parameter is ignored when `file_count` is 'directory'
249
+ ## [handled in accumulate_files] file_types - raised Error(gradio.exceptions.Error: "Invalid file type
250
  #height=25, #"0.5",
251
  size="sm",
252
  elem_classes=["gradio-upload-btn"]
 
352
  uploaded_file_list = gr.State([]) ##NB: initial value of `gr.State` must be able to be deepcopied
353
  uploaded_files_count = gr.State(0) ## initial files count
354
 
355
+ state_max_workers = gr.State(1) #max_workers_sl, #4
356
  state_max_retries = gr.State(2) #max_retries_sl,
357
  state_tz_hours = gr.State(value=None)
358
  state_api_token = gr.State(None)
 
454
  yield [], msg, None, None
455
  return [], 0, f"Files list cleared.", None, None
456
 
 
 
 
 
457
  #logout_btn.click(fn=clear_state, inputs=None, outputs=[uploaded_file_list, output_textbox, log_output, api_token_tb])
458
  hf_login_logout_btn.click(fn=custom_do_logout, inputs=None, outputs=[hf_login_logout_btn, api_token_tb, logout_status_md]) #, state_api_token])
459
 
 
506
  top_p_sl,
507
  stream_cb,
508
  api_token_tb, #state_api_token, #api_token_tb,
 
 
509
  openai_base_url_tb,
510
  openai_image_format_dd,
511
+ state_max_workers, #gr.State(1), #max_workers_sl,
512
  state_max_retries, #gr.State(2), #max_retries_sl,
513
+ debug_cb,
514
  output_format_dd,
515
  output_dir_tb,
516
  use_llm_cb,
517
  force_ocr_cb,
518
+ strip_existing_ocr_cb,
519
+ disable_ocr_math_cb,
520
  page_range_tb,
521
  weasyprint_dll_directories_tb,
522
  tz_hours_num, #state_tz_hours
523
  ]
524
+
525
  ## debug
526
  #logger.log(level=30, msg="About to execute btn_pdf_convert.click", extra={"files_len": pdf_files_count, "pdf_files": pdf_files})
527
 
 
595
  fn=get_file_count,
596
  inputs=[files_upload_html],
597
  outputs=[html_files_count, log_output]
598
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
599
 
600
  return demo
601
 
utils/config.py CHANGED
@@ -28,13 +28,10 @@ DESCRIPTION_MD = (
28
  "Upload Markdown/LaTeX files and generate a polished PDF."
29
  )
30
 
31
- # File types
32
- file_types_list = []
33
- file_types_tuple = (".pdf", ".html", ".docx", ".doc")
34
- #file_types_list = list[file_types_tuple]
35
- #file_types_list.extend(file_types_tuple)
36
-
37
 
 
 
 
38
  # Conversion defaults
39
  DEFAULT_MARKER_OPTIONS = {
40
  "include_images": True,
@@ -86,4 +83,5 @@ hf_client = None
86
  artifact_dict = None
87
  pdf_converter = None
88
  html_converter = None
 
89
 
 
28
  "Upload Markdown/LaTeX files and generate a polished PDF."
29
  )
30
 
 
 
 
 
 
 
31
 
32
+ ##SMY: See config.ini
33
+ ##===================
34
+ '''
35
  # Conversion defaults
36
  DEFAULT_MARKER_OPTIONS = {
37
  "include_images": True,
 
83
  artifact_dict = None
84
  pdf_converter = None
85
  html_converter = None
86
+ '''
87
 
{file_handler → utils}/file_utils.py RENAMED
@@ -252,7 +252,7 @@ def zip_processed_files(root_dir: str, file_paths: list[str], tz_hours=None, dat
252
  """
253
 
254
  import zipfile
255
- from file_handler import file_utils
256
  from utils import utils
257
 
258
  root_path = Path(root_dir)
@@ -373,6 +373,40 @@ def process_dicts_data(data:Union[dict, list[dict]]):
373
 
374
  return formatted_string
375
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
376
  ##NB: Python =>3.10, X | Y equiv to the type checker as Union[X, Y]
377
  def collect_pdf_html_paths(root: Union[str, Path]) -> List[Path]:
378
  """
@@ -425,6 +459,7 @@ def write_markdown(
425
  src_path: Union[str, Path],
426
  output_dir: Union[str, Path],
427
  rendered: Any,
 
428
  ) -> Path:
429
 
430
  """
@@ -468,7 +503,15 @@ def write_markdown(
468
  #out_dir = Path(output_dir)
469
  #out_dir.mkdir(parents=True, exist_ok=True)
470
 
471
- md_name = f"{src.stem}.md"
 
 
 
 
 
 
 
 
472
  if isinstance(output_dir, Path):
473
  md_path = output_dir / f"{src.stem}" / md_name
474
  else:
@@ -484,10 +527,12 @@ def write_markdown(
484
  md_path.parent.chmod(0)
485
 
486
  try:
487
- markdown_text = getattr(rendered, "markdown") ##SMY: get extracted markdown
 
488
  except AttributeError as exc: # pragma: no cover
489
  raise AttributeError(
490
- "Extractor Rendered object must have a 'markdown' attribute"
 
491
  ) from exc
492
 
493
  with md_path.open(mode="w", encoding="utf-8") as md_f:
@@ -562,58 +607,3 @@ def dump_images(
562
  return images_count, img_path_list ##SMY: return number of images and path
563
  #return images.items().count
564
  #return len(images)
565
-
566
- # Dummp Markdown extracted images ##SMY: Marked for deprecated
567
- '''
568
- def dump_images(
569
- src_path: Union[str, Path],
570
- output_dir: Union[str, Path],
571
- rendered: Any,
572
- ) -> int:
573
-
574
- """
575
- Dump the images of the Markdown representation of a source file to an output directory.
576
-
577
- Parameters
578
- ----------
579
- src_path : str | Path
580
- Path to the original source file. Only its base name is used for naming
581
- the resulting Markdown file.
582
- output_dir : str | Path
583
- Directory where the Markdown file will be written. It was created if it does not
584
- exist with create_outputdir().
585
- rendered : object
586
- Object that provides a ``markdown`` attribute containing the text to write.
587
-
588
- Returns
589
- -------
590
- Number of images dumped from the Markdown file.
591
- """
592
-
593
- try:
594
- images: Mapping[str, bytes] = getattr(rendered, "images")
595
- except TypeError as exc: # pragma: no cover
596
- raise AttributeError(
597
- "Extracted images from rendered.images must be a mapping of str -> bytes"
598
- ) from exc
599
-
600
- images_count = 0
601
- ##SMY: See marker.output.save_output() : https://github.com/datalab-to/marker/blob/master/marker/output.py
602
- #for img_name, img_bytes in images.items():
603
- for img_name, img in images.items():
604
- # Resolve the full path and make sure any sub‑directories exist.
605
- img_path = Path(output_dir) / src_path / img_name ##SMY: image files ##concatenate Path + str
606
- img_path.parent.mkdir(parents=True, exist_ok=True)
607
-
608
- #'' '
609
- #with img_path.open("wb") as fp:
610
- # fp.write(img_bytes) ##SMY: write images to markdown folder
611
- #images_count += 1
612
- #'' '
613
- img.save(img_path) ##SMY: save images (of type PIL.Image.Image) to markdown folder
614
- images_count += 1
615
-
616
- return images_count ##SMY: return number of images
617
- #return images.items().count
618
- #return len(images)
619
- '''
 
252
  """
253
 
254
  import zipfile
255
+ from utils import file_utils
256
  from utils import utils
257
 
258
  root_path = Path(root_dir)
 
373
 
374
  return formatted_string
375
 
376
+ def accumulate_files(uploaded_files, current_state):
377
+ """
378
+ Accumulates newly uploaded files with the existing state.
379
+ """
380
+
381
+ from globals import config_load
382
+ import gradio as gr
383
+ # Initialize state if it's the first run
384
+ if current_state is None:
385
+ current_state = []
386
+
387
+ # If no files were uploaded in this interaction, return the current state unchanged
388
+ if not uploaded_files:
389
+ return current_state, f"No new files uploaded. Still tracking {len(current_state)} file(s)."
390
+
391
+ # Get the temporary paths of the newly uploaded files
392
+ # call is_file_with_extension to check if pathlib.Path object is a file and has a non-empty extension
393
+ #new_file_paths = [f.name for f in uploaded_files if is_file_with_extension(Path(f.name))] #Path(f.name) and Path(f.name).is_file() and bool(Path(f.name).suffix)] #Path(f.name).suffix.lower() !=""]
394
+ new_file_paths = [f.name for f in uploaded_files if is_file_with_extension(Path(f.name)) and f.name.endswith(config_load.file_types_tuple)]
395
+
396
+ # Concatenate the new files with the existing ones in the state
397
+ updated_files = current_state + new_file_paths
398
+ updated_filenames = [Path(f).name for f in updated_files]
399
+
400
+ updated_files_count = len(updated_files)
401
+
402
+ # Return the updated state and a message to the user
403
+ #file_info = "\n".join(updated_files)
404
+ filename_info = "\n".join(updated_filenames)
405
+ #message = f"Accumulated {len(updated_files)} file(s) total.\n\nAll file paths:\n{file_info}"
406
+ message = f"Accumulated {len(updated_files)} file(s) total: \n{filename_info}"
407
+
408
+ return updated_files, updated_files_count, message, gr.update(interactive=True), gr.update(interactive=True)
409
+
410
  ##NB: Python =>3.10, X | Y equiv to the type checker as Union[X, Y]
411
  def collect_pdf_html_paths(root: Union[str, Path]) -> List[Path]:
412
  """
 
459
  src_path: Union[str, Path],
460
  output_dir: Union[str, Path],
461
  rendered: Any,
462
+ output_format: str,
463
  ) -> Path:
464
 
465
  """
 
503
  #out_dir = Path(output_dir)
504
  #out_dir.mkdir(parents=True, exist_ok=True)
505
 
506
+ #md_name = f"{src.stem}.md"
507
+ output_handler = {
508
+ "markdown": "md",
509
+ "json": "json",
510
+ "html": "html",
511
+ }
512
+ output_ext = output_handler.get(output_format, "md")
513
+
514
+ md_name = f"{src.stem}.{output_ext}"
515
  if isinstance(output_dir, Path):
516
  md_path = output_dir / f"{src.stem}" / md_name
517
  else:
 
527
  md_path.parent.chmod(0)
528
 
529
  try:
530
+ #markdown_text = getattr(rendered, "markdown") ##SMY: get extracted markdown
531
+ markdown_text = getattr(rendered, output_format)
532
  except AttributeError as exc: # pragma: no cover
533
  raise AttributeError(
534
+ #"Extractor Rendered object must have a 'markdown' attribute"
535
+ f"Extractor Rendered object must have a '{output_format}' attribute"
536
  ) from exc
537
 
538
  with md_path.open(mode="w", encoding="utf-8") as md_f:
 
607
  return images_count, img_path_list ##SMY: return number of images and path
608
  #return images.items().count
609
  #return len(images)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/get_config.py CHANGED
@@ -14,7 +14,7 @@ sys.path.insert(0, f"{grandparent_dir}") #\\file_handler")
14
  ##end debug
15
  #'''
16
  #import file_handler
17
- from file_handler.file_utils import find_file
18
 
19
  def get_config_value(config_file:Path, section_key:str, parameter:str, fallback:str=None) -> str: # configfile: Union[str, Path]="utils\\config.ini"):
20
  """ Load config file, locate section, read parameter and return value
 
14
  ##end debug
15
  #'''
16
  #import file_handler
17
+ from utils.file_utils import find_file
18
 
19
  def get_config_value(config_file:Path, section_key:str, parameter:str, fallback:str=None) -> str: # configfile: Union[str, Path]="utils\\config.ini"):
20
  """ Load config file, locate section, read parameter and return value
utils/logger.py CHANGED
@@ -72,7 +72,7 @@ def setup_logging(level: int = None, tz_hours=None, date_format:str="%d%b%Y") ->
72
  # File handler
73
  #file_handler = logging.FileHandler("logs/app_logging_scrap.log", mode="a", encoding="utf-8")
74
  #file_handler = logging.FileHandler("logs/app_logging.log", mode="a", encoding="utf-8")
75
- from file_handler.file_utils import check_create_logfile
76
  file_handler = logging.FileHandler(check_create_logfile(filename="app_logging.log", tz_hours=tz_hours, date_format=date_format), mode="a", encoding="utf-8")
77
  ## Getting filepermission error
78
 
 
72
  # File handler
73
  #file_handler = logging.FileHandler("logs/app_logging_scrap.log", mode="a", encoding="utf-8")
74
  #file_handler = logging.FileHandler("logs/app_logging.log", mode="a", encoding="utf-8")
75
+ from utils.file_utils import check_create_logfile
76
  file_handler = logging.FileHandler(check_create_logfile(filename="app_logging.log", tz_hours=tz_hours, date_format=date_format), mode="a", encoding="utf-8")
77
  ## Getting filepermission error
78