semmyk commited on
Commit
7757db2
·
1 Parent(s): 8290881

baseline08_beta0.3.1_01Oct25: refactor progress feedback, gradio ui tweak, weasyprint dll path

Browse files
README.md CHANGED
@@ -193,7 +193,8 @@ parserpdf/
193
  - Process: Outputs Markdown files with extracted text/images to `output_dir`.
194
 
195
  ## Configuration
196
- - Edit `utils/config.py` or `utils/config.ini` for defaults (e.g., model ID, output dir).
 
197
  - UI overrides: Adjust sliders for max tokens, temperature, workers, etc.
198
 
199
  ## LLM Providers
 
193
  - Process: Outputs Markdown files with extracted text/images to `output_dir`.
194
 
195
  ## Configuration
196
+ - Edit `utils/config.ini` or `utils/config.py` for defaults (e.g., model ID, output dir).
197
+ - On Windows, set WeasyPrint's GTK path: e.g. "C:\\Dat\\dev\\gtk3-runtime\\bin" or "C:\\msys64\\mingw64\\bin"
198
  - UI overrides: Adjust sliders for max tokens, temperature, workers, etc.
199
 
200
  ## LLM Providers
converters/pdf_to_md.py CHANGED
@@ -5,11 +5,9 @@ from typing import List, Dict, Union, Optional
5
  import traceback ## Extract, format and print information about Python stack traces.
6
  import time
7
 
8
- from ui.gradio_ui import gr
9
  from converters.extraction_converter import DocumentConverter #, DocumentExtractor #as docextractor #ExtractionConverter #get_extraction_converter ## SMY: should disuse
10
  from file_handler.file_utils import write_markdown, dump_images, collect_pdf_paths, collect_html_paths, collect_markdown_paths, create_outputdir
11
 
12
-
13
  from utils import config
14
  from utils.lib_loader import set_weasyprint_library
15
  from utils.logger import get_logger
@@ -45,9 +43,7 @@ def init_worker(#self,
45
  use_llm: bool, #: bool | None = False,
46
  force_ocr: bool,
47
  page_range: str, #: str | None = None
48
- progress: gr.Progress = gr.Progress(),
49
  ):
50
-
51
  #'''
52
  """
53
  instantiate DocumentConverter/DocumentExtractor for use in each pool worker
@@ -64,6 +60,7 @@ def init_worker(#self,
64
  #'''
65
  # 1) Instantiate the DocumentConverter
66
  logger.log(level=20, msg="initialising docconverter:", extra={"model_id": model_id, "hf_provider": hf_provider}) ##debug
 
67
  try:
68
  docconverter = DocumentConverter(
69
  model_id, #: str,
@@ -121,10 +118,12 @@ class PdfToMarkdownConverter:
121
  Returns a dict with metadata, e.g. {"filename": <file.name>, "images": <count>, "filepath": <filepath>}.
122
  """
123
 
 
124
  try:
125
  ## SMY: TODO: convert htmls to PDF. Marker will by default attempt weasyprint which typically raise 'libgobject-2' error on Win
 
126
  # Set a new environment variable
127
- set_weasyprint_library() ##utils.lib_loader.set_weasyprint_library()
128
  except Exception as exc:
129
  tb = traceback.format_exc()
130
  logger.exception(f"Error loading weasyprint backend dependency → {exc}\n{tb}", exc_info=True) # Log the full traceback
@@ -173,7 +172,7 @@ class PdfToMarkdownConverter:
173
 
174
  #def convert_files(src_path: str, output_dir: str, max_retries: int = 2) -> str:
175
  #def convert_files(self, src_path: str, output_dir_string: str = None, max_retries: int = 2, progress = gr.Progress()) -> Union[Dict, str]: #str:
176
- def convert_files(self, src_path: str, max_retries: int = 2, progress = gr.Progress()) -> Union[Dict, str]:
177
  #def convert_files(self, src_path: str) -> str:
178
  """
179
  Worker task: use `extractor` to convert file with retry/backoff.
 
5
  import traceback ## Extract, format and print information about Python stack traces.
6
  import time
7
 
 
8
  from converters.extraction_converter import DocumentConverter #, DocumentExtractor #as docextractor #ExtractionConverter #get_extraction_converter ## SMY: should disuse
9
  from file_handler.file_utils import write_markdown, dump_images, collect_pdf_paths, collect_html_paths, collect_markdown_paths, create_outputdir
10
 
 
11
  from utils import config
12
  from utils.lib_loader import set_weasyprint_library
13
  from utils.logger import get_logger
 
43
  use_llm: bool, #: bool | None = False,
44
  force_ocr: bool,
45
  page_range: str, #: str | None = None
 
46
  ):
 
47
  #'''
48
  """
49
  instantiate DocumentConverter/DocumentExtractor for use in each pool worker
 
60
  #'''
61
  # 1) Instantiate the DocumentConverter
62
  logger.log(level=20, msg="initialising docconverter:", extra={"model_id": model_id, "hf_provider": hf_provider}) ##debug
63
+
64
  try:
65
  docconverter = DocumentConverter(
66
  model_id, #: str,
 
118
  Returns a dict with metadata, e.g. {"filename": <file.name>, "images": <count>, "filepath": <filepath>}.
119
  """
120
 
121
+ from globals import config_load_models
122
  try:
123
  ## SMY: TODO: convert htmls to PDF. Marker will by default attempt weasyprint which typically raise 'libgobject-2' error on Win
124
+ weasyprint_libpath = config_load_models.weasyprint_libpath if config_load_models.weasyprint_libpath else None
125
  # Set a new environment variable
126
+ set_weasyprint_library(weasyprint_libpath) ##utils.lib_loader.set_weasyprint_library()
127
  except Exception as exc:
128
  tb = traceback.format_exc()
129
  logger.exception(f"Error loading weasyprint backend dependency → {exc}\n{tb}", exc_info=True) # Log the full traceback
 
172
 
173
  #def convert_files(src_path: str, output_dir: str, max_retries: int = 2) -> str:
174
  #def convert_files(self, src_path: str, output_dir_string: str = None, max_retries: int = 2, progress = gr.Progress()) -> Union[Dict, str]: #str:
175
+ def convert_files(self, src_path: str, max_retries: int = 2) -> Union[Dict, str]:
176
  #def convert_files(self, src_path: str) -> str:
177
  """
178
  Worker task: use `extractor` to convert file with retry/backoff.
file_handler/file_utils.py CHANGED
@@ -479,8 +479,8 @@ def write_markdown(
479
  #md_path = Path("data") / output_dir / f"{src.stem}" / md_name ##debug
480
  md_path = Path(output_dir) / f"{src.stem}" / md_name ##debug
481
  ##SMY: [resolved] Permission Errno13 - https://stackoverflow.com/a/57454275
482
- #md_path.parent.mkdir(mode=0o2755, parents=True, exist_ok=True) ##SMY: create nested md_path if not exists
483
- md_path.parent.mkdir(parents=True, exist_ok=True) ##SMY: md_path now resides in Temp
484
  md_path.parent.chmod(0)
485
 
486
  try:
 
479
  #md_path = Path("data") / output_dir / f"{src.stem}" / md_name ##debug
480
  md_path = Path(output_dir) / f"{src.stem}" / md_name ##debug
481
  ##SMY: [resolved] Permission Errno13 - https://stackoverflow.com/a/57454275
482
+ md_path.parent.mkdir(mode=0o2755, parents=True, exist_ok=True) ##SMY: create nested md_path if not exists
483
+ #md_path.parent.mkdir(parents=True, exist_ok=True) ##SMY: md_path now resides in Temp
484
  md_path.parent.chmod(0)
485
 
486
  try:
globals.py CHANGED
@@ -6,6 +6,8 @@ class Config:
6
  """ Single model_dict use across the app"""
7
  def __init__(self):
8
  self.model_dict = {}
 
 
9
 
10
  # Create a single, shared instance of the Config class
11
  # Other modules will import and use this instance.
 
6
  """ Single model_dict use across the app"""
7
  def __init__(self):
8
  self.model_dict = {}
9
+ self.weasyprint_libpath = ""
10
+ self.config_ini = "utils\\config.ini"
11
 
12
  # Create a single, shared instance of the Config class
13
  # Other modules will import and use this instance.
ui/gradio_ui.py CHANGED
@@ -2,7 +2,8 @@
2
  from ast import Interactive
3
  import gradio as gr
4
  from concurrent.futures import ProcessPoolExecutor, as_completed
5
- import asyncio
 
6
 
7
  from pathlib import Path, WindowsPath
8
  from typing import Optional, Union #, Dict, List, Any, Tuple
@@ -83,9 +84,12 @@ def convert_batch(
83
  use_llm: bool = False, #Optional[bool] = False, #True,
84
  force_ocr: bool = True, #Optional[bool] = False,
85
  page_range: str = None, #Optional[str] = None,
 
86
  tz_hours: str = None,
87
  oauth_token: gr.OAuthToken | None=None,
88
- progress: gr.Progress = gr.Progress(), #Progress tracker to keep tab on pool queue executor
 
 
89
  ): #-> str:
90
  """
91
  Handles the conversion process using multiprocessing.
@@ -98,6 +102,7 @@ def convert_batch(
98
  # [template]: #outputs=[process_button, log_output, files_individual_JSON, files_individual_downloads],
99
  yield gr.update(interactive=False), f"Commencing Processing ... Getting login", {"process": "Commencing Processing"}, f"dummy_log.log"
100
  progress((0,16), f"Commencing Processing ...")
 
101
 
102
  # get token from logged-in user:
103
  api_token = get_login_token(api_token_arg=api_token_gr, oauth_token=oauth_token)
@@ -126,6 +131,7 @@ def convert_batch(
126
  return [gr.update(interactive=True), f"✗ An error occurred during login_huggingface → {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"dummy_log.log"] # return the exception message
127
  '''
128
  progress((1,16), desc=f"Log in: {is_loggedin_huggingface(api_token)}")
 
129
  ## debug
130
  #logger.log(level=30, msg="pdf_files_inputs", extra={"input_arg[0]:": pdf_files[0]})
131
 
@@ -136,8 +142,11 @@ def convert_batch(
136
  return [gr.update(interactive=True), "Initialising ProcessPool: No files uploaded.", {"Upload":"No files uploaded"}, f"dummy_log.log"]
137
 
138
  progress((2,16), desc=f"Getting configuration values")
 
139
  # Get config values if not provided
140
- config_file = find_file("config.ini") ##from file_handler.file_utils ##takes a bit of time to process. #NeedOptimise
 
 
141
  model_id = model_id if model_id else get_config_value(config_file, "MARKER_CAP", "MODEL_ID")
142
  openai_base_url = openai_base_url if openai_base_url else get_config_value(config_file, "MARKER_CAP", "OPENAI_BASE_URL")
143
  openai_image_format = openai_image_format if openai_image_format else get_config_value(config_file, "MARKER_CAP", "OPENAI_IMAGE_FORMAT")
@@ -147,12 +156,16 @@ def convert_batch(
147
  output_dir_string = output_dir_string if output_dir_string else str(get_config_value(config_file, "MARKER_CAP", "OUTPUT_DIR"))
148
  use_llm = use_llm if use_llm else get_config_value(config_file, "MARKER_CAP", "USE_LLM")
149
  page_range = page_range if page_range else get_config_value(config_file,"MARKER_CAP", "PAGE_RANGE")
 
 
150
 
151
  progress((3,16), desc=f"Retrieved configuration values")
 
152
 
153
  # Create the initargs tuple from the Gradio inputs: # 'files' is an iterable, and handled separately.
154
- progress((4,16), desc=f"Initialiasing init_args")
155
  yield gr.update(interactive=False), f"Initialising init_args", {"process": "Processing files ..."}, f"dummy_log.log"
 
 
156
  init_args = (
157
  provider,
158
  model_id,
@@ -175,13 +188,15 @@ def convert_batch(
175
  use_llm,
176
  force_ocr,
177
  page_range,
 
178
  )
179
 
180
  # create output_dir
181
  try:
182
  yield gr.update(interactive=False), f"Creating output_dir ...", {"process": "Processing files ..."}, f"dummy_log.log"
183
  progress((5,16), desc=f"ProcessPoolExecutor: Creating output_dir")
184
-
 
185
  #pdf2md_converter.output_dir_string = output_dir_string ##SMY: attempt setting directly to resolve pool.map iterable
186
 
187
  # Create Marker output_dir in temporary directory where Gradio can access it.
@@ -191,6 +206,7 @@ def convert_batch(
191
  logger.info(f"✓ output_dir created: ", extra={"output_dir": pdf2md_converter.output_dir.name, "in": str(pdf2md_converter.output_dir.parent)})
192
  yield gr.update(interactive=False), f"Created output_dir ...", {"process": "Processing files ..."}, f"dummy_log.log"
193
  progress((6,16), desc=f"✓ Created output_dir.")
 
194
  except Exception as exc:
195
  tb = traceback.format_exc()
196
  tbp = traceback.print_exc() # Print the exception traceback
@@ -206,6 +222,7 @@ def convert_batch(
206
  logger.log(level=30, msg="Initialising ProcessPoolExecutor: pool:", extra={"pdf_files": pdf_files, "files_len": len(pdf_files), "model_id": model_id, "output_dir": output_dir_string}) #pdf_files_count
207
  yield gr.update(interactive=False), f"Initialising ProcessPoolExecutor: Processing Files ...", {"process": "Processing files ..."}, f"dummy_log.log"
208
  progress((7,16), desc=f"Initialising ProcessPoolExecutor: Processing Files ...")
 
209
 
210
  # Create a pool with init_worker initialiser
211
  with ProcessPoolExecutor(
@@ -215,6 +232,7 @@ def convert_batch(
215
  ) as pool:
216
  logger.log(level=30, msg="Initialising ProcessPoolExecutor: pool:", extra={"pdf_files": pdf_files, "files_len": len(pdf_files), "model_id": model_id, "output_dir": output_dir_string}) #pdf_files_count
217
  progress((8,16), desc=f"Starting ProcessPool queue: Processing Files ...")
 
218
 
219
  # Map the files (pdf_files) to the conversion function (pdf2md_converter.convert_file)
220
  # The 'docconverter' argument is implicitly handled by the initialiser
@@ -223,21 +241,27 @@ def convert_batch(
223
  #futures = [pool.submit(pdf2md_converter.convert_files, file) for file in pdf_files]
224
  #logs = [f.result() for f in futures]
225
  try:
226
- yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion ...", {"process": "Processing files ..."}, f"dummy_log.log"
227
  progress((9,16), desc=f"ProcessPoolExecutor: Pooling file conversion ...")
 
 
228
 
229
  # Use progress.tqdm to integrate with the executor map
230
  #results = pool.map(pdf2md_converter.convert_files, pdf_files) ##SMY iterables #max_retries #output_dir_string)
231
  for result_interim in progress.tqdm(
232
- iterable=pool.map(pdf2md_converter.convert_files, pdf_files) #, max_retries), total=len(pdf_files)
233
- ):
234
  results.append(result_interim)
 
235
  # Update the Gradio UI to improve user-friendly eXperience
236
- yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]", {"process": "Processing files ..."}, f"dummy_log.log"
237
- progress((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
 
 
238
 
239
  yield gr.update(interactive=True), f"ProcessPoolExecutor: Got Results from files conversion: [{str(result_interim)}[:20]]", {"process": "Processing files ..."}, f"dummy_log.log"
240
  progress((11,16), desc=f"ProcessPoolExecutor: Got Results from files conversion")
 
241
  except Exception as exc:
242
  # Raise the exception to stop the Gradio app: exception to halt execution
243
  logger.exception("Error during pooling file conversion", exc_info=True) # Log the full traceback
@@ -248,8 +272,9 @@ def convert_batch(
248
 
249
  # Process file conversion results
250
  try:
251
- progress((12,16), desc="Processing results from files conversion") ##rekickin
252
  logger.log(level=20, msg="ProcessPoolExecutor pool result:", extra={"results": str(results)})
 
 
253
 
254
  logs = []
255
  logs_files_images = []
@@ -265,10 +290,11 @@ def convert_batch(
265
  for i, log in enumerate(logs):
266
  logs_files_images.append(log.get("filepath") if is_dict(log) or is_list_of_dicts(logs) else "Error or no file_path") # isinstance(log, (dict, str))
267
  logs_files_images.extend(list(image for image in log.get("image_path", "Error or no image_path")))
268
- i_image = log.get("images", 0)
269
  # Update the Gradio UI to improve user-friendly eXperience
270
  #yield gr.update(interactive=False), f"Processing files: {logs_files_images[logs_count]}", {"process": "Processing files"}, f"dummy_log.log"
271
- logs_count = i+i_image
 
272
  except Exception as exc:
273
  tbp = traceback.print_exc() # Print the exception traceback
274
  logger.exception("Error during processing results logs → {exc}\n{tbp}", exc_info=True) # Log the full traceback
@@ -283,11 +309,14 @@ def convert_batch(
283
  # Zip Processed Files and images. Insert to first index
284
  try: ##from file_handler.file_utils
285
  progress((13,16), desc="Zipping processed files and images")
 
286
  zipped_processed_files = zip_processed_files(root_dir=f"{output_dir}", file_paths=logs_files_images, tz_hours=tz_hours, date_format='%d%b%Y_%H-%M-%S') #date_format='%d%b%Y'
287
  logs_files_images.insert(0, zipped_processed_files)
288
 
289
- progress((14,16), desc="Zipped processed files and images")
290
  #yield gr.update(interactive=False), f"Processing zip and files: {logs_files_images}", {"process": "Processing files"}, f"dummy_log.log"
 
 
291
 
292
  except Exception as exc:
293
  tb = traceback.format_exc()
@@ -300,6 +329,7 @@ def convert_batch(
300
  # Return processed files log
301
  try:
302
  progress((15,16), desc="Formatting processed log results")
 
303
 
304
  ## # Convert logs list of dicts to formatted json string
305
  logs_return_formatted_json_string = file_handler.file_utils.process_dicts_data(logs) #"\n".join(log for log in logs) ##SMY outputs to gr.JSON component with no need for json.dumps(data, indent=)
@@ -310,6 +340,7 @@ def convert_batch(
310
  logger.log(level=20, msg="File conversion complete. Sending outcome to Gradio:", extra={"logs_files_image_return": str(logs_files_images_return)}) ## debug: FileNotFoundError: [WinError 2] The system cannot find the file specified: 'Error or no image_path'
311
 
312
  progress((16,16), desc="Complete processing and formatting file processing results")
 
313
  # [templates]
314
  #outputs=[process_button, log_output, files_individual_JSON, files_individual_downloads],
315
  #return "\n".join(logs), "\n".join(logs_files_images) #"\n".join(logs_files)
@@ -581,7 +612,8 @@ def build_interface() -> gr.Blocks:
581
  gr.Markdown(f"#### **Marker Configuration**")
582
  with gr.Row():
583
  openai_base_url_tb = gr.Textbox(
584
- label="OpenAI Base URL: Default HuggingFace",
 
585
  value="https://router.huggingface.co/v1",
586
  lines=1,
587
  max_lines=1,
@@ -624,15 +656,24 @@ def build_interface() -> gr.Blocks:
624
  value=False
625
  )
626
  force_ocr_cb = gr.Checkbox(
627
- label="force OCR on all pages",
628
  value=True,
629
  )
630
- page_range_tb = gr.Textbox(
631
- label="Page Range (Optional)",
632
- placeholder="Example: 0,1-5,8,12-15",
633
- lines=1,
634
- max_lines=1,
635
- )
 
 
 
 
 
 
 
 
 
636
 
637
 
638
  with gr.Accordion("🤗 HuggingFace Client Logout", open=True): #, open=False):
@@ -952,6 +993,7 @@ def build_interface() -> gr.Blocks:
952
  use_llm_cb,
953
  force_ocr_cb,
954
  page_range_tb,
 
955
  tz_hours_num, #state_tz_hours
956
  ]
957
 
 
2
  from ast import Interactive
3
  import gradio as gr
4
  from concurrent.futures import ProcessPoolExecutor, as_completed
5
+ import asyncio ##future
6
+ import time
7
 
8
  from pathlib import Path, WindowsPath
9
  from typing import Optional, Union #, Dict, List, Any, Tuple
 
84
  use_llm: bool = False, #Optional[bool] = False, #True,
85
  force_ocr: bool = True, #Optional[bool] = False,
86
  page_range: str = None, #Optional[str] = None,
87
+ weasyprint_dll_directories: str = None,
88
  tz_hours: str = None,
89
  oauth_token: gr.OAuthToken | None=None,
90
+ progress: gr.Progress = gr.Progress(track_tqdm=True), #Progress tracker to keep tab on pool queue executor
91
+ progress1: gr.Progress = gr.Progress(),
92
+ #progress2: gr.Progress = gr.Progress(track_tqdm=True),
93
  ): #-> str:
94
  """
95
  Handles the conversion process using multiprocessing.
 
102
  # [template]: #outputs=[process_button, log_output, files_individual_JSON, files_individual_downloads],
103
  yield gr.update(interactive=False), f"Commencing Processing ... Getting login", {"process": "Commencing Processing"}, f"dummy_log.log"
104
  progress((0,16), f"Commencing Processing ...")
105
+ time.sleep(0.25)
106
 
107
  # get token from logged-in user:
108
  api_token = get_login_token(api_token_arg=api_token_gr, oauth_token=oauth_token)
 
131
  return [gr.update(interactive=True), f"✗ An error occurred during login_huggingface → {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"dummy_log.log"] # return the exception message
132
  '''
133
  progress((1,16), desc=f"Log in: {is_loggedin_huggingface(api_token)}")
134
+ time.sleep(0.25)
135
  ## debug
136
  #logger.log(level=30, msg="pdf_files_inputs", extra={"input_arg[0]:": pdf_files[0]})
137
 
 
142
  return [gr.update(interactive=True), "Initialising ProcessPool: No files uploaded.", {"Upload":"No files uploaded"}, f"dummy_log.log"]
143
 
144
  progress((2,16), desc=f"Getting configuration values")
145
+ time.sleep(0.25)
146
  # Get config values if not provided
147
+ #config_file = find_file("config.ini") ##from file_handler.file_utils ##takes a bit of time to process. #NeedOptimise
148
+
149
+ config_file = Path("utils") / "config.ini" ##SMY: speed up sacrificing flexibility
150
  model_id = model_id if model_id else get_config_value(config_file, "MARKER_CAP", "MODEL_ID")
151
  openai_base_url = openai_base_url if openai_base_url else get_config_value(config_file, "MARKER_CAP", "OPENAI_BASE_URL")
152
  openai_image_format = openai_image_format if openai_image_format else get_config_value(config_file, "MARKER_CAP", "OPENAI_IMAGE_FORMAT")
 
156
  output_dir_string = output_dir_string if output_dir_string else str(get_config_value(config_file, "MARKER_CAP", "OUTPUT_DIR"))
157
  use_llm = use_llm if use_llm else get_config_value(config_file, "MARKER_CAP", "USE_LLM")
158
  page_range = page_range if page_range else get_config_value(config_file,"MARKER_CAP", "PAGE_RANGE")
159
+ weasyprint_dll_directories= weasyprint_dll_directories if weasyprint_dll_directories else None
160
+ config_load_models.weasyprint_libpath = weasyprint_dll_directories ## Assign user's weasyprint path to Global var
161
 
162
  progress((3,16), desc=f"Retrieved configuration values")
163
+ time.sleep(0.25)
164
 
165
  # Create the initargs tuple from the Gradio inputs: # 'files' is an iterable, and handled separately.
 
166
  yield gr.update(interactive=False), f"Initialising init_args", {"process": "Processing files ..."}, f"dummy_log.log"
167
+ progress((4,16), desc=f"Initialiasing init_args")
168
+ time.sleep(0.25)
169
  init_args = (
170
  provider,
171
  model_id,
 
188
  use_llm,
189
  force_ocr,
190
  page_range,
191
+ #progress,
192
  )
193
 
194
  # create output_dir
195
  try:
196
  yield gr.update(interactive=False), f"Creating output_dir ...", {"process": "Processing files ..."}, f"dummy_log.log"
197
  progress((5,16), desc=f"ProcessPoolExecutor: Creating output_dir")
198
+ time.sleep(0.25)
199
+
200
  #pdf2md_converter.output_dir_string = output_dir_string ##SMY: attempt setting directly to resolve pool.map iterable
201
 
202
  # Create Marker output_dir in temporary directory where Gradio can access it.
 
206
  logger.info(f"✓ output_dir created: ", extra={"output_dir": pdf2md_converter.output_dir.name, "in": str(pdf2md_converter.output_dir.parent)})
207
  yield gr.update(interactive=False), f"Created output_dir ...", {"process": "Processing files ..."}, f"dummy_log.log"
208
  progress((6,16), desc=f"✓ Created output_dir.")
209
+ time.sleep(0.25)
210
  except Exception as exc:
211
  tb = traceback.format_exc()
212
  tbp = traceback.print_exc() # Print the exception traceback
 
222
  logger.log(level=30, msg="Initialising ProcessPoolExecutor: pool:", extra={"pdf_files": pdf_files, "files_len": len(pdf_files), "model_id": model_id, "output_dir": output_dir_string}) #pdf_files_count
223
  yield gr.update(interactive=False), f"Initialising ProcessPoolExecutor: Processing Files ...", {"process": "Processing files ..."}, f"dummy_log.log"
224
  progress((7,16), desc=f"Initialising ProcessPoolExecutor: Processing Files ...")
225
+ time.sleep(0.25)
226
 
227
  # Create a pool with init_worker initialiser
228
  with ProcessPoolExecutor(
 
232
  ) as pool:
233
  logger.log(level=30, msg="Initialising ProcessPoolExecutor: pool:", extra={"pdf_files": pdf_files, "files_len": len(pdf_files), "model_id": model_id, "output_dir": output_dir_string}) #pdf_files_count
234
  progress((8,16), desc=f"Starting ProcessPool queue: Processing Files ...")
235
+ time.sleep(0.25)
236
 
237
  # Map the files (pdf_files) to the conversion function (pdf2md_converter.convert_file)
238
  # The 'docconverter' argument is implicitly handled by the initialiser
 
241
  #futures = [pool.submit(pdf2md_converter.convert_files, file) for file in pdf_files]
242
  #logs = [f.result() for f in futures]
243
  try:
244
+ #yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion ...", {"process": "Processing files ..."}, f"dummy_log.log"
245
  progress((9,16), desc=f"ProcessPoolExecutor: Pooling file conversion ...")
246
+ time.sleep(0.25)
247
+ yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion ...", {"process": "Processing files ..."}, f"dummy_log.log"
248
 
249
  # Use progress.tqdm to integrate with the executor map
250
  #results = pool.map(pdf2md_converter.convert_files, pdf_files) ##SMY iterables #max_retries #output_dir_string)
251
  for result_interim in progress.tqdm(
252
+ iterable=pool.map(pdf2md_converter.convert_files, pdf_files), #, max_retries), total=len(pdf_files)
253
+ desc="ProcessPoolExecutor: Pooling file conversion ..."):
254
  results.append(result_interim)
255
+
256
  # Update the Gradio UI to improve user-friendly eXperience
257
+ #yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]", {"process": "Processing files ..."}, f"dummy_log.log"
258
+ #progress((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
259
+ #progress2((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
260
+ #time.sleep(0.25)
261
 
262
  yield gr.update(interactive=True), f"ProcessPoolExecutor: Got Results from files conversion: [{str(result_interim)}[:20]]", {"process": "Processing files ..."}, f"dummy_log.log"
263
  progress((11,16), desc=f"ProcessPoolExecutor: Got Results from files conversion")
264
+ time.sleep(0.25)
265
  except Exception as exc:
266
  # Raise the exception to stop the Gradio app: exception to halt execution
267
  logger.exception("Error during pooling file conversion", exc_info=True) # Log the full traceback
 
272
 
273
  # Process file conversion results
274
  try:
 
275
  logger.log(level=20, msg="ProcessPoolExecutor pool result:", extra={"results": str(results)})
276
+ progress((12,16), desc="Processing results from files conversion") ##rekickin
277
+ time.sleep(0.25)
278
 
279
  logs = []
280
  logs_files_images = []
 
290
  for i, log in enumerate(logs):
291
  logs_files_images.append(log.get("filepath") if is_dict(log) or is_list_of_dicts(logs) else "Error or no file_path") # isinstance(log, (dict, str))
292
  logs_files_images.extend(list(image for image in log.get("image_path", "Error or no image_path")))
293
+ i_image_count = log.get("images", 0)
294
  # Update the Gradio UI to improve user-friendly eXperience
295
  #yield gr.update(interactive=False), f"Processing files: {logs_files_images[logs_count]}", {"process": "Processing files"}, f"dummy_log.log"
296
+ progress1(0.7, desc=f"Processing result log {i}: {str(log)}")
297
+ logs_count = i+i_image_count
298
  except Exception as exc:
299
  tbp = traceback.print_exc() # Print the exception traceback
300
  logger.exception("Error during processing results logs → {exc}\n{tbp}", exc_info=True) # Log the full traceback
 
309
  # Zip Processed Files and images. Insert to first index
310
  try: ##from file_handler.file_utils
311
  progress((13,16), desc="Zipping processed files and images")
312
+ time.sleep(0.25)
313
  zipped_processed_files = zip_processed_files(root_dir=f"{output_dir}", file_paths=logs_files_images, tz_hours=tz_hours, date_format='%d%b%Y_%H-%M-%S') #date_format='%d%b%Y'
314
  logs_files_images.insert(0, zipped_processed_files)
315
 
316
+
317
  #yield gr.update(interactive=False), f"Processing zip and files: {logs_files_images}", {"process": "Processing files"}, f"dummy_log.log"
318
+ progress((14,16), desc="Zipped processed files and images")
319
+ time.sleep(0.25)
320
 
321
  except Exception as exc:
322
  tb = traceback.format_exc()
 
329
  # Return processed files log
330
  try:
331
  progress((15,16), desc="Formatting processed log results")
332
+ time.sleep(0.25)
333
 
334
  ## # Convert logs list of dicts to formatted json string
335
  logs_return_formatted_json_string = file_handler.file_utils.process_dicts_data(logs) #"\n".join(log for log in logs) ##SMY outputs to gr.JSON component with no need for json.dumps(data, indent=)
 
340
  logger.log(level=20, msg="File conversion complete. Sending outcome to Gradio:", extra={"logs_files_image_return": str(logs_files_images_return)}) ## debug: FileNotFoundError: [WinError 2] The system cannot find the file specified: 'Error or no image_path'
341
 
342
  progress((16,16), desc="Complete processing and formatting file processing results")
343
+ time.sleep(0.25)
344
  # [templates]
345
  #outputs=[process_button, log_output, files_individual_JSON, files_individual_downloads],
346
  #return "\n".join(logs), "\n".join(logs_files_images) #"\n".join(logs_files)
 
612
  gr.Markdown(f"#### **Marker Configuration**")
613
  with gr.Row():
614
  openai_base_url_tb = gr.Textbox(
615
+ label="OpenAI Base URL",
616
+ info = "default HuggingFace",
617
  value="https://router.huggingface.co/v1",
618
  lines=1,
619
  max_lines=1,
 
656
  value=False
657
  )
658
  force_ocr_cb = gr.Checkbox(
659
+ label="Force OCR on all pages",
660
  value=True,
661
  )
662
+ with gr.Column():
663
+ page_range_tb = gr.Textbox(
664
+ label="Page Range (Optional)",
665
+ value=0,
666
+ placeholder="Example: 0,1-5,8,12-15 ~(default: first page)",
667
+ lines=1,
668
+ max_lines=1,
669
+ )
670
+ weasyprint_dll_directories_tb = gr.Textbox(
671
+ label="Path to weasyprint DLL libraries",
672
+ info='"C:\\Dat\\dev\\gtk3-runtime\\bin" or "C:\\msys64\\mingw64\\bin"',
673
+ placeholder="C:\\msys64\\mingw64\\bin",
674
+ lines=1,
675
+ max_lines=1,
676
+ )
677
 
678
 
679
  with gr.Accordion("🤗 HuggingFace Client Logout", open=True): #, open=False):
 
993
  use_llm_cb,
994
  force_ocr_cb,
995
  page_range_tb,
996
+ weasyprint_dll_directories_tb,
997
  tz_hours_num, #state_tz_hours
998
  ]
999
 
utils/get_config.py CHANGED
@@ -35,6 +35,7 @@ def get_config_value(config_file:Path, section_key:str, parameter:str, fallback:
35
  try:
36
  #config_file = find_config(config_file)
37
  cfg = config()
 
38
  if config_file.is_file():
39
  cfg.read(config_file)
40
  param_value = cfg[section_key].get(option=parameter, fallback=fallback) #"C:\\Dat\\dev\\gtk3-runtime\\bin")
 
35
  try:
36
  #config_file = find_config(config_file)
37
  cfg = config()
38
+ config_file = config_file if isinstance(config_file, Path) else Path(config_file)
39
  if config_file.is_file():
40
  cfg.read(config_file)
41
  param_value = cfg[section_key].get(option=parameter, fallback=fallback) #"C:\\Dat\\dev\\gtk3-runtime\\bin")
utils/lib_loader.py CHANGED
@@ -20,13 +20,14 @@ def set_weasyprint_library(libpath: Union[str, Path] = None, config_file: Union[
20
 
21
  #libgobject_path = #"/path/to/your/custom/glib/install/lib/libgobject-2.0.so.0"
22
  if not libpath:
23
- '''cfg = config()
24
- cfg.read(config_file) #"utils\\config.ini")
25
- lib_path = cfg["LIBRARIES_CAP"].get(f"WEASYPRINT_DLL_DIRECTORIES", "C:\\Dat\\dev\\gtk3-runtime\\bin")
26
- '''
27
- from file_handler.file_utils import find_file
28
- config_file = find_file("config.ini") ##from file_handler.file_utils
29
- lib_path = get_config_value(config_file, "LIBRARIES_CAP", "WEASYPRINT_DLL_DIRECTORIES") if not libpath else "C:\\msys64\\mingw64\\bin"
 
30
 
31
  # Check if the file exists before attempting to load it
32
  #if not os.path.exists(libobject):
 
20
 
21
  #libgobject_path = #"/path/to/your/custom/glib/install/lib/libgobject-2.0.so.0"
22
  if not libpath:
23
+ #from file_handler.file_utils import find_file
24
+ #config_file = find_file("config.ini") ##from file_handler.file_utils
25
+
26
+ ## Alternate to speed up while sacrificing
27
+ from globals import config_load_models
28
+ config_file = config_load_models.config_ini
29
+
30
+ lib_path = get_config_value(Path(config_file), "LIBRARIES_CAP", "WEASYPRINT_DLL_DIRECTORIES") if not libpath else "C:\\msys64\\mingw64\\bin"
31
 
32
  # Check if the file exists before attempting to load it
33
  #if not os.path.exists(libobject):