# ui/gradio_process.py

import gradio as gr
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from tqdm import tqdm

import time

from pathlib import Path, WindowsPath
from typing import Optional, Union, Literal #, Dict, List, Any, Tuple

from huggingface_hub import get_token
import spaces    ##HuggingFace spaces to accelerate GPU support on HF Spaces

#import utilities, helpers
#import utils.file_utils
from utils.file_utils import zip_processed_files, process_dicts_data, create_temp_folder   #, collect_pdf_paths, collect_html_paths, collect_markdown_paths, create_outputdir  ## should move to handling file
from utils.config import TITLE, DESCRIPTION, DESCRIPTION_PDF_HTML, DESCRIPTION_PDF, DESCRIPTION_HTML, DESCRIPTION_MD  #, file_types_list, file_types_tuple
from utils.utils import is_dict, is_list_of_dicts
from utils.get_config import get_config_value

from llm.llm_login import get_login_token, is_loggedin_huggingface, login_huggingface
from converters.extraction_converter import DocumentConverter as docconverter  #DocumentExtractor #as docextractor
from converters.pdf_to_md import PdfToMarkdownConverter   #, init_worker
#from converters.md_to_pdf import MarkdownToPdfConverter  ##SMY: PENDING: implementation

import traceback  ## Extract, format and print information about Python stack traces.
from utils.logger import get_logger

logger = get_logger(__name__)   ##NB: setup_logging()  ## set logging

# Instantiate converters class once – they are stateless
pdf2md_converter = PdfToMarkdownConverter()
#md2pdf_converter = MarkdownToPdfConverter()

    
# User eXperience: Load Marker models ahead of time if not already loaded in reload mode
## SMY: 29Sept2025 - Came across https://github.com/xiaoyao9184/docker-marker/tree/master/gradio
from converters.extraction_converter import load_models
from globals import config_load_models
try:
    if not config_load_models.model_dict:
        config_load_models.model_dict = load_models()
    '''if 'model_dict' not in globals():
        global model_dict
        model_dict = load_models()'''
    ## model_dict is cached on config_load_models, so log from there (it is defined even when loading was skipped)
    logger.log(level=30, msg="Config_load_models: ", extra={"model_dict": str(config_load_models.model_dict)})
except Exception as exc:
    #tb = traceback.format_exc()   #exc.__traceback__
    logger.exception(f"✗ Error loading models (reload): {exc}")  #\n{tb}")
    raise RuntimeError(f"✗ Error loading models (reload): {exc}")  #\n{tb}")

#def get_login_token( api_token_arg, oauth_token: gr.OAuthToken | None=None,):  ##moved to llm_login


#duration = 5.75 * pdf_files_count if pdf_files_count>=2 else 7
#@spaces.GPU(duration=duration)   ## HF Spaces GPU support
def get_results_files_conversion(pdf_files, pdf_files_count, progress2=gr.Progress(track_tqdm=True)):
    # Sequential conversion (no executor pool): use progress.tqdm to drive the per-file progress bar

    results = []
    
    #for result_interim in progress2.tqdm(
    for i, pdf_file in enumerate(iterable=progress2.tqdm(
                iterable=pdf_files,  #, max_retries), total=len(pdf_files)
                desc=f"Processing file conversion ... pool.map",
                total=pdf_files_count)
                ):
        result_interim = pdf2md_converter.convert_files(pdf_file)

        # Update the Gradio UI to improve user-friendly eXperience
        #yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]", {"process": "Processing files ..."}, f"dummy_log.log"
        progress2((i,pdf_files_count), desc=f"Processing file conversion result: {i}: {str(pdf_file)} : [{str(result_interim)[:20]}]")
        #progress2((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
        time.sleep(0.75)  #.sleep(0.25)
        
        results.append(result_interim)
        
    return results

def get_results_files_conversion_with_pool(pdf_files, pdf_files_count, max_workers: int, progress2=gr.Progress(track_tqdm=True)):
    #Use progress.tqdm to integrate with the executor map

    results = []
    try:
        # Create a pool with init_worker initialiser
        ##SMY: dropped ProcessPoolExecutor due to slow Marker conversion. Marker already leverages ThreadPoolExecutor and ProcessPoolExecutor
        with ProcessPoolExecutor(
            max_workers=max_workers,
            ) as pool:
    
                logger.log(level=30, msg="Initialising ProcessPoolExecutor: pool:", extra={"pdf_files": pdf_files[:3], "files_len": len(pdf_files), "progress": str(progress2),}) 
                progress2((10,16), desc=f"Starting ProcessPool queue: Processing Files ...")
                time.sleep(0.25)

                # Map the files (pdf_files) to the conversion function (pdf2md_converter.convert_file)
                #try:
                    #yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion ...", {"process": "Processing files ..."}, f"dummy_log.log"
                #    progress((9,16), desc=f"ProcessPoolExecutor: Pooling file conversion ...")
                #    time.sleep(0.25)
                #    yield gr.update(interactive=False), f"ProcessPoolExecutor: Pooling file conversion ...", {"process": "Processing files ..."}, f"dummy_log.log"
    
                # Use progress.tqdm to integrate with the executor map: results = pool.map(pdf2md_converter.convert_files, pdf_files)  ##SMY iterables  #max_retries #output_dir_string)
                for i, result_interim in enumerate(progress2.tqdm(
                    iterable=pool.map(pdf2md_converter.convert_files, pdf_files),  #, max_retries), total=len(pdf_files)
                    desc="ProcessPoolExecutor: Pooling file conversion ...",
                    total=pdf_files_count, unit="files")
                    ):

                        results.append(result_interim)
        
                        # Update the Gradio UI to improve user-friendly eXperience
                        yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion result: {i} : [{str(result_interim)[:20]}]", {"process": "Processing files ..."}, f"dummy_log.log"
                        #progress((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)[:20]}]")
                        progress2((i, pdf_files_count), desc=f"ProcessPoolExecutor: Pooling file conversion result: {i} : [{str(result_interim)[:20]}]")
                        time.sleep(0.25)
    except Exception as exc:
        # Raise the exception to stop the Gradio app: exception to halt execution
        logger.exception("Error during pooling file conversion", exc_info=True)  # Log the full traceback
        tbp = traceback.print_exc()  # Print the exception traceback
        # Update the Gradio UI to improve user-friendly eXperience
        yield gr.update(interactive=True), f"An error occurred during pool.map: {str(exc)}", {"Error":f"Error: {exc}\n{tbp}"}, f"dummy_log.log"  ## return the exception message
        return [gr.update(interactive=True), f"An error occurred during pool.map: {str(exc)}", {"Error":f"Error: {exc}\n{tbp}"}, f"dummy_log.log"]  ## return the exception message
        ##======
        
    return results

def get_results_files_conversion_with_pool_ascomplete(pdf_files, pdf_files_count, max_workers: int, progress2=gr.Progress(track_tqdm=True)):
    """
        This function wraps the as_completed call to process results
        as they become available.
    """
    #Use progress.tqdm to integrate with the executor map

    results = []
    try:
        # Create a pool with init_worker initialiser
        ##SMY: dropped ProcessPoolExecutor due to slow Marker conversion. Marker already leverages ThreadPoolExecutor and ProcessPoolExecutor
        with ProcessPoolExecutor(
            max_workers=max_workers,
            ) as pool:
    
                logger.log(level=30, msg="Initialising ProcessPoolExecutor: pool:", extra={"pdf_files": pdf_files, "files_len": len(pdf_files), "progress": str(progress2)})  #pdf_files_count
                progress2((10,16), desc=f"Starting ProcessPool queue: Processing Files ...")
                time.sleep(0.25)

                # Submit each task individually and collect the futures
                futures = [pool.submit(pdf2md_converter.convert_files, file) for file in pdf_files]
                
                # Use progress.tqdm to integrate with the executor map: results = pool.map(pdf2md_converter.convert_files, pdf_files)  ##SMY iterables  #max_retries #output_dir_string)
                for i, future in enumerate(progress2.tqdm(
                    iterable=as_completed(futures),  #pdf_files,
                    desc="ProcessPoolExecutor: Pooling file conversion ...",
                    total=pdf_files_count, unit="files")
                    ):
                        result_interim = future.result()
                        results.append(result_interim)
        
                        # Update the Gradio UI to improve user-friendly eXperience
                        yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion result: {i} : [{str(result_interim)[:20]}]", {"process": "Processing files ..."}, f"dummy_log.log"
                        #progress((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)[:20]}]")
                        progress2((i, pdf_files_count), desc=f"ProcessPoolExecutor: Pooling file conversion result: {i} : [{str(result_interim)[:20]}]")
                        time.sleep(0.25)
    except Exception as exc:
        # Raise the exception to stop the Gradio app: exception to halt execution
        logger.exception("Error during pooling file conversion", exc_info=True)  # Log the full traceback
        tbp = traceback.print_exc()  # Print the exception traceback
        # Update the Gradio UI to improve user-friendly eXperience
        yield gr.update(interactive=True), f"An error occurred during pool.map: {str(exc)}", {"Error":f"Error: {exc}\n{tbp}"}, f"dummy_log.log"  ## return the exception message
        return [gr.update(interactive=True), f"An error occurred during pool.map: {str(exc)}", {"Error":f"Error: {exc}\n{tbp}"}, f"dummy_log.log"]  ## return the exception message
        ##======
        
    return results

##SMY: TODO: future: refactor to gradio_process.py and
## pull options to cli-options {"output_format":, "output_dir_string":, "use_llm":, "page_range":, "force_ocr":, "debug":, "strip_existing_ocr":, "disable_ocr_math":} (see the illustrative sketch below)
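
# Hedged sketch (not used anywhere below): one possible shape for the cli-options mapping
# the TODO above describes. The keys mirror convert_batch's parameters; the values are
# illustrative defaults only, not a confirmed CLI contract for this project.
_EXAMPLE_CLI_OPTIONS = {
    "output_format": "markdown",        # Literal["markdown", "json", "html"]
    "output_dir_string": "output_dir_default",
    "use_llm": False,
    "page_range": None,                 # e.g. "0-5" (illustrative)
    "force_ocr": True,
    "debug": False,
    "strip_existing_ocr": None,
    "disable_ocr_math": None,
}
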
#@spaces.GPU
def convert_batch(
    pdf_files, #: list[str],
    pdf_files_count: int,
    provider: str,
    model_id: str,
    #base_url: str
    hf_provider: str,
    endpoint: str,
    backend_choice: str,
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
    stream: bool,
    api_token_gr: str,
    #max_workers: int,
    #max_retries: int,
    openai_base_url: str = "https://router.huggingface.co/v1",
    openai_image_format: Optional[str] = "webp",
    max_workers: Optional[int] = 1,  #4,
    max_retries: Optional[int] = 2,
    debug: bool = False,        #Optional[bool] = False,  #True,
    #output_format: str = "markdown",
    output_format: Literal["markdown", "json", "html"] = "markdown",
    #output_dir: Optional[Union[str, Path]] = "output_dir",
    output_dir_string: str = "output_dir_default",
    use_llm: bool = False,      #Optional[bool] = False,  #True,
    force_ocr: bool = True,     #Optional[bool] = False,
    strip_existing_ocr: Optional[bool] = None,  #bool = False,
    disable_ocr_math: Optional[bool] = None,    #bool = False,
    page_range: str = None,     #Optional[str] = None,
    weasyprint_dll_directories: str = None,     #weasyprint_libpath 
    tz_hours: str = None,
    pooling: str = "no_pooling",   #bool = True,
    oauth_token: gr.OAuthToken | None=None,
    progress: gr.Progress = gr.Progress(track_tqdm=True),  #Progress tracker to keep tab on pool queue executor
    progress1: gr.Progress = gr.Progress(),
    #progress2: gr.Progress = gr.Progress(track_tqdm=True),
    ): #-> str:
    """
    Handles the conversion process using multiprocessing.
    Spins up a pool and converts all uploaded files in parallel.
    Aggregates per-file logs into one string.
    Receives Gradio component values, starting with the list of uploaded file paths
    """

    # login: Update the Gradio UI to improve user-friendly eXperience - commencing
    # [template]: #outputs=[process_button, log_output, files_individual_JSON, files_individual_downloads],            
    yield gr.update(interactive=False), f"Commencing Processing ... Getting login", {"process": "Commencing Processing"}, f"dummy_log.log"
    progress((0,16), f"Commencing Processing ...")
    time.sleep(0.25)
    
    # get token from logged-in user: 
    api_token = get_login_token(api_token_arg=api_token_gr, oauth_token=oauth_token)
    ##SMY: Strictly debug. Must not be live
    #logger.log(level=30, msg="Commencing: get_login_token", extra={"api_token": api_token, "api_token_gr": api_token_gr})

    '''try:
        ##SMY: might deprecate. To replace with oauth login from Gradio ui or integrate cleanly.
        #login_huggingface(api_token)  ## attempt login if not already logged in. NB: HF CLI login prompt would not display in Process Worker.
        
        if is_loggedin_huggingface() and (api_token is None or api_token == ""):
            api_token = get_token()   ##SMY: might be redundant
        
        elif is_loggedin_huggingface() is False and api_token:
            login_huggingface(api_token)
            # login: Update the Gradio UI to improve user-friendly eXperience
            #yield gr.update(interactive=False), f"login to HF: Processing files...", {"process": "Processing files"}, f"dummy_log.log"
        else:
            pass
            # login: Update the Gradio UI to improve user-friendly eXperience
            #yield gr.update(interactive=False), f"Not logged in to HF: Processing files...", {"process": "Processing files"}, f"dummy_log.log"
        
    except Exception as exc:  # Catch all exceptions
        tb = traceback.format_exc()
        logger.exception(f"✗ Error during login_huggingface → {exc}\n{tb}", exc_info=True) # Log the full traceback
        return [gr.update(interactive=True), f"✗ An error occurred during login_huggingface → {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"dummy_log.log"]  # return the exception message
    '''
    progress((1,16), desc=f"Log in: {is_loggedin_huggingface(api_token)}")
    time.sleep(0.25)
    ## debug
    #logger.log(level=30, msg="pdf_files_inputs", extra={"input_arg[0]:": pdf_files[0]})

    #if not files:
    if not pdf_files:  ## Covers both None and an empty list: handles the case where no files are uploaded.
        logger.log(level=30, msg="Initialising ProcessPool: No files uploaded.", extra={"pdf_files": pdf_files, "files_len": pdf_files_count})
        #outputs=[log_output, files_individual_JSON, files_individual_downloads],
        return [gr.update(interactive=True), "Initialising ProcessPool: No files uploaded.", {"Upload":"No files uploaded"}, f"dummy_log.log"]
    
    progress((2,16), desc=f"Getting configuration values")
    time.sleep(0.25)
    # Get config values if not provided
    #config_file = find_file("config.ini")  ##from file_handler.file_utils  ##takes a bit of time to process. #NeedOptimise
    
    config_file = Path("utils") / "config.ini"  ##SMY: speed up sacrificing flexibility
    model_id = model_id if model_id else get_config_value(config_file, "MARKER_CAP", "MODEL_ID")
    openai_base_url = openai_base_url if openai_base_url else get_config_value(config_file, "MARKER_CAP", "OPENAI_BASE_URL")
    openai_image_format = openai_image_format if openai_image_format else get_config_value(config_file, "MARKER_CAP", "OPENAI_IMAGE_FORMAT")
    max_workers = max_workers if max_workers else get_config_value(config_file, "MARKER_CAP", "MAX_WORKERS")
    max_retries = max_retries if max_retries else get_config_value(config_file, "MARKER_CAP", "MAX_RETRIES")
    output_format = output_format if output_format else get_config_value(config_file, "MARKER_CAP", "OUTPUT_FORMAT")
    output_dir_string = output_dir_string if output_dir_string else str(get_config_value(config_file, "MARKER_CAP", "OUTPUT_DIR"))
    use_llm = use_llm if use_llm else get_config_value(config_file, "MARKER_CAP", "USE_LLM")
    page_range = page_range if page_range else get_config_value(config_file,"MARKER_CAP", "PAGE_RANGE")
    weasyprint_dll_directories= weasyprint_dll_directories if weasyprint_dll_directories else None
    config_load_models.weasyprint_libpath = weasyprint_dll_directories  ## Assign user's weasyprint path to Global var
    config_load_models.pdf_files_count = pdf_files_count
    #pooling = True   ##SMY: placeholder
    
    progress((3,16), desc=f"Retrieved configuration values")
    time.sleep(0.25)

    # Create the initargs tuple from the Gradio inputs: # 'files' is an iterable, and handled separately.
    yield gr.update(interactive=False), f"Setting global variables : Initialising init_args", {"process": "Processing files ..."}, f"dummy_log.log"
    progress((4,16), desc=f"Setting global variables : Initialiasing init_args")
    time.sleep(0.25)
    #init_args = ( ...
    
    # set global variables
    from globals import config_load
    #self.pdf_files_count: int = 0
    config_load.provider = provider
    config_load.model_id = model_id
    config_load.hf_provider = hf_provider
    config_load.endpoint = endpoint
    config_load.backend_choice = backend_choice
    config_load.system_message = system_message
    config_load.max_tokens = max_tokens
    config_load.temperature = temperature
    config_load.top_p = top_p
    config_load.stream = stream
    config_load.api_token = api_token
    config_load.openai_base_url = openai_base_url
    config_load.openai_image_format = openai_image_format
    config_load.max_workers = max_workers
    config_load.max_retries = max_retries
    config_load.debug = debug
    #output_format: str = "markdown",
    config_load.output_format = output_format
    config_load.output_dir_string = output_dir_string
    config_load.use_llm = use_llm
    config_load.force_ocr = force_ocr
    config_load.strip_existing_ocr = strip_existing_ocr
    config_load.disable_ocr_math = disable_ocr_math
    config_load.page_range = page_range
    #config_load.weasyprint_dll_directories: str = None,
    config_load.tz_hours = tz_hours
    config_load.pooling = pooling   ## placeholder for ProcessPoolExecutor flag
   
    # 1. create output_dir
    try:
        yield gr.update(interactive=False), f"Creating output_dir ...", {"process": "Processing files ..."}, f"dummy_log.log"
        progress((5,16), desc=f"ProcessPoolExecutor: Creating output_dir")
        time.sleep(0.25)

        #pdf2md_converter.output_dir_string = output_dir_string   ##SMY: attempt setting directly to resolve pool.map iterable

        # Create Marker output_dir in temporary directory where Gradio can access it.  #file_utils.
        output_dir = create_temp_folder(output_dir_string)
        #pdf2md_converter.output_dir = output_dir  ##SMY should now redirect to globals
        config_load.output_dir = output_dir
        
        logger.info("✓ output_dir created:", extra={"output_dir": config_load.output_dir.name, "in": str(config_load.output_dir.parent)})
        yield gr.update(interactive=False), f"Created output_dir ...", {"process": "Processing files ..."}, f"dummy_log.log"
        progress((6,16), desc="✓ Created output_dir.")
        time.sleep(0.25)
    except Exception as exc:
        tb = traceback.format_exc()
        traceback.print_exc()  # Print the exception traceback
        logger.exception(f"✗ Error creating output_dir → {exc}\n{tb}", exc_info=True)  # Log the full traceback

        # Update the Gradio UI to improve user-friendly eXperience
        yield gr.update(interactive=True), f"✗ An error occurred creating output_dir: {str(exc)}", {"Error":f"Error: {exc}"}, f"dummy_log.log"  ## return the exception message
        return [gr.update(interactive=True), f"✗ An error occurred creating output_dir: {str(exc)}", {"Error":f"Error: {exc}"}, f"dummy_log.log"]  ## return the exception message

    # 2. Process file conversion leveraging ProcessPoolExecutor for efficiency 
    results = []  ## Processed files result holder
    logger.log(level=30, msg="Initialising Processing Files ...", extra={"pdf_files": pdf_files, "files_len": len(pdf_files), "model_id": model_id, "output_dir": output_dir_string})  #pdf_files_count
    yield gr.update(interactive=False), f"Initialising Processing Files ...", {"process": "Processing files ..."}, f"dummy_log.log"
    progress((7,16), desc=f"Initialising Processing Files ...")
    time.sleep(0.25)

    try:
        #yield gr.update(interactive=True), f"Pooling file conversion ...", {"process": "Processing files ..."}, f"dummy_log.log"
        progress((8,16), desc=f"Pooling file conversion ...")
        time.sleep(0.25)
        yield gr.update(interactive=False), f"Pooling file conversion ...", {"process": "Processing files ..."}, f"dummy_log.log"
        
        ##SMY: Future: users choose sequential or pooling from Gradio ui
        match pooling:
            case "no_pooling":
                results = get_results_files_conversion(pdf_files, pdf_files_count,progress)
            case "pooling":
                results = get_results_files_conversion_with_pool(pdf_files, pdf_files_count, max_workers, progress)
            case "as_completed":
                results = get_results_files_conversion_with_pool_ascomplete(pdf_files, pdf_files_count, max_workers, progress)
            
        logger.log(level=30, msg="Got Results from files conversion: ", extra={"results": str(results)[:20]}) 
        yield gr.update(interactive=True), f"Got Results from files conversion: [{str(results)[:20]}]", {"process": "Processing files ..."}, f"dummy_log.log"
        progress((9,16), desc=f"Got Results from files conversion")
        time.sleep(0.25)
    except Exception as exc:
        tb = traceback.format_exc()
        logger.exception(f"✗ Error during Files processing → {exc}\n{tb}", exc_info=True)  # Log the full traceback
        #traceback.print_exc()  # Print the exception traceback
        yield gr.update(interactive=True), f"✗ An error occurred during Files Processing → {exc}", {"Error":f"Error: {exc}"}, f"dummy_log.log"  # return the exception message
        return [gr.update(interactive=True), f"✗ An error occurred during files processing → {exc}", {"Error":f"Error: {exc}"}, f"dummy_log.log"]
    
    # 3. Process file conversion results
    try:
        logger.log(level=20, msg="ProcessPoolExecutor pool result:", extra={"results": str(results)})
        progress((12,16), desc="Processing results from files conversion")  ##rekickin
        time.sleep(0.25)
        
        logs = []
        logs_files_images = []

        #logs.extend(results)   ## performant pythonic
        #logs = list[results]  ## 
        logs = [result for result in results]  ## pythonic list comprehension
        # [template]  ## logs : [file , images , filepath, image_path]
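        # Illustrative (assumed) shape of one log entry, inferred from the keys read below;
        # the actual structure is whatever pdf2md_converter.convert_files returns, e.g.:
        #   {"file": "sample.pdf", "images": 2, "filepath": "output_dir/sample.md",
        #    "image_path": ["output_dir/sample_p1.png", "output_dir/sample_p2.png"]}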
        
        #logs_files_images = logs_files.extend(logs_images)  #zip(logs_files, logs_images)   ##SMY: in progress
        logs_count =  0
        #for log in logs:
        for i, log in enumerate(logs):
            log = log if is_dict(log) else {}  ## guard: non-dict results (e.g. error strings) fall back to the defaults below
            logs_files_images.append(log.get("filepath", "Error or no file_path"))
            logs_files_images.extend(image for image in log.get("image_path", ["Error or no image_path"]))  ## list default so a missing key is not iterated character-by-character
            i_image_count = log.get("images", 0)
            # Update the Gradio UI to improve user-friendly eXperience
            #yield gr.update(interactive=False), f"Processing files: {logs_files_images[logs_count]}", {"process": "Processing files"}, f"dummy_log.log"
            progress1(0.7, desc=f"Processing result log {i}: {str(log)}")
            logs_count = i+i_image_count
    except Exception as exc:
        tbp = traceback.format_exc()  # Capture the exception traceback as a string (print_exc() returns None)
        logger.exception(f"Error during processing results logs → {exc}\n{tbp}", exc_info=True)  # Log the full traceback
        return [gr.update(interactive=True), f"An error occurred during processing results logs: {str(exc)}\n{tbp}", {"Error":f"Error: {exc}"}, f"dummy_log.log"]  ## return the exception message
        #yield gr.update(interactive=True), f"An error occurred during processing results logs: {str(exc)}\n{tb}", {"Error":f"Error: {exc}"}, f"dummy_log.log"  ## return the exception message
    
    
    # 4. Zip Processed Files and images. Insert to first index
    try:  ##from file_handler.file_utils
        progress((13,16), desc="Zipping processed files and images")
        time.sleep(0.25)
        zipped_processed_files = zip_processed_files(root_dir=f"{output_dir}", file_paths=logs_files_images, tz_hours=tz_hours, date_format='%d%b%Y_%H-%M-%S')  #date_format='%d%b%Y'
        logs_files_images.insert(0, zipped_processed_files)

        
        #yield gr.update(interactive=False), f"Processing zip and files: {logs_files_images}", {"process": "Processing files"}, f"dummy_log.log"
        progress((14,16), desc="Zipped processed files and images")
        time.sleep(0.25)

    except Exception as exc:
        tb = traceback.format_exc()
        logger.exception(f"✗ Error during zipping processed files → {exc}\n{tb}", exc_info=True)  # Log the full traceback
        #traceback.print_exc()  # Print the exception traceback
        yield gr.update(interactive=True), f"✗ An error occurred during zipping files → {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"dummy_log.log"  # return the exception message
        return gr.update(interactive=True), f"✗ An error occurred during zipping files → {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"dummy_log.log"  # return the exception message

    
    # 5. Return processed files log
    try:
        progress((15,16), desc="Formatting processed log results")
        time.sleep(0.25)
        
        ## # Convert logs list of dicts to formatted json string (utils.file_utils.process_dicts_data)
        logs_return_formatted_json_string = process_dicts_data(logs)   #"\n".join(log for log in logs)  ##SMY outputs to gr.JSON component with no need for json.dumps(data, indent=)
        #logs_files_images_return = "\n".join(path for path in logs_files_images)  ##TypeError: sequence item 0: expected str instance, WindowsPath found  
        
        ## # Convert any Path objects to strings, but leave strings as-is
        logs_files_images_return = list(str(path) if isinstance(path, Path) else path for path in logs_files_images)
        logger.log(level=20, msg="File conversion complete. Sending outcome to Gradio:", extra={"logs_files_image_return": str(logs_files_images_return)})  ## debug: FileNotFoundError: [WinError 2] The system cannot find the file specified: 'Error or no image_path'
        
        progress((16,16), desc="Complete processing and formatting file processing results")
        time.sleep(0.25)
        # [templates]
        #outputs=[process_button, log_output, files_individual_JSON, files_individual_downloads],
        #return "\n".join(logs), "\n".join(logs_files_images)    #"\n".join(logs_files)
        
        yield  gr.update(interactive=True), gr.update(value=logs_return_formatted_json_string), gr.update(value=logs_return_formatted_json_string, visible=True), gr.update(value=logs_files_images_return, visible=True)    ##SMY: redundant
        return [gr.update(interactive=True), gr.update(value=logs_return_formatted_json_string), gr.update(value=logs_return_formatted_json_string, visible=True), gr.update(value=logs_files_images_return, visible=True)]
        #yield gr.update(interactive=True), logs_return_formatted_json_string, logs_return_formatted_json_string, logs_files_images_return
        #return [gr.update(interactive=True), logs_return_formatted_json_string, logs_return_formatted_json_string, logs_files_images_return]
        
    except Exception as exc:
        tb = traceback.format_exc()
        logger.exception(f"✗ Error during returning result logs → {exc}\n{tb}", exc_info=True)  # Log the full traceback
        #traceback.print_exc()  # Print the exception traceback
        yield gr.update(interactive=True), f"✗ An error occurred during returning result logs → {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"dummy_log.log"  # return the exception message
        return [gr.update(interactive=True), f"✗ An error occurred during returning result logs → {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"dummy_log.log"]  # return the exception message

    #return "\n".join(log for log in logs), "\n".join(str(path) for path in logs_files_images)
    #print(f'logs_files_images: {"\n".join(str(path) for path in logs_files_images)}')

## SMY: to be implemented/refactored AND moved to logic file
'''
def convert_md_to_pdf(file: gr.File | None, folder: str | None) -> list[gr.File]:
    """
    Gradio callback for Markdown β†’ PDF.
    Returns a list of generated PDF files (as Gradio File objects).
    """
    if not file and not folder:
        return []

    md_paths = []

    # Single file
    if file:
        md_path = Path(file.name)
        md_paths.append(md_path)

    # Folder
    if folder:
        try:
            md_paths.extend(collect_markdown_paths(folder))
        except Exception as exc:
            logger.exception("Folder traversal failed.")
            return []

    if not md_paths:
        return []

    output_dir = Path("./generated_pdfs")
    output_dir.mkdir(exist_ok=True)

    pdf_files = md2pdf_converter.batch_convert(md_paths, output_dir)
    # Convert to Gradio File objects
    gr_files = [gr.File(path=str(p)) for p in pdf_files]
    return gr_files
'''


##====================
#Gradio interface moved to gradio_ui.py
#def build_interface() -> gr.Blocks:
#    """
#    Assemble the Gradio Blocks UI.
#    """

if __name__ == '__main__':
    ## convert_batch expects the Gradio component values as arguments, so it is normally driven from the UI rather than run standalone
    #convert_batch()
    pass