semmyk committed on
Commit
c6fb648
·
1 Parent(s): f9088c5

baseline08_beta0.4.0_06Oct25: Refactored. Now runs without ProcessPoolExecutor; Marker inherently handles ThreadPoolExecutor and ProcessPoolExecutor. Gradio UI separated from Gradio processing logic.

Browse files
converters/extraction_converter.py CHANGED
@@ -2,7 +2,7 @@ import os
2
  from pathlib import Path
3
  import traceback
4
  #import time
5
- from typing import Dict, Any, Type, Optional, Union #, BaseModel
6
  from pydantic import BaseModel
7
 
8
  from marker.models import create_model_dict
@@ -10,13 +10,9 @@ from marker.models import create_model_dict
10
  from marker.converters.pdf import PdfConverter as MarkerConverter ## full document conversion/extraction
11
  from marker.config.parser import ConfigParser ## Process custom configuration
12
  from marker.services.openai import OpenAIService as MarkerOpenAIService
 
13
  #from sympy import Union
14
 
15
- #from llm.hf_client import HFChatClient
16
- from llm.openai_client import OpenAIChatClient
17
- from file_handler.file_utils import collect_pdf_paths, collect_html_paths, collect_markdown_paths, create_outputdir
18
- from utils.lib_loader import load_library
19
-
20
  from utils.logger import get_logger
21
 
22
  logger = get_logger(__name__)
@@ -48,13 +44,17 @@ class DocumentConverter:
48
  api_token: str,
49
  openai_base_url: str = "https://router.huggingface.co/v1",
50
  openai_image_format: Optional[str] = "webp",
51
- max_workers: Optional[str] =1, #4, for config_dict["pdftext_workers"]
52
  max_retries: Optional[int] = 2,
53
- output_format: str = "markdown",
 
 
54
  output_dir: Optional[Union[str, Path]] = "output_dir",
55
  use_llm: Optional[bool] = None, #bool = False, #Optional[bool] = False, #True,
56
  force_ocr: Optional[bool] = None, #bool = False,
57
- page_range: Optional[str] = None, #str = None #Optional[str] = None,
 
 
58
  ):
59
 
60
  #self.converter = None #MarkerConverter
@@ -65,20 +65,21 @@ class DocumentConverter:
65
  self.top_p = top_p # self.client.top_p,
66
  self.llm_service = MarkerOpenAIService
67
  self.openai_image_format = openai_image_format #"png" #better compatibility
68
- self.max_workers = max_workers ## pass to config_dict["pdftext_workers"]
69
  self.max_retries = max_retries ## pass to __call__
70
- self.output_dir = output_dir ## "output_dir": settings.DEBUG_DATA_FOLDER if debug else output_dir,
71
- self.use_llm = use_llm if use_llm else False #use_llm[0] if isinstance(use_llm, tuple) else use_llm, #False, #True,
 
 
 
72
  self.force_ocr = force_ocr if force_ocr else False
 
 
73
  #self.page_range = page_range[0] if isinstance(page_range, tuple) else page_range ##SMY: iterating twice because self.page casting as hint type tuple!
74
  self.page_range = page_range if page_range else None
75
  # self.page_range = page_range[0] if isinstance(page_range, tuple) else page_range if isinstance(page_range, str) else None, ##Example: "0,4-8,16" ##Marker parses as List[int] #]debug #len(pdf_file)
76
- '''
77
- if isinstance(page_range, tuple | str):
78
- self.page_range = page_range[0] if isinstance(page_range, tuple) else page_range
79
- else:
80
- self.page_range = None
81
- '''
82
 
83
  # 0) Instantiate the LLM Client (OPENAIChatClient): Get a provider‐agnostic chat function
84
  ##SMY: #future. Plan to integrate into Marker: uses its own LLM services (clients). As at 1.9.2, there's no huggingface client service.
@@ -102,12 +103,14 @@ class DocumentConverter:
102
  # 1) # Define the custom configuration for the Hugging Face LLM.
103
  # Use typing.Dict and typing.Any for flexible dictionary type hints
104
  try:
105
- self.config_dict: Dict[str, Any] = self.get_config_dict(model_id=model_id, llm_service=str(self.llm_service), output_format=output_format)
 
106
 
107
  ##SMY: execute if page_range is none. `else None` ensures valid syntactic expression
108
  ##SMY: if falsely empty tuple () or None, pop the "page_range" key-value pair, else do nothing if truthy tuple value (i.e. keep as-is)
109
  self.config_dict.pop("page_range", None) if not self.config_dict.get("page_range") else None
110
- self.config_dict.pop("use_llm", None) if not self.config_dict.get("use_llm") or self.config_dict.get("use_llm") is False or self.config_dict.get("use_llm") == 'False' else None
 
111
  self.config_dict.pop("force_ocr", None) if not self.config_dict.get("force_ocr") or self.config_dict.get("force_ocr") is False or self.config_dict.get("force_ocr") == 'False' else None
112
 
113
  logger.log(level=20, msg="✔️ config_dict custom configured:", extra={"service": "openai"}) #, "config": str(self.config_dict)})
@@ -150,7 +153,8 @@ class DocumentConverter:
150
 
151
  # 4) Instantiate Marker's MarkerConverter (PdfConverter) with config managed by config_parser
152
  try: # Assign llm_service if api_token. ##SMY: split and slicing ##Gets the string value
153
- llm_service_str = None if api_token == '' or api_token is None or self.use_llm is False else str(self.llm_service).split("'")[1] #
 
154
 
155
  # sets api_key required by Marker ## to handle Marker's assertion test on OpenAI
156
  if llm_service_str:
@@ -174,13 +178,15 @@ class DocumentConverter:
174
 
175
  logger.log(level=20, msg="✔️ MarkerConverter instantiated successfully:", extra={"converter.config": str(self.converter.config.get("openai_base_url")), "use_llm":self.converter.use_llm})
176
  #return self.converter ##SMY: to query why did I comment out?. Bingo: "__init__() should return None, not 'PdfConverter'"
 
177
  except Exception as exc:
178
  tb = traceback.format_exc
179
  logger.exception(f"✗ Error initialising MarkerExtractor: {exc}\n{tb}")
180
  raise RuntimeError(f"✗ Error initialising MarkerExtractor: {exc}\n{tb}")
181
 
182
  # Define the custom configuration for HF LLM.
183
- def get_config_dict(self, model_id: str, llm_service=MarkerOpenAIService, output_format: Optional[str] = "markdown" ) -> Dict[str, Any]:
 
184
  """ Define the custom configuration for the Hugging Face LLM: combining Markers cli_options and LLM. """
185
 
186
  try:
@@ -191,8 +197,28 @@ class DocumentConverter:
191
  self.page_range = self.page_range[0] if isinstance(self.page_range, tuple) else self.page_range #if isinstance(self.page_range, str) else None, ##SMY: passing as hint type tuple!
192
 
193
  ##SMY: TODO: convert to {inputs} and called from gradio_ui
194
- config_dict = {
195
- "output_format" : output_format, #"markdown",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
  "openai_model" : self.model_id, #self.client.model_id, #"model_name"
197
  "openai_api_key" : self.openai_api_key, #self.client.openai_api_key, #self.api_token,
198
  "openai_base_url": self.openai_base_url, #self.client.base_url, #self.base_url,
@@ -200,94 +226,19 @@ class DocumentConverter:
200
  "top_p" : self.top_p, #self.client.top_p,
201
  "openai_image_format": self.openai_image_format, #"webp", #"png" #better compatibility
202
  "pdftext_workers": self.max_workers, ## number of workers to use for pdftext."
203
- "max_retries" : self.max_retries, #3, ## pass to __call__
 
204
  "output_dir" : self.output_dir,
205
- "use_llm" : self.use_llm, #False, #True,
206
- "force_ocr" : self.force_ocr, #False,
 
 
207
  "page_range" : self.page_range, ##debug #len(pdf_file)
208
- }
 
209
  return config_dict
210
  except Exception as exc:
211
  tb = traceback.format_exc() #exc.__traceback__
212
  logger.exception(f"✗ Error configuring custom config_dict: {exc}\n{tb}")
213
  raise RuntimeError(f"✗ Error configuring custom config_dict: {exc}\n{tb}") #").with_traceback(tb)
214
- #raise
215
-
216
- ##SMY: flagged for deprecation
217
- ##SMY: marker prefer default artifact dictionary (marker.models.create_model_dict) instead of overridding
218
- #def get_extraction_converter(self, chat_fn):
219
- def get_create_model_dict(self):
220
- """
221
- Wraps the LLM chat_fn into marker’s artifact_dict
222
- and returns an ExtractionConverter for PDFs & HTML.
223
- """
224
- return create_model_dict()
225
- #artifact_dict = create_model_dict(inhouse_chat_model=chat_fn)
226
- #return artifact_dict
227
-
228
- ## SMY: Kept for future implementation (and historic reasoning). Keeping the classes separate to avoid confusion with the original implementation
229
- '''
230
- class DocumentExtractor:
231
- """
232
- Business logic wrapper using HFChatClient and Marker to
233
- convert documents (PDF, HTML files) into markdowns + assets
234
- Wrapper around the Marker extraction converter for PDFs & HTML.
235
- """
236
-
237
- def __init__(self,
238
- provider: str,
239
- model_id: str,
240
- hf_provider: str,
241
- endpoint_url: str,
242
- backend_choice: str,
243
- system_message: str,
244
- max_tokens: int,
245
- temperature: float,
246
- top_p: float,
247
- stream: bool,
248
- api_token: str,
249
- ):
250
- # 1) Instantiate the LLM Client (HFChatClient): Get a provider‐agnostic chat function
251
- try:
252
- self.client = HFChatClient(
253
- provider=provider,
254
- model_id=model_id,
255
- hf_provider=hf_provider,
256
- endpoint_url=endpoint_url,
257
- backend_choice=backend_choice, #choices=["model-id", "provider", "endpoint"]
258
- system_message=system_message,
259
- max_tokens=max_tokens,
260
- temperature=temperature,
261
- top_p=top_p,
262
- stream=stream,
263
- api_token=api_token,
264
- )
265
- logger.log(level=20, msg="✔️ HFChatClient instantiated:", extra={"model_id": model_id, "chatclient": str(self.client)})
266
-
267
- except Exception as exc:
268
- tb = traceback.format_exc() #exc.__traceback__
269
- logger.exception(f"✗ Error initialising HFChatClient: {exc}")
270
- raise RuntimeError(f"✗ Error initialising HFChatClient: {exc}").with_traceback(tb)
271
- #raise
272
-
273
- # 2) Build Marker's artifact dict using the client's chat method
274
- self.artifact_dict = self.get_extraction_converter(self.client)
275
-
276
- # 3) Instantiate Marker's ExtractionConverter (ExtractionConverter)
277
- try:
278
- self.extractor = MarkerExtractor(artifact_dict=self.artifact_dict)
279
- except Exception as exc:
280
- logger.exception(f"✗ Error initialising MarkerExtractor: {exc}")
281
- raise RuntimeError(f"✗ Error initialising MarkerExtractor: {exc}")
282
-
283
- ##SMY: marker prefer default artifact dictionary (marker.models.create_model_dict) instead of overridding
284
- def get_extraction_converter(self, chat_fn):
285
- """
286
- Wraps the LLM chat_fn into marker’s artifact_dict
287
- and returns an ExtractionConverter for PDFs & HTML.
288
- """
289
-
290
- artifact_dict = create_model_dict(inhouse_chat_model=chat_fn)
291
- return artifact_dict
292
- '''
293
-
 
2
  from pathlib import Path
3
  import traceback
4
  #import time
5
+ from typing import Dict, Any, Type, Optional, Union, Literal #, BaseModel
6
  from pydantic import BaseModel
7
 
8
  from marker.models import create_model_dict
 
10
  from marker.converters.pdf import PdfConverter as MarkerConverter ## full document conversion/extraction
11
  from marker.config.parser import ConfigParser ## Process custom configuration
12
  from marker.services.openai import OpenAIService as MarkerOpenAIService
13
+ from marker.settings import settings
14
  #from sympy import Union
15
 
 
 
 
 
 
16
  from utils.logger import get_logger
17
 
18
  logger = get_logger(__name__)
 
44
  api_token: str,
45
  openai_base_url: str = "https://router.huggingface.co/v1",
46
  openai_image_format: Optional[str] = "webp",
47
+ max_workers: Optional[str] = 1, #4, for config_dict["pdftext_workers"]
48
  max_retries: Optional[int] = 2,
49
+ debug: Optional[bool] = None, #bool = False,
50
+ #output_format: str = "markdown",
51
+ output_format: Literal["markdown", "json", "html"] = "markdown",
52
  output_dir: Optional[Union[str, Path]] = "output_dir",
53
  use_llm: Optional[bool] = None, #bool = False, #Optional[bool] = False, #True,
54
  force_ocr: Optional[bool] = None, #bool = False,
55
+ strip_existing_ocr: Optional[bool] = None, #bool = False,
56
+ disable_ocr_math: Optional[bool] = None, #bool = False,
57
+ page_range: Optional[str] = None, #str = None #Optional[str] = None,
58
  ):
59
 
60
  #self.converter = None #MarkerConverter
 
65
  self.top_p = top_p # self.client.top_p,
66
  self.llm_service = MarkerOpenAIService
67
  self.openai_image_format = openai_image_format #"png" #better compatibility
68
+ self.max_workers = max_workers #int(1) ## pass to config_dict["pdftext_workers"]
69
  self.max_retries = max_retries ## pass to __call__
70
+ self.debug = debug
71
+ #self.output_format = output_format
72
+ self.output_format = output_format
73
+ self.output_dir = settings.DEBUG_DATA_FOLDER if debug else output_dir,
74
+ self.use_llm = use_llm if use_llm else False #use_llm[0] if isinstance(use_llm, tuple) else use_llm, #False, #True,
75
  self.force_ocr = force_ocr if force_ocr else False
76
+ self.strip_existing_ocr = strip_existing_ocr #if strip_existing_ocr else False
77
+ self.disable_ocr_math = disable_ocr_math #if disable_ocr else False
78
  #self.page_range = page_range[0] if isinstance(page_range, tuple) else page_range ##SMY: iterating twice because self.page casting as hint type tuple!
79
  self.page_range = page_range if page_range else None
80
  # self.page_range = page_range[0] if isinstance(page_range, tuple) else page_range if isinstance(page_range, str) else None, ##Example: "0,4-8,16" ##Marker parses as List[int] #]debug #len(pdf_file)
81
+
82
+ self.converter = None
 
 
 
 
83
 
84
  # 0) Instantiate the LLM Client (OPENAIChatClient): Get a provider‐agnostic chat function
85
  ##SMY: #future. Plan to integrate into Marker: uses its own LLM services (clients). As at 1.9.2, there's no huggingface client service.
 
103
  # 1) # Define the custom configuration for the Hugging Face LLM.
104
  # Use typing.Dict and typing.Any for flexible dictionary type hints
105
  try:
106
+ #self.config_dict: Dict[str, Any] = self.get_config_dict(model_id=model_id, llm_service=str(self.llm_service), output_format=output_format)
107
+ self.config_dict: Dict[str, Any] = self.get_config_dict()
108
 
109
  ##SMY: execute if page_range is none. `else None` ensures valid syntactic expression
110
  ##SMY: if falsely empty tuple () or None, pop the "page_range" key-value pair, else do nothing if truthy tuple value (i.e. keep as-is)
111
  self.config_dict.pop("page_range", None) if not self.config_dict.get("page_range") else None
112
+ # use_llm test moved to config_dict
113
+ #self.config_dict.pop("use_llm", None) if not self.config_dict.get("use_llm") or self.config_dict.get("use_llm") is False or self.config_dict.get("use_llm") == 'False' else None
114
  self.config_dict.pop("force_ocr", None) if not self.config_dict.get("force_ocr") or self.config_dict.get("force_ocr") is False or self.config_dict.get("force_ocr") == 'False' else None
115
 
116
  logger.log(level=20, msg="✔️ config_dict custom configured:", extra={"service": "openai"}) #, "config": str(self.config_dict)})
 
153
 
154
  # 4) Instantiate Marker's MarkerConverter (PdfConverter) with config managed by config_parser
155
  try: # Assign llm_service if api_token. ##SMY: split and slicing ##Gets the string value
156
+ #llm_service_str = None if api_token == '' or api_token is None or self.use_llm is False else str(self.llm_service).split("'")[1] #
157
+ llm_service_str = None if not self.use_llm or self.use_llm == "False" or self.use_llm is False else str(self.llm_service).split("'")[1] #
158
 
159
  # sets api_key required by Marker ## to handle Marker's assertion test on OpenAI
160
  if llm_service_str:
 
178
 
179
  logger.log(level=20, msg="✔️ MarkerConverter instantiated successfully:", extra={"converter.config": str(self.converter.config.get("openai_base_url")), "use_llm":self.converter.use_llm})
180
  #return self.converter ##SMY: to query why did I comment out?. Bingo: "__init__() should return None, not 'PdfConverter'"
181
+
182
  except Exception as exc:
183
  tb = traceback.format_exc
184
  logger.exception(f"✗ Error initialising MarkerExtractor: {exc}\n{tb}")
185
  raise RuntimeError(f"✗ Error initialising MarkerExtractor: {exc}\n{tb}")
186
 
187
  # Define the custom configuration for HF LLM.
188
+ #def get_config_dict(self, model_id: str, llm_service=MarkerOpenAIService, output_format: Optional[str] = "markdown" ) -> Dict[str, Any]:
189
+ def get_config_dict(self, ) -> Dict[str, Any]:
190
  """ Define the custom configuration for the Hugging Face LLM: combining Markers cli_options and LLM. """
191
 
192
  try:
 
197
  self.page_range = self.page_range[0] if isinstance(self.page_range, tuple) else self.page_range #if isinstance(self.page_range, str) else None, ##SMY: passing as hint type tuple!
198
 
199
  ##SMY: TODO: convert to {inputs} and called from gradio_ui
200
+ if not self.use_llm or self.use_llm == 'False':
201
+ config_dict = {
202
+ "output_format" : self.output_format, #"markdown",
203
+ #"openai_model" : self.model_id, #self.client.model_id, #"model_name"
204
+ #"openai_api_key" : self.openai_api_key, #self.client.openai_api_key, #self.api_token,
205
+ #"openai_base_url": self.openai_base_url, #self.client.base_url, #self.base_url,
206
+ #"temperature" : self.temperature, #self.client.temperature,
207
+ #"top_p" : self.top_p, #self.client.top_p,
208
+ #"openai_image_format": self.openai_image_format, #"webp", #"png" #better compatibility
209
+ "pdftext_workers": self.max_workers, ## number of workers to use for pdftext."
210
+ #"max_retries" : self.max_retries, #3, ## pass to __call__
211
+ "debug" : self.debug,
212
+ "output_dir" : self.output_dir,
213
+ #"use_llm" : self.use_llm, #False, #True,
214
+ "force_ocr" : self.force_ocr, #False,
215
+ "strip_existing_ocr": self.strip_existing_ocr, #False
216
+ "disable_ocr_math": self.disable_ocr_math,
217
+ "page_range" : self.page_range, ##debug #len(pdf_file)
218
+ }
219
+ else:
220
+ config_dict = {
221
+ "output_format" : self.output_format, #"markdown",
222
  "openai_model" : self.model_id, #self.client.model_id, #"model_name"
223
  "openai_api_key" : self.openai_api_key, #self.client.openai_api_key, #self.api_token,
224
  "openai_base_url": self.openai_base_url, #self.client.base_url, #self.base_url,
 
226
  "top_p" : self.top_p, #self.client.top_p,
227
  "openai_image_format": self.openai_image_format, #"webp", #"png" #better compatibility
228
  "pdftext_workers": self.max_workers, ## number of workers to use for pdftext."
229
+ #"max_retries" : self.max_retries, #3, ## pass to __call__
230
+ "debug" : self.debug,
231
  "output_dir" : self.output_dir,
232
+ "use_llm" : self.use_llm, #False, #True,
233
+ "force_ocr" : self.force_ocr, #False,
234
+ "strip_existing_ocr": self.strip_existing_ocr, #False
235
+ "disable_ocr_math": self.disable_ocr_math,
236
  "page_range" : self.page_range, ##debug #len(pdf_file)
237
+ }
238
+
239
  return config_dict
240
  except Exception as exc:
241
  tb = traceback.format_exc() #exc.__traceback__
242
  logger.exception(f"✗ Error configuring custom config_dict: {exc}\n{tb}")
243
  raise RuntimeError(f"✗ Error configuring custom config_dict: {exc}\n{tb}") #").with_traceback(tb)
244
+ #raise
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
converters/pdf_to_md.py CHANGED
@@ -5,97 +5,27 @@ from typing import List, Dict, Union, Optional
5
  import traceback ## Extract, format and print information about Python stack traces.
6
  import time
7
 
 
8
  import spaces
9
- from globals import config_load_models
10
 
11
  from converters.extraction_converter import DocumentConverter #, DocumentExtractor #as docextractor #ExtractionConverter #get_extraction_converter ## SMY: should disuse
12
- from file_handler.file_utils import write_markdown, dump_images, collect_pdf_paths, collect_html_paths, collect_markdown_paths, create_outputdir
13
 
14
- from utils import config
15
  from utils.lib_loader import set_weasyprint_library
16
  from utils.logger import get_logger
17
 
18
  logger = get_logger(__name__)
19
 
20
  # Define global variables ##SMY: TODO: consider moving to Globals sigleton constructor
21
- docconverter: DocumentConverter = None
22
- converter = None #DocumentConverter
23
- #converter:DocumentConverter.converter = None
24
-
25
- #@spaces.GPU
26
- duration = 60*config_load_models.pdf_files_count if config_load_models.pdf_files_count>=10 else 360 ## sec
27
- @spaces.GPU(duration=duration) ## HF Spaces GPU support
28
  # Define docextractor in the pool as serialised object and passed to each worker process.
29
  # Note: DocumentConverter must be "picklable".
30
- def init_worker(#self,
31
- provider: str,
32
- model_id: str,
33
- #base_url,
34
- hf_provider: str,
35
- endpoint_url: str,
36
- backend_choice: str,
37
- system_message: str,
38
- max_tokens: int,
39
- temperature: float,
40
- top_p: float,
41
- stream: bool,
42
- api_token: str,
43
- openai_base_url: str, #: str = "https://router.huggingface.co/v1",
44
- openai_image_format: str, #: str | None = "webp",
45
- max_workers: int,
46
- max_retries: int, #: int | None = 2,
47
- output_format: str, #: str = "markdown",
48
- output_dir: str, #: Union | None = "output_dir",
49
- use_llm: bool, #: bool | None = False,
50
- force_ocr: bool,
51
- page_range: str, #: str | None = None
52
- ):
53
- #'''
54
- """
55
- instantiate DocumentConverter/DocumentExtractor for use in each pool worker
56
- Args:
57
-
58
- """
59
 
60
- ## moved to class
61
- # Initialise the global `converter` in each worker
62
- # Define global variables
63
- global docconverter
64
- global converter
65
-
66
- #'''
67
- # 1) Instantiate the DocumentConverter
68
- logger.log(level=20, msg="initialising docconverter:", extra={"model_id": model_id, "hf_provider": hf_provider}) ##debug
69
-
70
- try:
71
- docconverter = DocumentConverter(
72
- model_id, #: str,
73
- hf_provider, #: str,
74
- temperature, #: float,
75
- top_p, #: float,
76
- api_token, #: str,
77
- openai_base_url, #: str = "https://router.huggingface.co/v1",
78
- openai_image_format, #: str | None = "webp",
79
- max_workers, #: int | None = 1,
80
- max_retries, #: int | None = 2,
81
- output_format, #: str = "markdown",
82
- output_dir, #: Union | None = "output_dir",
83
- use_llm, #: bool | None = False,
84
- force_ocr,
85
- page_range, #: str | None = None
86
- )
87
- logger.log(level=20, msg="✔️ docextractor initialised:", extra={"docconverter model_id": docconverter.converter.config.get("openai_model"), "docconverter use_llm": docconverter.converter.use_llm, "docconverter output_dir": docconverter.output_dir})
88
- except Exception as exc:
89
- #logger.error(f"Failed to initialise DocumentConverter: {exc}") #debug
90
- tb = traceback.format_exc()
91
- logger.exception(f"init_worker: Error initialising DocumentConverter → {exc}\n{tb}", exc_info=True)
92
- return f"✗ init_worker: error initialising DocumentConverter → {exc}\n{tb}"
93
-
94
- #docconverter = docconverter
95
- converter = docconverter.converter
96
- #self.llm_service = docconverter.llm_service ##duplicate?
97
- #self.model_id = model_id ##duplicate?
98
- #'''
99
 
100
  class PdfToMarkdownConverter:
101
  """
@@ -106,22 +36,90 @@ class PdfToMarkdownConverter:
106
  def __init__(self, options: Dict | None = None): #extractor: DocumentExtractor, options: Dict | None = None):
107
  self.options = options or {} ##SMY: TOBE implemented - bring all Marker's options
108
  self.output_dir_string = ''
109
- self.output_dir = self.output_dir_string ## placeholder
110
- #self.OUTPUT_DIR = config.OUTPUT_DIR ##flag unused
111
- #self.MAX_RETRIES = config.MAX_RETRIES ##flag unused
112
- #self.docconverter = None #DocumentConverter
113
- #self.converter = self.docconverter.converter #None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
 
115
- # This global will be set (re-initialised) in each worker after init_worker runs
 
 
 
 
116
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
- #duration = 5.75 * pdf_files_count if pdf_files_count>=2 else 7
119
- #duration = 10
120
- #@spaces.GPU(duration=duration) ## HF Spaces GPU support
121
- #@spaces.GPU
122
- ## moved from extraction_converter ( to standalone extract_to_md)
123
  #def extract(self, src_path: str, output_dir: str) -> Dict[str, int, Union[str, Path]]:
124
- def extract(self, src_path: str, output_dir: str): #Dict:
125
  #def extract(src_path: str, output_dir: str) -> Dict[str, int]: #, extractor: DocumentExtractor) -> Dict[str, int]:
126
  """
127
  Convert one file (PDF/HTML) to Markdown + images.
@@ -140,13 +138,29 @@ class PdfToMarkdownConverter:
140
  logger.exception(f"Error loading weasyprint backend dependency → {exc}\n{tb}", exc_info=True) # Log the full traceback
141
  raise RuntimeWarning(f"✗ error during loading weasyprint backend dependency → {exc}\n{tb}")
142
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
  # Run Marker conversion with LLM if use_llm is true
145
  try:
146
- #rendered = self.docconverter.converter(src_path, use_llm=True)
 
 
147
  #rendered = self.docconverter.converter(src_path)
148
- rendered = converter(src_path)
 
149
  logger.log(level=20, msg=f"✓ File extraction successful for {Path(src_path).name}")
 
 
150
  except Exception as exc:
151
  tb = traceback.format_exc()
152
  logger.exception(f"Error during file extraction → {exc}\n{tb}", exc_info=True) # Log the full traceback
@@ -154,15 +168,8 @@ class PdfToMarkdownConverter:
154
  return f"✗ error during extraction → {exc}\n{tb}"
155
 
156
  # Write Markdown file
157
- '''
158
- base = Path(str_path).stem ## Get filename without extension
159
- md_path = output_dir / f"{base}.md" # Join output dir and new markdown file with the slash operator
160
-
161
- with open(md_path, "w", encoding="utf-8") as f:
162
- f.write(rendered.markdown)
163
- '''
164
  try:
165
- md_file = write_markdown(src_path=src_path, output_dir=output_dir, rendered=rendered)
166
  #debug md_file = "debug_md_file dummy name" ##debug
167
  except Exception as exc:
168
  tb = traceback.format_exc()
@@ -181,9 +188,9 @@ class PdfToMarkdownConverter:
181
  #return {"images": len(rendered.images), "file": md_file} ##debug
182
  return {"file": md_file.name, "images": images_count, "filepath": md_file, "image_path": image_path} ####SMY should be Dict[str, int, str]. Dicts are not necessarily ordered.
183
 
184
- #duration = 5.75 * pdf_files_count if pdf_files_count>=2 else 7
185
- duration = 60*config_load_models.pdf_files_count if config_load_models.pdf_files_count>=10 else 360 ## sec
186
- @spaces.GPU(duration=duration) ## HF Spaces GPU support
187
  #def convert_files(src_path: str, output_dir: str, max_retries: int = 2) -> str:
188
  #def convert_files(self, src_path: str, output_dir_string: str = None, max_retries: int = 2, progress = gr.Progress()) -> Union[Dict, str]: #str:
189
  def convert_files(self, src_path: str, max_retries: int = 2) -> Union[Dict, str]:
@@ -200,13 +207,15 @@ class PdfToMarkdownConverter:
200
  tb = traceback.format_exc()
201
  logger.exception("✗ error creating output_dir → {exc}\n{tb}", exc_info=True)
202
  return f"✗ error creating output_dir → {exc}\n{tb}"'''
203
- output_dir = Path(self.output_dir) ## takes the value from gradio_ui
 
 
204
 
205
  try:
206
  #if Path(src_path).suffix.lower() not in {".pdf", ".html", ".htm"}:
207
  #if not Path(src_path).name.endswith(tuple({".pdf", ".html"})): #,".docx", ".doc", ".pptx", ".ppt", ".xlsx", ".xls"})):
208
  #if not Path(src_path).name.endswith((".pdf", ".html", ".docx", ".doc")): #,".docx", ".doc", ".pptx", ".ppt", ".xlsx", ".xls"})):
209
- if not Path(src_path).name.endswith(config.file_types_tuple): #,".docx", ".doc", ".pptx", ".ppt", ".xlsx", ".xls"})):
210
  logger.log(level=20, msg=f"skipped {Path(src_path).name}", exc_info=True)
211
  return f"skipped {Path(src_path).name}"
212
  except Exception as exc:
 
5
  import traceback ## Extract, format and print information about Python stack traces.
6
  import time
7
 
8
+ from gradio import Progress as grP
9
  import spaces
10
+ from globals import config_load_models, config_load
11
 
12
  from converters.extraction_converter import DocumentConverter #, DocumentExtractor #as docextractor #ExtractionConverter #get_extraction_converter ## SMY: should disuse
13
+ from utils.file_utils import write_markdown, dump_images, collect_pdf_paths, collect_html_paths, collect_markdown_paths, create_outputdir
14
 
15
+ #from utils import config
16
  from utils.lib_loader import set_weasyprint_library
17
  from utils.logger import get_logger
18
 
19
  logger = get_logger(__name__)
20
 
21
  # Define global variables ##SMY: TODO: consider moving to Globals sigleton constructor
22
+ ## moved to class
23
+ #docconverter: DocumentConverter = None
24
+ #converter = None #DocumentConverter
 
 
 
 
25
  # Define docextractor in the pool as serialised object and passed to each worker process.
26
  # Note: DocumentConverter must be "picklable".
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
+ #def init_worker(#self, ...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  class PdfToMarkdownConverter:
31
  """
 
36
  def __init__(self, options: Dict | None = None): #extractor: DocumentExtractor, options: Dict | None = None):
37
  self.options = options or {} ##SMY: TOBE implemented - bring all Marker's options
38
  self.output_dir_string = ''
39
+ self.output_dir = '' #self.output_dir_string ## placeholder
40
+ self.docconverter = None #DocumentConverter
41
+ self.converter = None #self.docconverter.converter #None
42
+
43
+ def init_docconverter(self, output_dir: Union[str, Path] = config_load.output_dir, progress3=grP(track_tqdm=True)):
44
+ #'''
45
+ """
46
+ instantiate DocumentConverter/DocumentExtractor for use
47
+ Args:
48
+ ##TODO
49
+ """
50
+
51
+ provider: str = config_load.provider
52
+ model_id: str = config_load.model_id
53
+ #base_url,
54
+ hf_provider: str = config_load.hf_provider
55
+ endpoint_url: str = config_load.endpoint
56
+ backend_choice: str = config_load.backend_choice
57
+ system_message: str = config_load.system_message
58
+ max_tokens: int = config_load.max_tokens
59
+ temperature: float = config_load.temperature
60
+ top_p: float = config_load.top_p
61
+ stream: bool = config_load.stream
62
+ api_token: str = config_load.api_token
63
+ openai_base_url: str = config_load.openai_base_url
64
+ openai_image_format: str = config_load.openai_image_format
65
+ max_workers: int = config_load.max_workers
66
+ max_retries: int = config_load.max_retries
67
+ debug: bool = config_load.debug
68
+ output_format: str = config_load.output_format
69
+ output_dir: Union[str, Path] = config_load.output_dir_string #output_dir #
70
+ use_llm: bool = config_load.use_llm
71
+ force_ocr: bool = config_load.force_ocr
72
+ strip_existing_ocr: bool = config_load.strip_existing_ocr
73
+ disable_ocr_math: bool = config_load.disable_ocr_math
74
+ page_range: str = config_load.page_range
75
+
76
 
77
+ # 1) Instantiate the DocumentConverter
78
+ logger.log(level=20, msg="initialising docconverter:", extra={"model_id": model_id, "hf_provider": hf_provider}) ##debug
79
+ progress3((0,1), desc=f"initialising docconverter: ...")
80
+ #progress2((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
81
+ time.sleep(0.75) #.sleep(0.25)
82
 
83
+ try:
84
+ docconverter = DocumentConverter(
85
+ model_id, #: str,
86
+ hf_provider, #: str,
87
+ temperature, #: float,
88
+ top_p, #: float,
89
+ api_token, #: str,
90
+ openai_base_url, #: str = "https://router.huggingface.co/v1",
91
+ openai_image_format, #: str | None = "webp",
92
+ max_workers, #: int | None = 1,
93
+ max_retries, #: int | None = 2,
94
+ debug, #: bool = False
95
+ output_format, #: str = "markdown",
96
+ output_dir, #: Union | None = "output_dir",
97
+ use_llm, #: bool | None = False,
98
+ force_ocr, #: bool | None = False,
99
+ strip_existing_ocr, #bool = False,
100
+ disable_ocr_math, #bool = False,
101
+ page_range, #: str | None = None
102
+ )
103
+ logger.log(level=20, msg="✔️ docextractor initialised:", extra={"docconverter model_id": docconverter.converter.config.get("openai_model"), "docconverter use_llm": docconverter.converter.use_llm, "docconverter output_dir": docconverter.output_dir})
104
+ progress3((1,1), desc=f"✔️ docextractor initialised:")
105
+ time.sleep(0.75) #.sleep(0.25)
106
+ except Exception as exc:
107
+ #logger.error(f"Failed to initialise DocumentConverter: {exc}") #debug
108
+ tb = traceback.format_exc()
109
+ logger.exception(f"init_worker: Error initialising DocumentConverter → {exc}\n{tb}", exc_info=True)
110
+ return f"✗ init_worker: error initialising DocumentConverter → {exc}\n{tb}"
111
+
112
+ converter = docconverter.converter
113
+ self.docconverter = docconverter
114
+ self.converter = converter
115
+
116
+ #return converter
117
 
118
+ #duration = 60*config_load_models.pdf_files_count if config_load_models.pdf_files_count>=10 else 360 ## sec
119
+ duration = 60*config_load_models.pdf_files_count if config_load_models.use_llm else 90 ## sec
120
+ @spaces.GPU(duration=duration) ## HF Spaces GPU support
 
 
121
  #def extract(self, src_path: str, output_dir: str) -> Dict[str, int, Union[str, Path]]:
122
+ def extract(self, src_path: str, output_dir: str, progress4=grP()): #Dict:
123
  #def extract(src_path: str, output_dir: str) -> Dict[str, int]: #, extractor: DocumentExtractor) -> Dict[str, int]:
124
  """
125
  Convert one file (PDF/HTML) to Markdown + images.
 
138
  logger.exception(f"Error loading weasyprint backend dependency → {exc}\n{tb}", exc_info=True) # Log the full traceback
139
  raise RuntimeWarning(f"✗ error during loading weasyprint backend dependency → {exc}\n{tb}")
140
 
141
+ # Initialise Marker Converter
142
+ try:
143
+ if not self.converter:
144
+ self.init_docconverter(output_dir)
145
+
146
+ logger.log(level=20, msg=f"✓ Initialised Marker Converter")
147
+ except Exception as exc:
148
+ tb = traceback.format_exc()
149
+ logger.exception(f"Error during Marker Converter initialisation → {exc}\n{tb}", exc_info=True) # Log the full traceback
150
+
151
+ return f"✗ error during extraction → {exc}\n{tb}"
152
 
153
  # Run Marker conversion with LLM if use_llm is true
154
  try:
155
+ progress4((0,1), desc=f"Extracting File: {Path(src_path).name}")
156
+ time.sleep(0.75) #.sleep(0.25)
157
+
158
  #rendered = self.docconverter.converter(src_path)
159
+ rendered = self.converter(src_path)
160
+
161
  logger.log(level=20, msg=f"✓ File extraction successful for {Path(src_path).name}")
162
+ progress4((1,1), desc=f"✓ File extraction successful for {Path(src_path).name}")
163
+ time.sleep(0.75) #.sleep(0.25)
164
  except Exception as exc:
165
  tb = traceback.format_exc()
166
  logger.exception(f"Error during file extraction → {exc}\n{tb}", exc_info=True) # Log the full traceback
 
168
  return f"✗ error during extraction → {exc}\n{tb}"
169
 
170
  # Write Markdown file
 
 
 
 
 
 
 
171
  try:
172
+ md_file = write_markdown(src_path=src_path, output_dir=output_dir, rendered=rendered, output_format=config_load.output_format)
173
  #debug md_file = "debug_md_file dummy name" ##debug
174
  except Exception as exc:
175
  tb = traceback.format_exc()
 
188
  #return {"images": len(rendered.images), "file": md_file} ##debug
189
  return {"file": md_file.name, "images": images_count, "filepath": md_file, "image_path": image_path} ####SMY should be Dict[str, int, str]. Dicts are not necessarily ordered.
190
 
191
+ #duration = 60*config_load_models.pdf_files_count if config_load_models.pdf_files_count>=10 else 360 ## sec
192
+ #@spaces.GPU(duration=duration) ## HF Spaces GPU support
193
+
194
  #def convert_files(src_path: str, output_dir: str, max_retries: int = 2) -> str:
195
  #def convert_files(self, src_path: str, output_dir_string: str = None, max_retries: int = 2, progress = gr.Progress()) -> Union[Dict, str]: #str:
196
  def convert_files(self, src_path: str, max_retries: int = 2) -> Union[Dict, str]:
 
207
  tb = traceback.format_exc()
208
  logger.exception("✗ error creating output_dir → {exc}\n{tb}", exc_info=True)
209
  return f"✗ error creating output_dir → {exc}\n{tb}"'''
210
+ #output_dir = Path(self.output_dir) ## takes the value from gradio_ui
211
+ output_dir = Path(config_load.output_dir) # Takes the value when output_dir is created in gradio_process
212
+ self.output_dir = output_dir
213
 
214
  try:
215
  #if Path(src_path).suffix.lower() not in {".pdf", ".html", ".htm"}:
216
  #if not Path(src_path).name.endswith(tuple({".pdf", ".html"})): #,".docx", ".doc", ".pptx", ".ppt", ".xlsx", ".xls"})):
217
  #if not Path(src_path).name.endswith((".pdf", ".html", ".docx", ".doc")): #,".docx", ".doc", ".pptx", ".ppt", ".xlsx", ".xls"})):
218
+ if not Path(src_path).name.endswith(config_load.file_types_tuple): #,".docx", ".doc", ".pptx", ".ppt", ".xlsx", ".xls"})):
219
  logger.log(level=20, msg=f"skipped {Path(src_path).name}", exc_info=True)
220
  return f"skipped {Path(src_path).name}"
221
  except Exception as exc:
globals.py CHANGED
@@ -9,8 +9,50 @@ class Config:
9
  self.weasyprint_libpath = ""
10
  self.config_ini = "utils\\config.ini"
11
  self.pdf_files_count = 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  # Create a single, shared instance of the Config class
14
  # Other modules will import and use this instance.
15
  config_load_models = Config()
 
16
 
 
 
 
9
  self.weasyprint_libpath = ""
10
  self.config_ini = "utils\\config.ini"
11
  self.pdf_files_count = 0
12
+ self.output_dir = ""
13
+
14
+ # File types
15
+ self.file_types_list = []
16
+ self.file_types_tuple = (".pdf", ".html", ".docx", ".doc")
17
+
18
+ # all other variables shared across the app
19
+ #self.pdf_files: list[str] = []
20
+ #self.pdf_files_count: int = 0
21
+ self.provider: str = ""
22
+ self.model_id: str = ""
23
+ #base_url: str
24
+ self.hf_provider: str = ""
25
+ self.endpoint: str = ""
26
+ self.backend_choice: str = ""
27
+ self.system_message: str = ""
28
+ self.max_tokens: int = 8192
29
+ self.temperature: float = 1.0
30
+ self.top_p: float = 1.0
31
+ self.stream: bool = False
32
+ self.api_token: str = ""
33
+ self.openai_base_url: str = "https://router.huggingface.co/v1"
34
+ self.openai_image_format: str = "webp"
35
+ self.max_workers: int = 1
36
+ self.max_retries: int = 2
37
+ self.debug: bool = False
38
+ #output_format: str = "markdown",
39
+ self.output_format: str = "markdown"
40
+ self.output_dir_string: str = "output_dir_default"
41
+ self.use_llm: bool = False
42
+ self.force_ocr: bool = True #False,
43
+ self.strip_existing_ocr: bool = False #bool = False,
44
+ self.disable_ocr_math: bool = None #bool = False,
45
+ self.page_range: str = None
46
+ #self.weasyprint_dll_directories: str = None,
47
+ self.tz_hours: str = None
48
+ #oauth_token: gr.OAuthToken | None=None,
49
+ #progress: gr.Progress = gr.Progress(track_tqdm=True), #Progress tracker to keep tab on pool queue executor
50
+
51
 
52
  # Create a single, shared instance of the Config class
53
  # Other modules will import and use this instance.
54
  config_load_models = Config()
55
+ config_load = Config()
56
 
57
+ #if __name__ == "__main__":
58
+
tests/test_file_handler.py CHANGED
@@ -6,7 +6,7 @@ from pathlib import Path
6
  import tempfile
7
  from unittest.mock import patch
8
 
9
- from file_handler.file_utils import (
10
  collect_pdf_paths, collect_html_paths, collect_markdown_paths,
11
  process_dicts_data, create_outputdir
12
  )
 
6
  import tempfile
7
  from unittest.mock import patch
8
 
9
+ from utils.file_utils import (
10
  collect_pdf_paths, collect_html_paths, collect_markdown_paths,
11
  process_dicts_data, create_outputdir
12
  )
ui/gradio_process.py ADDED
@@ -0,0 +1,455 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ui/gradio_process.py
2
+
3
+ import gradio as gr
4
+ from concurrent.futures import ProcessPoolExecutor, as_completed
5
+ from tqdm import tqdm
6
+
7
+ import time
8
+
9
+ from pathlib import Path, WindowsPath
10
+ from typing import Optional, Union, Literal #, Dict, List, Any, Tuple
11
+
12
+ from huggingface_hub import get_token
13
+ import spaces ##HuggingFace spaces to accelerate GPU support on HF Spaces
14
+
15
+ #import utilities, helpers
16
+ #import utils.file_utils
17
+ from utils.file_utils import zip_processed_files, process_dicts_data, create_temp_folder #, collect_pdf_paths, collect_html_paths, collect_markdown_paths, create_outputdir ## should move to handling file
18
+ from utils.config import TITLE, DESCRIPTION, DESCRIPTION_PDF_HTML, DESCRIPTION_PDF, DESCRIPTION_HTML, DESCRIPTION_MD #, file_types_list, file_types_tuple
19
+ from utils.utils import is_dict, is_list_of_dicts
20
+ from utils.get_config import get_config_value
21
+
22
+ from llm.llm_login import get_login_token, is_loggedin_huggingface, login_huggingface
23
+ from converters.extraction_converter import DocumentConverter as docconverter #DocumentExtractor #as docextractor
24
+ from converters.pdf_to_md import PdfToMarkdownConverter #, init_worker
25
+ #from converters.md_to_pdf import MarkdownToPdfConverter ##SMY: PENDING: implementation
26
+
27
+ import traceback ## Extract, format and print information about Python stack traces.
28
+ from utils.logger import get_logger
29
+
30
+ logger = get_logger(__name__) ##NB: setup_logging() ## set logging
31
+
32
+ # Instantiate converters class once – they are stateless
33
+ pdf2md_converter = PdfToMarkdownConverter()
34
+ #md2pdf_converter = MarkdownToPdfConverter()
35
+
36
+
37
# User eXperience: Load Marker models ahead of time if not already loaded in reload mode
## SMY: 29Sept2025 - Came across https://github.com/xiaoyao9184/docker-marker/tree/master/gradio
from converters.extraction_converter import load_models
from globals import config_load_models

try:
    # getattr guard: `model_dict` is not visibly declared on Config, so a plain
    # attribute access could raise AttributeError and be mis-reported below as
    # a model-loading failure. Only load once per process (reload mode reuses).
    if not getattr(config_load_models, "model_dict", None):
        config_load_models.model_dict = load_models()
    logger.log(level=30, msg="Config_load_model: ",
               extra={"model_dict": str(config_load_models.model_dict)})
except Exception as exc:
    logger.exception(f"✗ Error loading models (reload): {exc}")
    # Re-raise with the original exception chained for debuggability.
    raise RuntimeError(f"✗ Error loading models (reload): {exc}") from exc
53
+
54
+ #def get_login_token( api_token_arg, oauth_token: gr.OAuthToken | None=None,): ##moved to llm_login
55
+
56
+
57
+ #duration = 5.75 * pdf_files_count if pdf_files_count>=2 else 7
58
+ #@spaces.GPU(duration=duration) ## HF Spaces GPU support
59
def get_results_files_conversion(pdf_files, pdf_files_count, progress2=gr.Progress(track_tqdm=True)):
    """
    Convert each uploaded file to markdown sequentially, reporting per-file
    progress to the Gradio UI.

    Marker manages its own internal pooling, so files are processed one at a
    time here; `progress2.tqdm` tracks the loop and an explicit `progress2`
    call refreshes the description after each conversion.

    Returns the list of per-file conversion results (dicts or error strings).
    """
    converted = []
    tracked_files = progress2.tqdm(
        iterable=pdf_files,
        desc=f"Processing file conversion ... pool.map",
        total=pdf_files_count,
    )
    for index, source_file in enumerate(tracked_files, start=1):
        outcome = pdf2md_converter.convert_files(source_file)

        # Refresh the progress bar with this file's (truncated) outcome so the
        # user sees per-file feedback; the short sleep lets the UI repaint.
        progress2(
            (index, pdf_files_count),
            desc=f"Processing file conversion result: {index}: {str(source_file)} : [{str(outcome)[:20]}]",
        )
        time.sleep(0.75)

        converted.append(outcome)

    return converted
81
+
82
+ ##SMY: TODO: future: refactor to gradio_process.py and
83
+ ## pull options to cli-options{"output_format":, "output_dir_string":, "use_llm":, "page_range":, "force_ocr":, "debug":, "strip_existing_ocr":, "disable_ocr_math""}
84
+ #@spaces.GPU
85
def convert_batch(
    pdf_files,  #: list[str] — paths of the uploaded files from gr.Files
    pdf_files_count: int,
    provider: str,
    model_id: str,
    #base_url: str
    hf_provider: str,
    endpoint: str,
    backend_choice: str,
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
    stream: bool,
    api_token_gr: str,
    openai_base_url: str = "https://router.huggingface.co/v1",
    openai_image_format: Optional[str] = "webp",
    max_workers: Optional[int] = 1,  #4,
    max_retries: Optional[int] = 2,
    debug: bool = False,
    output_format: Literal["markdown", "json", "html"] = "markdown",
    output_dir_string: str = "output_dir_default",
    use_llm: bool = False,
    force_ocr: bool = True,
    strip_existing_ocr: Optional[bool] = None,
    disable_ocr_math: Optional[bool] = None,
    page_range: str = None,
    weasyprint_dll_directories: str = None,  # weasyprint_libpath
    tz_hours: str = None,
    oauth_token: gr.OAuthToken | None = None,
    progress: gr.Progress = gr.Progress(track_tqdm=True),  # batch-level step tracker
    progress1: gr.Progress = gr.Progress(),  # per-result log tracker
):
    """
    Handle batch conversion of uploaded files to markdown.

    Generator driven by Gradio: every ``yield`` emits a 4-tuple matching
    ``outputs=[process_button, log_output, files_individual_JSON,
    files_individual_downloads]``.  Because this is a generator, a
    ``return <value>`` is invisible to Gradio — each terminal path therefore
    yields its final UI state first and then returns bare.

    Steps: resolve login → fill config defaults → publish shared globals →
    create output dir → convert files → collect logs → zip outputs → emit
    formatted results.
    """
    # 0) Commencing
    yield gr.update(interactive=False), f"Commencing Processing ... Getting login", {"process": "Commencing Processing"}, f"dummy_log.log"
    progress((0, 16), f"Commencing Processing ...")
    time.sleep(0.25)

    # Token precedence: explicit UI token wins, else the OAuth session token.
    # SMY: never log the token value here.
    api_token = get_login_token(api_token_arg=api_token_gr, oauth_token=oauth_token)

    progress((1, 16), desc=f"Log in: {is_loggedin_huggingface(api_token)}")
    time.sleep(0.25)

    if not pdf_files:  # None or empty list: nothing to convert
        logger.log(level=30, msg="Initialising ProcessPool: No files uploaded.", extra={"pdf_files": pdf_files, "files_len": pdf_files_count})
        # Fix: the original bare `return [...]` in a generator discarded the
        # UI update, so the user never saw the message — yield it first.
        yield gr.update(interactive=True), "Initialising ProcessPool: No files uploaded.", {"Upload": "No files uploaded"}, f"dummy_log.log"
        return

    progress((2, 16), desc=f"Getting configuration values")
    time.sleep(0.25)

    # Fall back to config.ini for any value the UI left empty.
    config_file = Path("utils") / "config.ini"  ##SMY: fixed path, sacrifices flexibility for speed
    model_id = model_id if model_id else get_config_value(config_file, "MARKER_CAP", "MODEL_ID")
    openai_base_url = openai_base_url if openai_base_url else get_config_value(config_file, "MARKER_CAP", "OPENAI_BASE_URL")
    openai_image_format = openai_image_format if openai_image_format else get_config_value(config_file, "MARKER_CAP", "OPENAI_IMAGE_FORMAT")
    max_workers = max_workers if max_workers else get_config_value(config_file, "MARKER_CAP", "MAX_WORKERS")
    max_retries = max_retries if max_retries else get_config_value(config_file, "MARKER_CAP", "MAX_RETRIES")
    output_format = output_format if output_format else get_config_value(config_file, "MARKER_CAP", "OUTPUT_FORMAT")
    output_dir_string = output_dir_string if output_dir_string else str(get_config_value(config_file, "MARKER_CAP", "OUTPUT_DIR"))
    use_llm = use_llm if use_llm else get_config_value(config_file, "MARKER_CAP", "USE_LLM")
    page_range = page_range if page_range else get_config_value(config_file, "MARKER_CAP", "PAGE_RANGE")
    weasyprint_dll_directories = weasyprint_dll_directories if weasyprint_dll_directories else None
    config_load_models.weasyprint_libpath = weasyprint_dll_directories  # share user's weasyprint path globally
    config_load_models.pdf_files_count = pdf_files_count

    progress((3, 16), desc=f"Retrieved configuration values")
    time.sleep(0.25)

    yield gr.update(interactive=False), f"Setting global variables : Initialising init_args", {"process": "Processing files ..."}, f"dummy_log.log"
    progress((4, 16), desc=f"Setting global variables : Initialiasing init_args")
    time.sleep(0.25)

    # Publish run parameters on the shared Config instance; the converter
    # classes read them from `globals.config_load` (replaces pool initargs).
    from globals import config_load
    config_load.provider = provider
    config_load.model_id = model_id
    config_load.hf_provider = hf_provider
    config_load.endpoint = endpoint
    config_load.backend_choice = backend_choice
    config_load.system_message = system_message
    config_load.max_tokens = max_tokens
    config_load.temperature = temperature
    config_load.top_p = top_p
    config_load.stream = stream
    config_load.api_token = api_token
    config_load.openai_base_url = openai_base_url
    config_load.openai_image_format = openai_image_format
    config_load.max_workers = max_workers
    config_load.max_retries = max_retries
    config_load.debug = debug
    config_load.output_format = output_format
    config_load.output_dir_string = output_dir_string
    config_load.use_llm = use_llm
    config_load.force_ocr = force_ocr
    config_load.strip_existing_ocr = strip_existing_ocr
    config_load.disable_ocr_math = disable_ocr_math
    config_load.page_range = page_range
    config_load.tz_hours = tz_hours

    # 1. Create output_dir (in a temp folder so Gradio can serve the files)
    try:
        yield gr.update(interactive=False), f"Creating output_dir ...", {"process": "Processing files ..."}, f"dummy_log.log"
        progress((5, 16), desc=f"ProcessPoolExecutor: Creating output_dir")
        time.sleep(0.25)

        output_dir = create_temp_folder(output_dir_string)
        config_load.output_dir = output_dir

        logger.info("✓ output_dir created: ", extra={"output_dir": config_load.output_dir.name, "in": str(config_load.output_dir.parent)})
        yield gr.update(interactive=False), f"Created output_dir ...", {"process": "Processing files ..."}, f"dummy_log.log"
        progress((6, 16), desc=f"✓ Created output_dir.")
        time.sleep(0.25)
    except Exception as exc:
        tb = traceback.format_exc()
        # Fix: original message lacked the f-prefix and logged literal "{exc}".
        logger.exception(f"✗ error creating output_dir → {exc}\n{tb}", exc_info=True)
        yield gr.update(interactive=True), f"✗ An error occurred creating output_dir: {str(exc)}", {"Error": f"Error: {exc}"}, f"dummy_log.log"
        return

    # 2. Convert the files. ProcessPoolExecutor was dropped deliberately:
    #    Marker already leverages ThreadPoolExecutor/ProcessPoolExecutor itself.
    try:
        results = []
        logger.log(level=30, msg="Initialising Processing Files ...", extra={"pdf_files": pdf_files, "files_len": len(pdf_files), "model_id": model_id, "output_dir": output_dir_string})
        yield gr.update(interactive=False), f"Initialising Processing Files ...", {"process": "Processing files ..."}, f"dummy_log.log"
        progress((7, 16), desc=f"Initialising Processing Files ...")
        time.sleep(0.25)

        try:
            progress((9, 16), desc=f"ProcessPoolExecutor: Pooling file conversion ...")
            time.sleep(0.25)
            yield gr.update(interactive=False), f"ProcessPoolExecutor: Pooling file conversion ...", {"process": "Processing files ..."}, f"dummy_log.log"

            results = get_results_files_conversion(pdf_files, pdf_files_count, progress)

            logger.log(level=30, msg="Got Results from files conversion: ", extra={"results": str(results)[:20]})
            yield gr.update(interactive=True), f"Got Results from files conversion: [{str(results)[:20]}]", {"process": "Processing files ..."}, f"dummy_log.log"
            progress((11, 16), desc=f"Got Results from files conversion")
            time.sleep(0.25)
        except Exception as exc:
            # Fix: was `tbp = traceback.print_exc()`, which returns None and
            # put the string "None" into the user-facing error payload.
            tb = traceback.format_exc()
            logger.exception("Error during pooling file conversion", exc_info=True)
            yield gr.update(interactive=True), f"An error occurred during pool.map: {str(exc)}", {"Error": f"Error: {exc}\n{tb}"}, f"dummy_log.log"
            return

    except Exception as exc:
        tb = traceback.format_exc()
        logger.exception(f"✗ Error during Files processing → {exc}\n{tb}", exc_info=True)
        yield gr.update(interactive=True), f"✗ An error occurred during Files Processing → {exc}", {"Error": f"Error: {exc}"}, f"dummy_log.log"
        return

    # 3. Collect per-file logs and output paths
    try:
        logger.log(level=20, msg="ProcessPoolExecutor pool result:", extra={"results": str(results)})
        progress((12, 16), desc="Processing results from files conversion")
        time.sleep(0.25)

        # logs entries are expected as {"file", "images", "filepath", "image_path"}
        # dicts, or error strings from failed conversions.
        logs = list(results)
        logs_files_images = []
        logs_count = 0
        for i, log in enumerate(logs):
            # Markdown path first, then each image path, for the downloads widget.
            logs_files_images.append(log.get("filepath") if is_dict(log) or is_list_of_dicts(logs) else "Error or no file_path")
            logs_files_images.extend(list(image for image in log.get("image_path", "Error or no image_path")))
            i_image_count = log.get("images", 0)
            progress1(0.7, desc=f"Processing result log {i}: {str(log)}")
            logs_count = i + i_image_count
    except Exception as exc:
        # Fix: `tb` was previously undefined in this handler (only print_exc()
        # had been called), so the error path itself raised NameError.
        tb = traceback.format_exc()
        logger.exception(f"Error during processing results logs → {exc}\n{tb}", exc_info=True)
        yield gr.update(interactive=True), f"An error occurred during processing results logs: {str(exc)}\n{tb}", {"Error": f"Error: {exc}"}, f"dummy_log.log"
        return

    # 4. Zip processed files and images; the archive goes first in the list
    try:
        progress((13, 16), desc="Zipping processed files and images")
        time.sleep(0.25)
        zipped_processed_files = zip_processed_files(root_dir=f"{output_dir}", file_paths=logs_files_images, tz_hours=tz_hours, date_format='%d%b%Y_%H-%M-%S')
        logs_files_images.insert(0, zipped_processed_files)

        progress((14, 16), desc="Zipped processed files and images")
        time.sleep(0.25)
    except Exception as exc:
        tb = traceback.format_exc()
        logger.exception(f"✗ Error during zipping processed files → {exc}\n{tb}", exc_info=True)
        yield gr.update(interactive=True), f"✗ An error occurred during zipping files → {exc}\n{tb}", {"Error": f"Error: {exc}"}, f"dummy_log.log"
        return

    # 5. Format and emit the final results
    try:
        progress((15, 16), desc="Formatting processed log results")
        time.sleep(0.25)

        # gr.JSON renders the structure directly — no json.dumps needed.
        logs_return_formatted_json_string = process_dicts_data(logs)
        # gr.Files needs plain strings, not Path objects.
        logs_files_images_return = list(str(path) if isinstance(path, Path) else path for path in logs_files_images)
        logger.log(level=20, msg="File conversion complete. Sending outcome to Gradio:", extra={"logs_files_image_return": str(logs_files_images_return)})

        progress((16, 16), desc="Complete processing and formatting file processing results")
        time.sleep(0.25)

        # outputs=[process_button, log_output, files_individual_JSON, files_individual_downloads]
        yield gr.update(interactive=True), gr.update(value=logs_return_formatted_json_string), gr.update(value=logs_return_formatted_json_string, visible=True), gr.update(value=logs_files_images_return, visible=True)
        return
    except Exception as exc:
        tb = traceback.format_exc()
        logger.exception(f"✗ Error during returning result logs → {exc}\n{tb}", exc_info=True)
        yield gr.update(interactive=True), f"✗ An error occurred during returning result logs→ {exc}\n{tb}", {"Error": f"Error: {exc}"}, f"dummy_log.log"
        return
405
+
406
+ #return "\n".join(log for log in logs), "\n".join(str(path) for path in logs_files_images)
407
+ #print(f'logs_files_images: {"\n".join(str(path) for path in logs_files_images)}')
408
+
409
+ ## SMY: to be implemented/refactored AND moved to logic file
410
+ '''
411
+ def convert_md_to_pdf(file: gr.File | None, folder: str | None) -> list[gr.File]:
412
+ """
413
+ Gradio callback for Markdown → PDF.
414
+ Returns a list of generated PDF files (as Gradio File objects).
415
+ """
416
+ if not file and not folder:
417
+ return []
418
+
419
+ md_paths = []
420
+
421
+ # Single file
422
+ if file:
423
+ md_path = Path(file.name)
424
+ md_paths.append(md_path)
425
+
426
+ # Folder
427
+ if folder:
428
+ try:
429
+ md_paths.extend(collect_markdown_paths(folder))
430
+ except Exception as exc:
431
+ logger.exception("Folder traversal failed.")
432
+ return []
433
+
434
+ if not md_paths:
435
+ return []
436
+
437
+ output_dir = Path("./generated_pdfs")
438
+ output_dir.mkdir(exist_ok=True)
439
+
440
+ pdf_files = md2pdf_converter.batch_convert(md_paths, output_dir)
441
+ # Convert to Gradio File objects
442
+ gr_files = [gr.File(path=str(p)) for p in pdf_files]
443
+ return gr_files
444
+ '''
445
+
446
+
447
+ ##====================
448
+ #Gradio interface moved to gradio_ui.py
449
+ #def build_interface() -> gr.Blocks:
450
+ # """
451
+ # Assemble the Gradio Blocks UI.
452
+ # """
453
+
454
if __name__ == "__main__":
    # Fix: the original guard compared against the literal string "__name__",
    # which can never equal the module's __name__, so the branch was dead.
    # NOTE(review): convert_batch() requires many arguments and is driven from
    # the Gradio UI; calling it bare here would raise TypeError, so this module
    # intentionally does nothing when executed directly.
    pass
ui/gradio_ui.py CHANGED
@@ -1,497 +1,20 @@
1
  # ui/gradio_ui.py
2
- from ast import Interactive
3
- import gradio as gr
4
- from concurrent.futures import ProcessPoolExecutor, as_completed
5
- import tqdm
6
- import asyncio ##future
7
- import time
8
-
9
- from pathlib import Path, WindowsPath
10
- from typing import Optional, Union #, Dict, List, Any, Tuple
11
 
12
- from huggingface_hub import get_token
13
- import spaces ##HuggingFace spaces to accelerate GPU support on HF Spaces
14
-
15
- #import file_handler
16
- from file_handler import file_utils
17
- import file_handler.file_utils
18
- from utils.config import TITLE, DESCRIPTION, DESCRIPTION_PDF_HTML, DESCRIPTION_PDF, DESCRIPTION_HTML, DESCRIPTION_MD, file_types_list, file_types_tuple
19
- from utils.utils import is_dict, is_list_of_dicts
20
- from file_handler.file_utils import zip_processed_files, process_dicts_data, collect_pdf_paths, collect_html_paths, collect_markdown_paths, create_outputdir ## should move to handling file
21
- from file_handler.file_utils import find_file
22
- from utils.get_config import get_config_value
23
 
24
  from llm.provider_validator import is_valid_provider, suggest_providers
25
- from llm.llm_login import get_login_token, is_loggedin_huggingface, login_huggingface
26
  from converters.extraction_converter import DocumentConverter as docconverter #DocumentExtractor #as docextractor
27
- from converters.pdf_to_md import PdfToMarkdownConverter, init_worker
28
- #from converters.md_to_pdf import MarkdownToPdfConverter ##SMY: PENDING: implementation
 
29
 
30
  import traceback ## Extract, format and print information about Python stack traces.
31
  from utils.logger import get_logger
32
 
33
  logger = get_logger(__name__) ##NB: setup_logging() ## set logging
34
 
35
- # Instantiate converters class once – they are stateless
36
- pdf2md_converter = PdfToMarkdownConverter()
37
- #md2pdf_converter = MarkdownToPdfConverter()
38
-
39
-
40
- # User eXperience: Load Marker models ahead of time if not already loaded in reload mode
41
- ## SMY: 29Sept2025 - Came across https://github.com/xiaoyao9184/docker-marker/tree/master/gradio
42
- from converters.extraction_converter import load_models
43
- from globals import config_load_models
44
- try:
45
- if not config_load_models.model_dict:
46
- model_dict = load_models()
47
- config_load_models.model_dict = model_dict
48
- '''if 'model_dict' not in globals():
49
- global model_dict
50
- model_dict = load_models()'''
51
- logger.log(level=30, msg="Config_load_model: ", extra={"model_dict": str(model_dict)})
52
- except Exception as exc:
53
- #tb = traceback.format_exc() #exc.__traceback__
54
- logger.exception(f"✗ Error loading models (reload): {exc}") #\n{tb}")
55
- raise RuntimeError(f"✗ Error loading models (reload): {exc}") #\n{tb}")
56
-
57
- #def get_login_token( api_token_arg, oauth_token: gr.OAuthToken | None=None,): ##moved to llm_login
58
-
59
- # pool executor to convert files called by Gradio
60
- ##SMY: TODO: future: refactor to gradio_process.py and
61
- ## pull options to cli-options{"output_format":, "output_dir_string":, "use_llm":, "page_range":, "force_ocr":, "debug":, "strip_existing_ocr":, "disable_ocr_math""}
62
- #@spaces.GPU
63
- def convert_batch(
64
- pdf_files, #: list[str],
65
- pdf_files_count: int,
66
- provider: str,
67
- model_id: str,
68
- #base_url: str
69
- hf_provider: str,
70
- endpoint: str,
71
- backend_choice: str,
72
- system_message: str,
73
- max_tokens: int,
74
- temperature: float,
75
- top_p: float,
76
- stream: bool,
77
- api_token_gr: str,
78
- #max_workers: int,
79
- #max_retries: int,
80
- openai_base_url: str = "https://router.huggingface.co/v1",
81
- openai_image_format: Optional[str] = "webp",
82
- max_workers: Optional[int] = 4,
83
- max_retries: Optional[int] = 2,
84
- output_format: str = "markdown",
85
- #output_dir: Optional[Union[str, Path]] = "output_dir",
86
- output_dir_string: str = "output_dir_default",
87
- use_llm: bool = False, #Optional[bool] = False, #True,
88
- force_ocr: bool = True, #Optional[bool] = False,
89
- page_range: str = None, #Optional[str] = None,
90
- weasyprint_dll_directories: str = None,
91
- tz_hours: str = None,
92
- oauth_token: gr.OAuthToken | None=None,
93
- progress: gr.Progress = gr.Progress(track_tqdm=True), #Progress tracker to keep tab on pool queue executor
94
- progress1: gr.Progress = gr.Progress(),
95
- #progress2: gr.Progress = gr.Progress(track_tqdm=True),
96
- ): #-> str:
97
- """
98
- Handles the conversion process using multiprocessing.
99
- Spins up a pool and converts all uploaded files in parallel.
100
- Aggregates per-file logs into one string.
101
- Receives Gradio component values, starting with the list of uploaded file paths
102
- """
103
-
104
- # login: Update the Gradio UI to improve user-friendly eXperience - commencing
105
- # [template]: #outputs=[process_button, log_output, files_individual_JSON, files_individual_downloads],
106
- yield gr.update(interactive=False), f"Commencing Processing ... Getting login", {"process": "Commencing Processing"}, f"dummy_log.log"
107
- progress((0,16), f"Commencing Processing ...")
108
- time.sleep(0.25)
109
-
110
- # get token from logged-in user:
111
- api_token = get_login_token(api_token_arg=api_token_gr, oauth_token=oauth_token)
112
- ##SMY: Strictly debug. Must not be live
113
- #logger.log(level=30, msg="Commencing: get_login_token", extra={"api_token": api_token, "api_token_gr": api_token_gr})
114
-
115
- '''try:
116
- ##SMY: might deprecate. To replace with oauth login from Gradio ui or integrate cleanly.
117
- #login_huggingface(api_token) ## attempt login if not already logged in. NB: HF CLI login prompt would not display in Process Worker.
118
-
119
- if is_loggedin_huggingface() and (api_token is None or api_token == ""):
120
- api_token = get_token() ##SMY: might be redundant
121
-
122
- elif is_loggedin_huggingface() is False and api_token:
123
- login_huggingface(api_token)
124
- # login: Update the Gradio UI to improve user-friendly eXperience
125
- #yield gr.update(interactive=False), f"login to HF: Processing files...", {"process": "Processing files"}, f"dummy_log.log"
126
- else:
127
- pass
128
- # login: Update the Gradio UI to improve user-friendly eXperience
129
- #yield gr.update(interactive=False), f"Not logged in to HF: Processing files...", {"process": "Processing files"}, f"dummy_log.log"
130
-
131
- except Exception as exc: # Catch all exceptions
132
- tb = traceback.format_exc()
133
- logger.exception(f"✗ Error during login_huggingface → {exc}\n{tb}", exc_info=True) # Log the full traceback
134
- return [gr.update(interactive=True), f"✗ An error occurred during login_huggingface → {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"dummy_log.log"] # return the exception message
135
- '''
136
- progress((1,16), desc=f"Log in: {is_loggedin_huggingface(api_token)}")
137
- time.sleep(0.25)
138
- ## debug
139
- #logger.log(level=30, msg="pdf_files_inputs", extra={"input_arg[0]:": pdf_files[0]})
140
-
141
- #if not files:
142
- if not pdf_files or pdf_files is None: ## Check if files is None. This handles the case where no files are uploaded.
143
- logger.log(level=30, msg="Initialising ProcessPool: No files uploaded.", extra={"pdf_files": pdf_files, "files_len": pdf_files_count})
144
- #outputs=[log_output, files_individual_JSON, files_individual_downloads],
145
- return [gr.update(interactive=True), "Initialising ProcessPool: No files uploaded.", {"Upload":"No files uploaded"}, f"dummy_log.log"]
146
-
147
- progress((2,16), desc=f"Getting configuration values")
148
- time.sleep(0.25)
149
- # Get config values if not provided
150
- #config_file = find_file("config.ini") ##from file_handler.file_utils ##takes a bit of time to process. #NeedOptimise
151
-
152
- config_file = Path("utils") / "config.ini" ##SMY: speed up sacrificing flexibility
153
- model_id = model_id if model_id else get_config_value(config_file, "MARKER_CAP", "MODEL_ID")
154
- openai_base_url = openai_base_url if openai_base_url else get_config_value(config_file, "MARKER_CAP", "OPENAI_BASE_URL")
155
- openai_image_format = openai_image_format if openai_image_format else get_config_value(config_file, "MARKER_CAP", "OPENAI_IMAGE_FORMAT")
156
- max_workers = max_workers if max_workers else get_config_value(config_file, "MARKER_CAP", "MAX_WORKERS")
157
- max_retries = max_retries if max_retries else get_config_value(config_file, "MARKER_CAP", "MAX_RETRIES")
158
- output_format = output_format if output_format else get_config_value(config_file, "MARKER_CAP", "OUTPUT_FORMAT")
159
- output_dir_string = output_dir_string if output_dir_string else str(get_config_value(config_file, "MARKER_CAP", "OUTPUT_DIR"))
160
- use_llm = use_llm if use_llm else get_config_value(config_file, "MARKER_CAP", "USE_LLM")
161
- page_range = page_range if page_range else get_config_value(config_file,"MARKER_CAP", "PAGE_RANGE")
162
- weasyprint_dll_directories= weasyprint_dll_directories if weasyprint_dll_directories else None
163
- config_load_models.weasyprint_libpath = weasyprint_dll_directories ## Assign user's weasyprint path to Global var
164
- config_load_models.pdf_files_count = pdf_files_count
165
-
166
- progress((3,16), desc=f"Retrieved configuration values")
167
- time.sleep(0.25)
168
-
169
- # Create the initargs tuple from the Gradio inputs: # 'files' is an iterable, and handled separately.
170
- yield gr.update(interactive=False), f"Initialising init_args", {"process": "Processing files ..."}, f"dummy_log.log"
171
- progress((4,16), desc=f"Initialiasing init_args")
172
- time.sleep(0.25)
173
- init_args = (
174
- provider,
175
- model_id,
176
- #base_url,
177
- hf_provider,
178
- endpoint,
179
- backend_choice,
180
- system_message,
181
- max_tokens,
182
- temperature,
183
- top_p,
184
- stream,
185
- api_token,
186
- openai_base_url,
187
- openai_image_format,
188
- max_workers,
189
- max_retries,
190
- output_format,
191
- output_dir_string,
192
- use_llm,
193
- force_ocr,
194
- page_range,
195
- #progress,
196
- )
197
-
198
- # create output_dir
199
- try:
200
- yield gr.update(interactive=False), f"Creating output_dir ...", {"process": "Processing files ..."}, f"dummy_log.log"
201
- progress((5,16), desc=f"ProcessPoolExecutor: Creating output_dir")
202
- time.sleep(0.25)
203
-
204
- #pdf2md_converter.output_dir_string = output_dir_string ##SMY: attempt setting directly to resolve pool.map iterable
205
-
206
- # Create Marker output_dir in temporary directory where Gradio can access it.
207
- output_dir = file_utils.create_temp_folder(output_dir_string)
208
- pdf2md_converter.output_dir = output_dir
209
-
210
- logger.info(f"✓ output_dir created: ", extra={"output_dir": pdf2md_converter.output_dir.name, "in": str(pdf2md_converter.output_dir.parent)})
211
- yield gr.update(interactive=False), f"Created output_dir ...", {"process": "Processing files ..."}, f"dummy_log.log"
212
- progress((6,16), desc=f"✓ Created output_dir.")
213
- time.sleep(0.25)
214
- except Exception as exc:
215
- tb = traceback.format_exc()
216
- tbp = traceback.print_exc() # Print the exception traceback
217
- logger.exception("✗ error creating output_dir → {exc}\n{tb}", exc_info=True) # Log the full traceback
218
-
219
- # Update the Gradio UI to improve user-friendly eXperience
220
- yield gr.update(interactive=True), f"✗ An error occurred creating output_dir: {str(exc)}", {"Error":f"Error: {exc}"}, f"dummy_log.log" ## return the exception message
221
- return f"An error occurred creating output_dir: {str(exc)}", f"Error: {exc}", f"Error: {exc}" ## return the exception message
222
-
223
- # Process file conversion leveraging ProcessPoolExecutor for efficiency
224
- try:
225
- results = [] ## initialised pool result holder
226
- logger.log(level=30, msg="Initialising ProcessPoolExecutor: pool:", extra={"pdf_files": pdf_files, "files_len": len(pdf_files), "model_id": model_id, "output_dir": output_dir_string}) #pdf_files_count
227
- yield gr.update(interactive=False), f"Initialising ProcessPoolExecutor: Processing Files ...", {"process": "Processing files ..."}, f"dummy_log.log"
228
- progress((7,16), desc=f"Initialising ProcessPoolExecutor: Processing Files ...")
229
- time.sleep(0.25)
230
-
231
- # Create a pool with init_worker initialiser
232
- with ProcessPoolExecutor(
233
- max_workers=max_workers,
234
- initializer=init_worker,
235
- initargs=init_args
236
- ) as pool:
237
- logger.log(level=30, msg="Initialising ProcessPoolExecutor: pool:", extra={"pdf_files": pdf_files, "files_len": len(pdf_files), "model_id": model_id, "output_dir": output_dir_string}) #pdf_files_count
238
- progress((8,16), desc=f"Starting ProcessPool queue: Processing Files ...")
239
- time.sleep(0.25)
240
-
241
- # Map the files (pdf_files) to the conversion function (pdf2md_converter.convert_file)
242
- # The 'docconverter' argument is implicitly handled by the initialiser
243
- #futures = [pool.map(pdf2md_converter.convert_files, f) for f in pdf_files]
244
- #logs = [f.result() for f in as_completed(futures)]
245
- #futures = [pool.submit(pdf2md_converter.convert_files, file) for file in pdf_files]
246
- #logs = [f.result() for f in futures]
247
- try:
248
- #yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion ...", {"process": "Processing files ..."}, f"dummy_log.log"
249
- progress((9,16), desc=f"ProcessPoolExecutor: Pooling file conversion ...")
250
- time.sleep(0.25)
251
- yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion ...", {"process": "Processing files ..."}, f"dummy_log.log"
252
-
253
- '''# Use progress.tqdm to integrate with the executor map
254
- #results = pool.map(pdf2md_converter.convert_files, pdf_files) ##SMY iterables #max_retries #output_dir_string)
255
- for result_interim in progress.tqdm(
256
- iterable=pool.map(pdf2md_converter.convert_files, pdf_files), #, max_retries), total=len(pdf_files)
257
- desc="ProcessPoolExecutor: Pooling file conversion ..."):
258
- results.append(result_interim)
259
-
260
- # Update the Gradio UI to improve user-friendly eXperience
261
- #yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]", {"process": "Processing files ..."}, f"dummy_log.log"
262
- #progress((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
263
- #progress2((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
264
- #time.sleep(0.25)'''
265
- #duration = 5.75 * pdf_files_count if pdf_files_count>=2 else 7
266
- #@spaces.GPU(duration=duration) ## HF Spaces GPU support
267
- def get_results_pool_map(pdf_files, pdf_files_count, progress2=gr.Progress()):
268
- #Use progress.tqdm to integrate with the executor map
269
- #results = pool.map(pdf2md_converter.convert_files, pdf_files) ##SMY iterables #max_retries #output_dir_string)
270
- for result_interim in progress2.tqdm(
271
- iterable=pool.map(pdf2md_converter.convert_files, pdf_files), #, max_retries), total=len(pdf_files)
272
- desc=f"ProcessPoolExecutor: Pooling file conversion ... pool.map",
273
- total=pdf_files_count):
274
- results.append(result_interim)
275
-
276
- # Update the Gradio UI to improve user-friendly eXperience
277
- #yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]", {"process": "Processing files ..."}, f"dummy_log.log"
278
- progress2((0,len(pdf_files)), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
279
- #progress2((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
280
- time.sleep(0.75) #.sleep(0.25)
281
-
282
- return results
283
- results = get_results_pool_map(pdf_files, pdf_files_count)
284
- yield gr.update(interactive=True), f"ProcessPoolExecutor: Got Results from files conversion: [{str(results)}[:20]]", {"process": "Processing files ..."}, f"dummy_log.log"
285
- progress((11,16), desc=f"ProcessPoolExecutor: Got Results from files conversion")
286
- time.sleep(0.25)
287
- except Exception as exc:
288
- # Raise the exception to stop the Gradio app: exception to halt execution
289
- logger.exception("Error during pooling file conversion", exc_info=True) # Log the full traceback
290
- tbp = traceback.print_exc() # Print the exception traceback
291
- # Update the Gradio UI to improve user-friendly eXperience
292
- yield gr.update(interactive=True), f"An error occurred during pool.map: {str(exc)}", {"Error":f"Error: {exc}\n{tbp}"}, f"dummy_log.log" ## return the exception message
293
- return [gr.update(interactive=True), f"An error occurred during pool.map: {str(exc)}", {"Error":f"Error: {exc}\n{tbp}"}, f"dummy_log.log"] ## return the exception message
294
-
295
- # Process file conversion results
296
- try:
297
- logger.log(level=20, msg="ProcessPoolExecutor pool result:", extra={"results": str(results)})
298
- progress((12,16), desc="Processing results from files conversion") ##rekickin
299
- time.sleep(0.25)
300
-
301
- logs = []
302
- logs_files_images = []
303
-
304
- #logs.extend(results) ## performant pythonic
305
- #logs = list[results] ##
306
- logs = [result for result in results] ## pythonic list comprehension
307
- # [template] ## logs : [file , images , filepath, image_path]
308
-
309
- #logs_files_images = logs_files.extend(logs_images) #zip(logs_files, logs_images) ##SMY: in progress
310
- logs_count = 0
311
- #for log in logs:
312
- for i, log in enumerate(logs):
313
- logs_files_images.append(log.get("filepath") if is_dict(log) or is_list_of_dicts(logs) else "Error or no file_path") # isinstance(log, (dict, str))
314
- logs_files_images.extend(list(image for image in log.get("image_path", "Error or no image_path")))
315
- i_image_count = log.get("images", 0)
316
- # Update the Gradio UI to improve user-friendly eXperience
317
- #yield gr.update(interactive=False), f"Processing files: {logs_files_images[logs_count]}", {"process": "Processing files"}, f"dummy_log.log"
318
- progress1(0.7, desc=f"Processing result log {i}: {str(log)}")
319
- logs_count = i+i_image_count
320
- except Exception as exc:
321
- tbp = traceback.print_exc() # Print the exception traceback
322
- logger.exception("Error during processing results logs → {exc}\n{tbp}", exc_info=True) # Log the full traceback
323
- return [gr.update(interactive=True), f"An error occurred during processing results logs: {str(exc)}\n{tb}", {"Error":f"Error: {exc}"}, f"dummy_log.log"] ## return the exception message
324
- #yield gr.update(interactive=True), f"An error occurred during processing results logs: {str(exc)}\n{tb}", {"Error":f"Error: {exc}"}, f"dummy_log.log" ## return the exception message
325
- except Exception as exc:
326
- tb = traceback.format_exc()
327
- logger.exception(f"✗ Error during ProcessPoolExecutor → {exc}\n{tb}" , exc_info=True) # Log the full traceback
328
- #traceback.print_exc() # Print the exception traceback
329
- yield gr.update(interactive=True), f"✗ An error occurred during ProcessPoolExecutor→ {exc}", {"Error":f"Error: {exc}"}, f"dummy_log.log" # return the exception message
330
-
331
- # Zip Processed Files and images. Insert to first index
332
- try: ##from file_handler.file_utils
333
- progress((13,16), desc="Zipping processed files and images")
334
- time.sleep(0.25)
335
- zipped_processed_files = zip_processed_files(root_dir=f"{output_dir}", file_paths=logs_files_images, tz_hours=tz_hours, date_format='%d%b%Y_%H-%M-%S') #date_format='%d%b%Y'
336
- logs_files_images.insert(0, zipped_processed_files)
337
-
338
-
339
- #yield gr.update(interactive=False), f"Processing zip and files: {logs_files_images}", {"process": "Processing files"}, f"dummy_log.log"
340
- progress((14,16), desc="Zipped processed files and images")
341
- time.sleep(0.25)
342
-
343
- except Exception as exc:
344
- tb = traceback.format_exc()
345
- logger.exception(f"✗ Error during zipping processed files → {exc}\n{tb}" , exc_info=True) # Log the full traceback
346
- #traceback.print_exc() # Print the exception traceback
347
- yield gr.update(interactive=True), f"✗ An error occurred during zipping files → {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"dummy_log.log" # return the exception message
348
- return gr.update(interactive=True), f"✗ An error occurred during zipping files → {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"dummy_log.log" # return the exception message
349
-
350
-
351
- # Return processed files log
352
- try:
353
- progress((15,16), desc="Formatting processed log results")
354
- time.sleep(0.25)
355
-
356
- ## # Convert logs list of dicts to formatted json string
357
- logs_return_formatted_json_string = file_handler.file_utils.process_dicts_data(logs) #"\n".join(log for log in logs) ##SMY outputs to gr.JSON component with no need for json.dumps(data, indent=)
358
- #logs_files_images_return = "\n".join(path for path in logs_files_images) ##TypeError: sequence item 0: expected str instance, WindowsPath found
359
-
360
- ## # Convert any Path objects to strings, but leave strings as-is
361
- logs_files_images_return = list(str(path) if isinstance(path, Path) else path for path in logs_files_images)
362
- logger.log(level=20, msg="File conversion complete. Sending outcome to Gradio:", extra={"logs_files_image_return": str(logs_files_images_return)}) ## debug: FileNotFoundError: [WinError 2] The system cannot find the file specified: 'Error or no image_path'
363
-
364
- progress((16,16), desc="Complete processing and formatting file processing results")
365
- time.sleep(0.25)
366
- # [templates]
367
- #outputs=[process_button, log_output, files_individual_JSON, files_individual_downloads],
368
- #return "\n".join(logs), "\n".join(logs_files_images) #"\n".join(logs_files)
369
-
370
- yield gr.update(interactive=True), gr.update(value=logs_return_formatted_json_string), gr.update(value=logs_return_formatted_json_string, visible=True), gr.update(value=logs_files_images_return, visible=True) ##SMY: redundant
371
- return [gr.update(interactive=True), gr.update(value=logs_return_formatted_json_string), gr.update(value=logs_return_formatted_json_string, visible=True), gr.update(value=logs_files_images_return, visible=True)]
372
- #yield gr.update(interactive=True), logs_return_formatted_json_string, logs_return_formatted_json_string, logs_files_images_return
373
- #return [gr.update(interactive=True), logs_return_formatted_json_string, logs_return_formatted_json_string, logs_files_images_return]
374
-
375
- except Exception as exc:
376
- tb = traceback.format_exc()
377
- logger.exception(f"✗ Error during returning result logs → {exc}\n{tb}" , exc_info=True) # Log the full traceback
378
- #traceback.print_exc() # Print the exception traceback
379
- yield gr.update(interactive=True), f"✗ An error occurred during returning result logs→ {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"dummy_log.log" # return the exception message
380
- return [gr.update(interactive=True), f"✗ An error occurred during returning result logs→ {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"dummy_log.log"] # return the exception message
381
-
382
- #return "\n".join(log for log in logs), "\n".join(str(path) for path in logs_files_images)
383
- #print(f'logs_files_images: {"\n".join(str(path) for path in logs_files_images)}')
384
-
385
- # files wrapping into list ##SMY: Flagged for deprecation
386
- def pdf_files_wrap(files: list[str]):
387
- # explicitly wrap file object in a list
388
- return [files] if not isinstance(files, list) else files
389
- #return [files]
390
-
391
- ##====================
392
- ## SMY: moved to logic file: See pdf_to_md.py. Currently unused
393
- def convert_pdfs_to_md(file: gr.File | None, folder: str | None) -> dict:
394
- """
395
- Gradio callback for PDF → Markdown.
396
- Accepts either a single file or a folder path (recursively).
397
- Leverages Marker, a pipeline of deep learning models, for conversion
398
- Returns a dictionary of filename → Markdown string.
399
- """
400
- if not file and not folder:
401
- return {"error": "Please provide a PDF file or a folder."}
402
-
403
- pdf_paths = []
404
-
405
- # Single file
406
- if file:
407
- pdf_path = Path(file.name)
408
- pdf_paths.append(pdf_path)
409
-
410
- # Folder (recursively)
411
- if folder:
412
- try:
413
- pdf_paths.extend(collect_pdf_paths(folder))
414
- except Exception as exc:
415
- logger.exception("Folder traversal failed.")
416
- return {"error": str(exc)}
417
-
418
- if not pdf_paths:
419
- return {"error": "No PDF files found."}
420
-
421
- results = pdf2md_converter.batch_convert(pdf_paths)
422
- # Gradio expects a dict of {filename: content}
423
- return results
424
-
425
- ## SMY: to be implemented AND to refactor and moved to logic file
426
- def convert_md_to_pdf(file: gr.File | None, folder: str | None) -> list[gr.File]:
427
- """
428
- Gradio callback for Markdown → PDF.
429
- Returns a list of generated PDF files (as Gradio File objects).
430
- """
431
- if not file and not folder:
432
- return []
433
-
434
- md_paths = []
435
-
436
- # Single file
437
- if file:
438
- md_path = Path(file.name)
439
- md_paths.append(md_path)
440
-
441
- # Folder
442
- if folder:
443
- try:
444
- md_paths.extend(collect_markdown_paths(folder))
445
- except Exception as exc:
446
- logger.exception("Folder traversal failed.")
447
- return []
448
-
449
- if not md_paths:
450
- return []
451
-
452
- output_dir = Path("./generated_pdfs")
453
- output_dir.mkdir(exist_ok=True)
454
-
455
- pdf_files = md2pdf_converter.batch_convert(md_paths, output_dir)
456
- # Convert to Gradio File objects
457
- gr_files = [gr.File(path=str(p)) for p in pdf_files]
458
- return gr_files
459
-
460
-
461
- ## SMY: to refactor and moved to logic file. Currently unused
462
- '''
463
- def convert_htmls_to_md(file: gr.File | None, folder: str | None) -> dict:
464
- """
465
- Gradio callback for HTML → Markdown.
466
- Accepts either a single file or a folder path (recursively).
467
- Returns a dictionary of filename → Markdown string.
468
- """
469
- if not file and not folder:
470
- return {"error": "Please provide a HTML file or a folder."}
471
-
472
- html_paths = []
473
-
474
- # Single file
475
- if file:
476
- html_path = Path(file.name)
477
- html_paths.append(html_path)
478
-
479
- # Folder (recursively)
480
- if folder:
481
- try:
482
- html_paths.extend(collect_html_paths(folder))
483
- except Exception as exc:
484
- logger.exception("Folder traversal failed.")
485
- return {"error": str(exc)}
486
-
487
- if not html_paths:
488
- return {"error": "No HTML files found."}
489
-
490
- results = html2md_converter.batch_convert(html_paths)
491
- # Gradio expects a dict of {filename: content}
492
- return results
493
- '''
494
-
495
  ##====================
496
 
497
  def build_interface() -> gr.Blocks:
@@ -520,45 +43,8 @@ def build_interface() -> gr.Blocks:
520
  }
521
  """
522
 
523
- ##SMY: flagged; to move to file_handler.file_utils
524
- def is_file_with_extension(path_obj: Path) -> bool:
525
- """
526
- Checks if a pathlib.Path object is a file and has a non-empty extension.
527
- """
528
- path_obj = path_obj if isinstance(path_obj, Path) else Path(path_obj) if isinstance(path_obj, str) else None
529
- return path_obj.is_file() and bool(path_obj.suffix)
530
-
531
- ##SMY: flagged; to move to file_handler.file_utils
532
- def accumulate_files(uploaded_files, current_state):
533
- """
534
- Accumulates newly uploaded files with the existing state.
535
- """
536
- # Initialize state if it's the first run
537
- if current_state is None:
538
- current_state = []
539
-
540
- # If no files were uploaded in this interaction, return the current state unchanged
541
- if not uploaded_files:
542
- return current_state, f"No new files uploaded. Still tracking {len(current_state)} file(s)."
543
-
544
- # Get the temporary paths of the newly uploaded files
545
- # call is_file_with_extension to check if pathlib.Path object is a file and has a non-empty extension
546
- new_file_paths = [f.name for f in uploaded_files if is_file_with_extension(Path(f.name))] #Path(f.name) and Path(f.name).is_file() and bool(Path(f.name).suffix)] #Path(f.name).suffix.lower() !=""]
547
-
548
- # Concatenate the new files with the existing ones in the state
549
- updated_files = current_state + new_file_paths
550
- updated_filenames = [Path(f).name for f in updated_files]
551
-
552
- updated_files_count = len(updated_files)
553
-
554
- # Return the updated state and a message to the user
555
- #file_info = "\n".join(updated_files)
556
- filename_info = "\n".join(updated_filenames)
557
- #message = f"Accumulated {len(updated_files)} file(s) total.\n\nAll file paths:\n{file_info}"
558
- message = f"Accumulated {len(updated_files)} file(s) total: \n{filename_info}"
559
-
560
- return updated_files, updated_files_count, message, gr.update(interactive=True), gr.update(interactive=True)
561
-
562
  # with gr.Blocks(title=TITLE) as demo
563
  with gr.Blocks(title=TITLE, css=custom_css) as demo:
564
  gr.Markdown(f"## {DESCRIPTION}")
@@ -653,18 +139,12 @@ def build_interface() -> gr.Blocks:
653
  label="Output Format",
654
  value="markdown",
655
  )
656
- output_dir_tb = gr.Textbox(
657
- label="Output Directory",
658
- value="output_dir", #"output_md",
659
- lines=1,
660
- max_lines=1,
661
- )
662
  with gr.Row():
663
  max_workers_sl = gr.Slider(
664
  label="Max Worker",
665
  minimum=1,
666
- maximum=7,
667
- value=4,
668
  step=1
669
  )
670
  max_retries_sl = gr.Slider(
@@ -674,14 +154,34 @@ def build_interface() -> gr.Blocks:
674
  value=2,
675
  step=1 #0.01
676
  )
 
 
 
 
 
 
 
677
  with gr.Column():
 
 
 
 
678
  use_llm_cb = gr.Checkbox(
679
  label="Use LLM for Marker conversion",
680
  value=False
681
  )
682
  force_ocr_cb = gr.Checkbox(
683
- label="Force OCR on all pages",
684
- value=True,
 
 
 
 
 
 
 
 
 
685
  )
686
  with gr.Column():
687
  page_range_tb = gr.Textbox(
@@ -729,14 +229,14 @@ def build_interface() -> gr.Blocks:
729
  btn_pdf_convert = gr.Button("Convert PDF(s)")
730
  '''
731
 
732
- file_types_list.extend(file_types_tuple)
733
  with gr.Column(elem_classes=["file-or-directory-area"]):
734
  with gr.Row():
735
  file_btn = gr.UploadButton(
736
  #file_btn = gr.File(
737
  label="Upload Multiple Files",
738
  file_count="multiple",
739
- file_types= file_types_list, #["file"], ##config.file_types_list
740
  #height=25, #"sm",
741
  size="sm",
742
  elem_classes=["gradio-upload-btn"]
@@ -745,7 +245,8 @@ def build_interface() -> gr.Blocks:
745
  #dir_btn = gr.File(
746
  label="Upload a Directory",
747
  file_count="directory",
748
- file_types= file_types_list, #["file"], #Warning: The `file_types` parameter is ignored when `file_count` is 'directory'
 
749
  #height=25, #"0.5",
750
  size="sm",
751
  elem_classes=["gradio-upload-btn"]
@@ -851,7 +352,7 @@ def build_interface() -> gr.Blocks:
851
  uploaded_file_list = gr.State([]) ##NB: initial value of `gr.State` must be able to be deepcopied
852
  uploaded_files_count = gr.State(0) ## initial files count
853
 
854
- state_max_workers = gr.State(4) #max_workers_sl,
855
  state_max_retries = gr.State(2) #max_retries_sl,
856
  state_tz_hours = gr.State(value=None)
857
  state_api_token = gr.State(None)
@@ -953,10 +454,6 @@ def build_interface() -> gr.Blocks:
953
  yield [], msg, None, None
954
  return [], 0, f"Files list cleared.", None, None
955
 
956
- #hf_login_logout_btn.click(fn=custom_do_logout, inputs=None, outputs=hf_login_logout_btn)
957
- ##unused
958
- ###hf_login_logout_btn.click(fn=custom_do_logout, inputs=[hf_login_logout_btn, state_api_token], outputs=[hf_login_logout_btn, api_token_tb, logout_status_md, state_api_token])
959
- ###logout_btn.click(fn=do_logout, inputs=None, outputs=[api_token_tb, logout_status_md, hf_login_logout_btn, logout_btn])
960
  #logout_btn.click(fn=clear_state, inputs=None, outputs=[uploaded_file_list, output_textbox, log_output, api_token_tb])
961
  hf_login_logout_btn.click(fn=custom_do_logout, inputs=None, outputs=[hf_login_logout_btn, api_token_tb, logout_status_md]) #, state_api_token])
962
 
@@ -1009,21 +506,22 @@ def build_interface() -> gr.Blocks:
1009
  top_p_sl,
1010
  stream_cb,
1011
  api_token_tb, #state_api_token, #api_token_tb,
1012
- #gr.State(4), # max_workers
1013
- #gr.State(3), # max_retries
1014
  openai_base_url_tb,
1015
  openai_image_format_dd,
1016
- state_max_workers, #gr.State(4), #max_workers_sl,
1017
  state_max_retries, #gr.State(2), #max_retries_sl,
 
1018
  output_format_dd,
1019
  output_dir_tb,
1020
  use_llm_cb,
1021
  force_ocr_cb,
 
 
1022
  page_range_tb,
1023
  weasyprint_dll_directories_tb,
1024
  tz_hours_num, #state_tz_hours
1025
  ]
1026
-
1027
  ## debug
1028
  #logger.log(level=30, msg="About to execute btn_pdf_convert.click", extra={"files_len": pdf_files_count, "pdf_files": pdf_files})
1029
 
@@ -1097,22 +595,7 @@ def build_interface() -> gr.Blocks:
1097
  fn=get_file_count,
1098
  inputs=[files_upload_html],
1099
  outputs=[html_files_count, log_output]
1100
- )
1101
-
1102
- # Validate files upload on change; warn but allow continue
1103
- def on_pdf_files_change(pdf_files_value: list[str]):
1104
- # explicitly wrap file object in a list
1105
- pdf_files_value = pdf_files_wrap(pdf_files_value)
1106
- #if not isinstance(pdf_files_value, list):
1107
- # pdf_files_value = [pdf_files_value]
1108
-
1109
- pdf_files_path = [file.name for file in pdf_files_value]
1110
- pdf_files_len = len(pdf_files_value) #len(pdf_files_path)
1111
- if pdf_files_value:
1112
- #return
1113
- return pdf_files_path, pdf_files_len
1114
- #pdf_files.change(on_pdf_files_change, inputs=pdf_files, outputs=[log_output, pdf_files_count]) #, postprocess=False) ##debug
1115
-
1116
 
1117
  return demo
1118
 
 
1
  # ui/gradio_ui.py
 
 
 
 
 
 
 
 
 
2
 
3
+ import gradio as gr
4
+ from ui.gradio_process import convert_batch
5
+ from globals import config_load
 
 
 
 
 
 
 
 
6
 
7
  from llm.provider_validator import is_valid_provider, suggest_providers
 
8
  from converters.extraction_converter import DocumentConverter as docconverter #DocumentExtractor #as docextractor
9
+
10
+ from utils.config import TITLE, DESCRIPTION, DESCRIPTION_PDF_HTML, DESCRIPTION_PDF, DESCRIPTION_HTML, DESCRIPTION_MD
11
+ from utils.file_utils import accumulate_files, is_file_with_extension
12
 
13
  import traceback ## Extract, format and print information about Python stack traces.
14
  from utils.logger import get_logger
15
 
16
  logger = get_logger(__name__) ##NB: setup_logging() ## set logging
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  ##====================
19
 
20
  def build_interface() -> gr.Blocks:
 
43
  }
44
  """
45
 
46
+ ##SMY: flagged; to move to file_handler.file_utils #accumulate_files()
47
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  # with gr.Blocks(title=TITLE) as demo
49
  with gr.Blocks(title=TITLE, css=custom_css) as demo:
50
  gr.Markdown(f"## {DESCRIPTION}")
 
139
  label="Output Format",
140
  value="markdown",
141
  )
 
 
 
 
 
 
142
  with gr.Row():
143
  max_workers_sl = gr.Slider(
144
  label="Max Worker",
145
  minimum=1,
146
+ maximum=4,
147
+ value=1,
148
  step=1
149
  )
150
  max_retries_sl = gr.Slider(
 
154
  value=2,
155
  step=1 #0.01
156
  )
157
+ output_dir_tb = gr.Textbox(
158
+ label="Output Directory",
159
+ value="output_dir", #"output_md",
160
+ lines=1,
161
+ max_lines=1,
162
+ )
163
+ with gr.Row():
164
  with gr.Column():
165
+ debug_cb = gr.Checkbox(
166
+ label="Run in debug mode. Not recommended",
167
+ value=False, #True,
168
+ )
169
  use_llm_cb = gr.Checkbox(
170
  label="Use LLM for Marker conversion",
171
  value=False
172
  )
173
  force_ocr_cb = gr.Checkbox(
174
+ label="Force OCR on all pages. (Beware: extended processing time)",
175
+ value=False, #True,
176
+ )
177
+ #with gr.Column():
178
+ strip_existing_ocr_cb = gr.Checkbox(
179
+ label="strip embedded OCR text, re-run OCR",
180
+ value=False
181
+ )
182
+ disable_ocr_math_cb = gr.Checkbox(
183
+ label="OCR: disable math - no inline math",
184
+ value=False,
185
  )
186
  with gr.Column():
187
  page_range_tb = gr.Textbox(
 
229
  btn_pdf_convert = gr.Button("Convert PDF(s)")
230
  '''
231
 
232
+ config_load.file_types_list.extend(config_load.file_types_tuple) ##allowed file types in global
233
  with gr.Column(elem_classes=["file-or-directory-area"]):
234
  with gr.Row():
235
  file_btn = gr.UploadButton(
236
  #file_btn = gr.File(
237
  label="Upload Multiple Files",
238
  file_count="multiple",
239
+ file_types= config_load.file_types_list, #["file"], ##config.file_types_list
240
  #height=25, #"sm",
241
  size="sm",
242
  elem_classes=["gradio-upload-btn"]
 
245
  #dir_btn = gr.File(
246
  label="Upload a Directory",
247
  file_count="directory",
248
+ #file_types= config_load.file_types_list, #["file"], #Warning: The `file_types` parameter is ignored when `file_count` is 'directory'
249
+ ## [handled in accumulate_files] file_types - raised Error(gradio.exceptions.Error: "Invalid file type
250
  #height=25, #"0.5",
251
  size="sm",
252
  elem_classes=["gradio-upload-btn"]
 
352
  uploaded_file_list = gr.State([]) ##NB: initial value of `gr.State` must be able to be deepcopied
353
  uploaded_files_count = gr.State(0) ## initial files count
354
 
355
+ state_max_workers = gr.State(1) #max_workers_sl, #4
356
  state_max_retries = gr.State(2) #max_retries_sl,
357
  state_tz_hours = gr.State(value=None)
358
  state_api_token = gr.State(None)
 
454
  yield [], msg, None, None
455
  return [], 0, f"Files list cleared.", None, None
456
 
 
 
 
 
457
  #logout_btn.click(fn=clear_state, inputs=None, outputs=[uploaded_file_list, output_textbox, log_output, api_token_tb])
458
  hf_login_logout_btn.click(fn=custom_do_logout, inputs=None, outputs=[hf_login_logout_btn, api_token_tb, logout_status_md]) #, state_api_token])
459
 
 
506
  top_p_sl,
507
  stream_cb,
508
  api_token_tb, #state_api_token, #api_token_tb,
 
 
509
  openai_base_url_tb,
510
  openai_image_format_dd,
511
+ state_max_workers, #gr.State(1), #max_workers_sl,
512
  state_max_retries, #gr.State(2), #max_retries_sl,
513
+ debug_cb,
514
  output_format_dd,
515
  output_dir_tb,
516
  use_llm_cb,
517
  force_ocr_cb,
518
+ strip_existing_ocr_cb,
519
+ disable_ocr_math_cb,
520
  page_range_tb,
521
  weasyprint_dll_directories_tb,
522
  tz_hours_num, #state_tz_hours
523
  ]
524
+
525
  ## debug
526
  #logger.log(level=30, msg="About to execute btn_pdf_convert.click", extra={"files_len": pdf_files_count, "pdf_files": pdf_files})
527
 
 
595
  fn=get_file_count,
596
  inputs=[files_upload_html],
597
  outputs=[html_files_count, log_output]
598
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
599
 
600
  return demo
601
 
utils/config.py CHANGED
@@ -28,13 +28,10 @@ DESCRIPTION_MD = (
28
  "Upload Markdown/LaTeX files and generate a polished PDF."
29
  )
30
 
31
- # File types
32
- file_types_list = []
33
- file_types_tuple = (".pdf", ".html", ".docx", ".doc")
34
- #file_types_list = list[file_types_tuple]
35
- #file_types_list.extend(file_types_tuple)
36
-
37
 
 
 
 
38
  # Conversion defaults
39
  DEFAULT_MARKER_OPTIONS = {
40
  "include_images": True,
@@ -86,4 +83,5 @@ hf_client = None
86
  artifact_dict = None
87
  pdf_converter = None
88
  html_converter = None
 
89
 
 
28
  "Upload Markdown/LaTeX files and generate a polished PDF."
29
  )
30
 
 
 
 
 
 
 
31
 
32
+ ##SMY: See config.ini
33
+ ##===================
34
+ '''
35
  # Conversion defaults
36
  DEFAULT_MARKER_OPTIONS = {
37
  "include_images": True,
 
83
  artifact_dict = None
84
  pdf_converter = None
85
  html_converter = None
86
+ '''
87
 
{file_handler → utils}/file_utils.py RENAMED
@@ -252,7 +252,7 @@ def zip_processed_files(root_dir: str, file_paths: list[str], tz_hours=None, dat
252
  """
253
 
254
  import zipfile
255
- from file_handler import file_utils
256
  from utils import utils
257
 
258
  root_path = Path(root_dir)
@@ -373,6 +373,40 @@ def process_dicts_data(data:Union[dict, list[dict]]):
373
 
374
  return formatted_string
375
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
376
  ##NB: Python =>3.10, X | Y equiv to the type checker as Union[X, Y]
377
  def collect_pdf_html_paths(root: Union[str, Path]) -> List[Path]:
378
  """
@@ -425,6 +459,7 @@ def write_markdown(
425
  src_path: Union[str, Path],
426
  output_dir: Union[str, Path],
427
  rendered: Any,
 
428
  ) -> Path:
429
 
430
  """
@@ -468,7 +503,15 @@ def write_markdown(
468
  #out_dir = Path(output_dir)
469
  #out_dir.mkdir(parents=True, exist_ok=True)
470
 
471
- md_name = f"{src.stem}.md"
 
 
 
 
 
 
 
 
472
  if isinstance(output_dir, Path):
473
  md_path = output_dir / f"{src.stem}" / md_name
474
  else:
@@ -484,10 +527,12 @@ def write_markdown(
484
  md_path.parent.chmod(0)
485
 
486
  try:
487
- markdown_text = getattr(rendered, "markdown") ##SMY: get extracted markdown
 
488
  except AttributeError as exc: # pragma: no cover
489
  raise AttributeError(
490
- "Extractor Rendered object must have a 'markdown' attribute"
 
491
  ) from exc
492
 
493
  with md_path.open(mode="w", encoding="utf-8") as md_f:
@@ -562,58 +607,3 @@ def dump_images(
562
  return images_count, img_path_list ##SMY: return number of images and path
563
  #return images.items().count
564
  #return len(images)
565
-
566
- # Dummp Markdown extracted images ##SMY: Marked for deprecated
567
- '''
568
- def dump_images(
569
- src_path: Union[str, Path],
570
- output_dir: Union[str, Path],
571
- rendered: Any,
572
- ) -> int:
573
-
574
- """
575
- Dump the images of the Markdown representation of a source file to an output directory.
576
-
577
- Parameters
578
- ----------
579
- src_path : str | Path
580
- Path to the original source file. Only its base name is used for naming
581
- the resulting Markdown file.
582
- output_dir : str | Path
583
- Directory where the Markdown file will be written. It was created if it does not
584
- exist with create_outputdir().
585
- rendered : object
586
- Object that provides a ``markdown`` attribute containing the text to write.
587
-
588
- Returns
589
- -------
590
- Number of images dumped from the Markdown file.
591
- """
592
-
593
- try:
594
- images: Mapping[str, bytes] = getattr(rendered, "images")
595
- except TypeError as exc: # pragma: no cover
596
- raise AttributeError(
597
- "Extracted images from rendered.images must be a mapping of str -> bytes"
598
- ) from exc
599
-
600
- images_count = 0
601
- ##SMY: See marker.output.save_output() : https://github.com/datalab-to/marker/blob/master/marker/output.py
602
- #for img_name, img_bytes in images.items():
603
- for img_name, img in images.items():
604
- # Resolve the full path and make sure any sub‑directories exist.
605
- img_path = Path(output_dir) / src_path / img_name ##SMY: image files ##concatenate Path + str
606
- img_path.parent.mkdir(parents=True, exist_ok=True)
607
-
608
- #'' '
609
- #with img_path.open("wb") as fp:
610
- # fp.write(img_bytes) ##SMY: write images to markdown folder
611
- #images_count += 1
612
- #'' '
613
- img.save(img_path) ##SMY: save images (of type PIL.Image.Image) to markdown folder
614
- images_count += 1
615
-
616
- return images_count ##SMY: return number of images
617
- #return images.items().count
618
- #return len(images)
619
- '''
 
252
  """
253
 
254
  import zipfile
255
+ from utils import file_utils
256
  from utils import utils
257
 
258
  root_path = Path(root_dir)
 
373
 
374
  return formatted_string
375
 
376
+ def accumulate_files(uploaded_files, current_state):
377
+ """
378
+ Accumulates newly uploaded files with the existing state.
379
+ """
380
+
381
+ from globals import config_load
382
+ import gradio as gr
383
+ # Initialize state if it's the first run
384
+ if current_state is None:
385
+ current_state = []
386
+
387
+ # If no files were uploaded in this interaction, return the current state unchanged
388
+ if not uploaded_files:
389
+ return current_state, f"No new files uploaded. Still tracking {len(current_state)} file(s)."
390
+
391
+ # Get the temporary paths of the newly uploaded files
392
+ # call is_file_with_extension to check if pathlib.Path object is a file and has a non-empty extension
393
+ #new_file_paths = [f.name for f in uploaded_files if is_file_with_extension(Path(f.name))] #Path(f.name) and Path(f.name).is_file() and bool(Path(f.name).suffix)] #Path(f.name).suffix.lower() !=""]
394
+ new_file_paths = [f.name for f in uploaded_files if is_file_with_extension(Path(f.name)) and f.name.endswith(config_load.file_types_tuple)]
395
+
396
+ # Concatenate the new files with the existing ones in the state
397
+ updated_files = current_state + new_file_paths
398
+ updated_filenames = [Path(f).name for f in updated_files]
399
+
400
+ updated_files_count = len(updated_files)
401
+
402
+ # Return the updated state and a message to the user
403
+ #file_info = "\n".join(updated_files)
404
+ filename_info = "\n".join(updated_filenames)
405
+ #message = f"Accumulated {len(updated_files)} file(s) total.\n\nAll file paths:\n{file_info}"
406
+ message = f"Accumulated {len(updated_files)} file(s) total: \n{filename_info}"
407
+
408
+ return updated_files, updated_files_count, message, gr.update(interactive=True), gr.update(interactive=True)
409
+
410
  ##NB: Python =>3.10, X | Y equiv to the type checker as Union[X, Y]
411
  def collect_pdf_html_paths(root: Union[str, Path]) -> List[Path]:
412
  """
 
459
  src_path: Union[str, Path],
460
  output_dir: Union[str, Path],
461
  rendered: Any,
462
+ output_format: str,
463
  ) -> Path:
464
 
465
  """
 
503
  #out_dir = Path(output_dir)
504
  #out_dir.mkdir(parents=True, exist_ok=True)
505
 
506
+ #md_name = f"{src.stem}.md"
507
+ output_handler = {
508
+ "markdown": "md",
509
+ "json": "json",
510
+ "html": "html",
511
+ }
512
+ output_ext = output_handler.get(output_format, "md")
513
+
514
+ md_name = f"{src.stem}.{output_ext}"
515
  if isinstance(output_dir, Path):
516
  md_path = output_dir / f"{src.stem}" / md_name
517
  else:
 
527
  md_path.parent.chmod(0)
528
 
529
  try:
530
+ #markdown_text = getattr(rendered, "markdown") ##SMY: get extracted markdown
531
+ markdown_text = getattr(rendered, output_format)
532
  except AttributeError as exc: # pragma: no cover
533
  raise AttributeError(
534
+ #"Extractor Rendered object must have a 'markdown' attribute"
535
+ f"Extractor Rendered object must have a '{output_format}' attribute"
536
  ) from exc
537
 
538
  with md_path.open(mode="w", encoding="utf-8") as md_f:
 
607
  return images_count, img_path_list ##SMY: return number of images and path
608
  #return images.items().count
609
  #return len(images)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/get_config.py CHANGED
@@ -14,7 +14,7 @@ sys.path.insert(0, f"{grandparent_dir}") #\\file_handler")
14
  ##end debug
15
  #'''
16
  #import file_handler
17
- from file_handler.file_utils import find_file
18
 
19
  def get_config_value(config_file:Path, section_key:str, parameter:str, fallback:str=None) -> str: # configfile: Union[str, Path]="utils\\config.ini"):
20
  """ Load config file, locate section, read parameter and return value
 
14
  ##end debug
15
  #'''
16
  #import file_handler
17
+ from utils.file_utils import find_file
18
 
19
  def get_config_value(config_file:Path, section_key:str, parameter:str, fallback:str=None) -> str: # configfile: Union[str, Path]="utils\\config.ini"):
20
  """ Load config file, locate section, read parameter and return value
utils/logger.py CHANGED
@@ -72,7 +72,7 @@ def setup_logging(level: int = None, tz_hours=None, date_format:str="%d%b%Y") ->
72
  # File handler
73
  #file_handler = logging.FileHandler("logs/app_logging_scrap.log", mode="a", encoding="utf-8")
74
  #file_handler = logging.FileHandler("logs/app_logging.log", mode="a", encoding="utf-8")
75
- from file_handler.file_utils import check_create_logfile
76
  file_handler = logging.FileHandler(check_create_logfile(filename="app_logging.log", tz_hours=tz_hours, date_format=date_format), mode="a", encoding="utf-8")
77
  ## Getting filepermission error
78
 
 
72
  # File handler
73
  #file_handler = logging.FileHandler("logs/app_logging_scrap.log", mode="a", encoding="utf-8")
74
  #file_handler = logging.FileHandler("logs/app_logging.log", mode="a", encoding="utf-8")
75
+ from utils.file_utils import check_create_logfile
76
  file_handler = logging.FileHandler(check_create_logfile(filename="app_logging.log", tz_hours=tz_hours, date_format=date_format), mode="a", encoding="utf-8")
77
  ## Getting filepermission error
78