semmyk commited on
Commit
bfbdd1d
·
1 Parent(s): d82ee51

baseline08_beta0.2.0_29Sept25: fix oauth_token.token (convert_batch). - fixing models load. - update README, requirements

Browse files
README.md CHANGED
@@ -4,17 +4,37 @@ emoji: 📚
4
  colorFrom: yellow
5
  colorTo: purple
6
  sdk: gradio
 
 
7
  command: python main.py
8
  app_file: main.py
9
  hf_oauth: true
10
  oauth_scopes: [read-access]
11
- python_version: 3.12
12
  license: mit
13
  pinned: true
14
  short_description: PDF & HTML parser to markdown
15
- models: [meta-llama/Llama-4-Maverick-17B-128E-Instruct, openai/gpt-oss-120b, openai/gpt-oss-20b]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  tags: [markdown, PDF, parser, converter, extractor]
17
- preload_from_hub: [https://huggingface.co/datalab-to/surya_layout, https://huggingface.co/datalab-to/surya_tablerec, huggingface.co/datalab-to/line_detector0, https://huggingface.co/tarun-menta/ocr_error_detection/blob/main/config.json]
18
  owner: research-semmyk
19
  #---
20
  #
@@ -46,6 +66,14 @@ requires-python: ">=3.12"
46
  # - huggingface.co/datalab-to/line_detector0
47
  # - https://huggingface.co/tarun-menta/ocr_error_detection/blob/main/config.json
48
  #owner: research-semmyk
 
 
 
 
 
 
 
 
49
  ---
50
 
51
  # parserPDF
 
4
  colorFrom: yellow
5
  colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 5.44.1
8
+ python_version: 3.12
9
  command: python main.py
10
  app_file: main.py
11
  hf_oauth: true
12
  oauth_scopes: [read-access]
 
13
  license: mit
14
  pinned: true
15
  short_description: PDF & HTML parser to markdown
16
+ #models: [meta-llama/Llama-4-Maverick-17B-128E-Instruct, openai/gpt-oss-120b, openai/gpt-oss-20b, ]
17
+ models:
18
+ - meta-llama/Llama-4-Maverick-17B-128E-Instruct
19
+ - openai/gpt-oss-120b
+ - openai/gpt-oss-20b
20
+ - vikp/surya_det3
21
+ - vikp/surya_rec2
22
+ - vikp/surya_tablerec
23
+ - datalab-to/surya_layout
24
+ - datalab-to/surya_tablerec
25
+ - datalab-to/texify
26
+ - datalab-to/ocr_error_detection
27
+ - datalab-to/inline_math_det0
28
+ - datalab-to/line_detector0
29
+ - xiaoyao9184/surya_text_detection
30
+ - xiaoyao9184/surya_text_recognition
31
+ - xiaoyao9184/surya_table_recognition
32
+ - xiaoyao9184/surya_texify
33
+ - xiaoyao9184/surya_layout
34
+ - xiaoyao9184/surya_ocr_error_detection
35
+ - xiaoyao9184/surya_inline_math_detection
36
  tags: [markdown, PDF, parser, converter, extractor]
37
+ #preload_from_hub: [https://huggingface.co/datalab-to/surya_layout, https://huggingface.co/datalab-to/surya_tablerec, huggingface.co/datalab-to/line_detector0, https://huggingface.co/tarun-menta/ocr_error_detection/blob/main/config.json]
38
  owner: research-semmyk
39
  #---
40
  #
 
66
  # - huggingface.co/datalab-to/line_detector0
67
  # - https://huggingface.co/tarun-menta/ocr_error_detection/blob/main/config.json
68
  #owner: research-semmyk
69
+ ## Model list
70
+ #[
71
+ # "datalab/models/text_recognition/2025_08_29",
72
+ # "datalab/models/layout/2025_02_18",
73
+ # "datalab/models/table_recognition/2025_02_18",
74
+ # "datalab/models/text_detection/2025_05_07",
75
+ # "datalab/models/ocr_error_detection/2025_02_18",
76
+ #]
77
  ---
78
 
79
  # parserPDF
converters/extraction_converter.py CHANGED
@@ -21,6 +21,10 @@ from utils.logger import get_logger
21
 
22
  logger = get_logger(__name__)
23
 
 
 
 
 
24
  # Full document converter
25
  class DocumentConverter:
26
  """
@@ -43,7 +47,7 @@ class DocumentConverter:
43
  api_token: str,
44
  openai_base_url: str = "https://router.huggingface.co/v1",
45
  openai_image_format: Optional[str] = "webp",
46
- #max_workers: Optional[str] = 4,
47
  max_retries: Optional[int] = 2,
48
  output_format: str = "markdown",
49
  output_dir: Optional[Union[str, Path]] = "output_dir",
@@ -59,8 +63,9 @@ class DocumentConverter:
59
  self.top_p = top_p # self.client.top_p,
60
  self.llm_service = MarkerOpenAIService
61
  self.openai_image_format = openai_image_format #"png" #better compatibility
 
62
  self.max_retries = max_retries ## pass to __call__
63
- self.output_dir = output_dir
64
  self.use_llm = use_llm[0] if isinstance(use_llm, tuple) else use_llm, #False, #True,
65
  #self.page_range = page_range[0] if isinstance(page_range, tuple) else page_range ##SMY: iterating twice because self.page casting as hint type tuple!
66
  self.page_range = page_range if page_range else None
@@ -117,18 +122,30 @@ class DocumentConverter:
117
  logger.exception(f"✗ Error parsing/processing custom config_dict: {exc}\n{tb}")
118
  raise RuntimeError(f"✗ Error parsing/processing custom config_dict: {exc}\n{tb}") #.with_traceback(tb)
119
 
120
- # 3) Create the artifact dictionary and retrieve the LLM service.
121
  try:
122
- #self.artifact_dict: Dict[str, Any] = self.get_create_model_dict ##SMY: Might have to eliminate function afterall
123
- self.artifact_dict: Dict[str, Type[BaseModel]] = create_model_dict() ##SMY: BaseModel for Any??
124
- #logger.log(level=20, msg="✔️ Create artifact_dict and llm_service retrieved:", extra={"llm_service": self.llm_service})
 
125
 
126
  except Exception as exc:
127
  tb = traceback.format_exc() #exc.__traceback__
128
  logger.exception(f"✗ Error creating artifact_dict or retrieving LLM service: {exc}\n{tb}")
129
  raise RuntimeError(f"✗ Error creating artifact_dict or retrieving LLM service: {exc}\n{tb}") #.with_traceback(tb)
130
 
131
- # 4) Instantiate Marker's MarkerConverter (PdfConverter) with config managed by config_parser
 
 
 
 
 
 
 
 
 
 
 
132
  try:
133
  llm_service_str = str(self.llm_service).split("'")[1] ## SMY: split and slicing ##Gets the string value
134
 
@@ -136,13 +153,18 @@ class DocumentConverter:
136
  os.environ["OPENAI_API_KEY"] = api_token if api_token !='' or None else self.openai_api_key ## to handle Marker's assertion test on OpenAI
137
  logger.log(level=20, msg="self.converter: instantiating MarkerConverter:", extra={"llm_service_str": llm_service_str, "api_token": api_token}) ##debug
138
 
 
 
 
139
  #self.converter: MarkerConverter = MarkerConverter(
140
  self.converter = MarkerConverter(
141
- #artifact_dict=self.artifact_dict,
142
- artifact_dict=create_model_dict(),
143
- config=config_parser.generate_config_dict(),
 
 
144
  #llm_service=self.llm_service ##SMY expecting str but self.llm_service, is service object marker.services of type BaseServices
145
- llm_service=llm_service_str ##resolve
146
  )
147
 
148
  logger.log(level=20, msg="✔️ MarkerConverter instantiated successfully:", extra={"converter.config": str(self.converter.config.get("openai_base_url")), "use_llm":self.converter.use_llm})
@@ -160,7 +182,7 @@ class DocumentConverter:
160
  ## Enable higher quality processing with LLMs. ## See MarkerOpenAIService,
161
  #llm_service = llm_service.removeprefix("<class '").removesuffix("'>") # e.g <class 'marker.services.openai.OpenAIService'>
162
  llm_service = str(llm_service).split("'")[1] ## SMY: split and slicing
163
- self.use_llm = self.use_llm[0]
164
  self.page_range = self.page_range[0] if isinstance(self.page_range, tuple) else self.page_range #if isinstance(self.page_range, str) else None, ##SMY: passing as hint type tuple!
165
 
166
 
@@ -172,10 +194,11 @@ class DocumentConverter:
172
  "temperature" : self.temperature, #self.client.temperature,
173
  "top_p" : self.top_p, #self.client.top_p,
174
  "openai_image_format": self.openai_image_format, #"webp", #"png" #better compatibility
 
175
  "max_retries" : self.max_retries, #3, ## pass to __call__
176
  "output_dir" : self.output_dir,
177
  "use_llm" : self.use_llm, #False, #True,
178
- "page_range" : self.page_range, #]debug #len(pdf_file)
179
  }
180
  return config_dict
181
  except Exception as exc:
@@ -184,6 +207,10 @@ class DocumentConverter:
184
  raise RuntimeError(f"✗ Error configuring custom config_dict: {exc}\n{tb}") #").with_traceback(tb)
185
  #raise
186
 
 
 
 
 
187
  ##SMY: flagged for deprecation
188
  ##SMY: marker prefer default artifact dictionary (marker.models.create_model_dict) instead of overridding
189
  #def get_extraction_converter(self, chat_fn):
 
21
 
22
  logger = get_logger(__name__)
23
 
24
+ # create/load models. Called to curtail reloading models at each instance
25
+ def load_models():
26
+ return create_model_dict()
27
+
28
  # Full document converter
29
  class DocumentConverter:
30
  """
 
47
  api_token: str,
48
  openai_base_url: str = "https://router.huggingface.co/v1",
49
  openai_image_format: Optional[str] = "webp",
50
+ max_workers: Optional[str] =1, #4, for config_dict["pdftext_workers"]
51
  max_retries: Optional[int] = 2,
52
  output_format: str = "markdown",
53
  output_dir: Optional[Union[str, Path]] = "output_dir",
 
63
  self.top_p = top_p # self.client.top_p,
64
  self.llm_service = MarkerOpenAIService
65
  self.openai_image_format = openai_image_format #"png" #better compatibility
66
+ self.max_workers = max_workers ## pass to config_dict["pdftext_workers"]
67
  self.max_retries = max_retries ## pass to __call__
68
+ self.output_dir = output_dir ## "output_dir": settings.DEBUG_DATA_FOLDER if debug else output_dir,
69
  self.use_llm = use_llm[0] if isinstance(use_llm, tuple) else use_llm, #False, #True,
70
  #self.page_range = page_range[0] if isinstance(page_range, tuple) else page_range ##SMY: iterating twice because self.page casting as hint type tuple!
71
  self.page_range = page_range if page_range else None
 
122
  logger.exception(f"✗ Error parsing/processing custom config_dict: {exc}\n{tb}")
123
  raise RuntimeError(f"✗ Error parsing/processing custom config_dict: {exc}\n{tb}") #.with_traceback(tb)
124
 
125
+ # 3) Create the artifact dictionary and retrieve the LLM service. ##SMY: disused
126
  try:
127
+ ##self.artifact_dict: Dict[str, Any] = self.get_create_model_dict ##SMY: Might have to eliminate function afterall
128
+ #self.artifact_dict: Dict[str, Type[BaseModel]] = create_model_dict() ##SMY: BaseModel for Any??
129
+ self.artifact_dict = {} ##dummy
130
+ ##logger.log(level=20, msg="✔️ Create artifact_dict and llm_service retrieved:", extra={"llm_service": self.llm_service})
131
 
132
  except Exception as exc:
133
  tb = traceback.format_exc() #exc.__traceback__
134
  logger.exception(f"✗ Error creating artifact_dict or retrieving LLM service: {exc}\n{tb}")
135
  raise RuntimeError(f"✗ Error creating artifact_dict or retrieving LLM service: {exc}\n{tb}") #.with_traceback(tb)
136
 
137
+ # 4) Load models if not already loaded in reload mode
138
+ try:
139
+ if 'model_dict' not in globals():
140
+ #model_dict = self.load_models()
141
+ model_dict = load_models()
142
+ except Exception as exc:
143
+ tb = traceback.format_exc() #exc.__traceback__
144
+ logger.exception(f"✗ Error loading models (reload): {exc}\n{tb}")
145
+ raise RuntimeError(f"✗ Error loading models (reload): {exc}\n{tb}") #.with_traceback(tb)
146
+
147
+
148
+ # 5) Instantiate Marker's MarkerConverter (PdfConverter) with config managed by config_parser
149
  try:
150
  llm_service_str = str(self.llm_service).split("'")[1] ## SMY: split and slicing ##Gets the string value
151
 
 
153
  os.environ["OPENAI_API_KEY"] = api_token if api_token !='' or None else self.openai_api_key ## to handle Marker's assertion test on OpenAI
154
  logger.log(level=20, msg="self.converter: instantiating MarkerConverter:", extra={"llm_service_str": llm_service_str, "api_token": api_token}) ##debug
155
 
156
+ config_dict = config_parser.generate_config_dict()
157
+ #config_dict["pdftext_workers"] = self.max_workers #1 ##SMY: move to get_config_dicts()
158
+
159
  #self.converter: MarkerConverter = MarkerConverter(
160
  self.converter = MarkerConverter(
161
+ ##artifact_dict=self.artifact_dict,
162
+ #artifact_dict=create_model_dict(),
163
+ artifact_dict=model_dict,
164
+ config=config_dict,
165
+ #config=config_parser.generate_config_dict(),
166
  #llm_service=self.llm_service ##SMY expecting str but self.llm_service, is service object marker.services of type BaseServices
167
+ llm_service=llm_service_str, ##resolve
168
  )
169
 
170
  logger.log(level=20, msg="✔️ MarkerConverter instantiated successfully:", extra={"converter.config": str(self.converter.config.get("openai_base_url")), "use_llm":self.converter.use_llm})
 
182
  ## Enable higher quality processing with LLMs. ## See MarkerOpenAIService,
183
  #llm_service = llm_service.removeprefix("<class '").removesuffix("'>") # e.g <class 'marker.services.openai.OpenAIService'>
184
  llm_service = str(llm_service).split("'")[1] ## SMY: split and slicing
185
+ self.use_llm = self.use_llm[0] if isinstance(self.use_llm, tuple) else self.use_llm
186
  self.page_range = self.page_range[0] if isinstance(self.page_range, tuple) else self.page_range #if isinstance(self.page_range, str) else None, ##SMY: passing as hint type tuple!
187
 
188
 
 
194
  "temperature" : self.temperature, #self.client.temperature,
195
  "top_p" : self.top_p, #self.client.top_p,
196
  "openai_image_format": self.openai_image_format, #"webp", #"png" #better compatibility
197
+ "pdftext_workers": self.max_workers, ## number of workers to use for pdftext.
198
  "max_retries" : self.max_retries, #3, ## pass to __call__
199
  "output_dir" : self.output_dir,
200
  "use_llm" : self.use_llm, #False, #True,
201
+ "page_range" : self.page_range, ##debug #len(pdf_file)
202
  }
203
  return config_dict
204
  except Exception as exc:
 
207
  raise RuntimeError(f"✗ Error configuring custom config_dict: {exc}\n{tb}") #").with_traceback(tb)
208
  #raise
209
 
210
+ ''' # create/load models. Called to curtail reloading models at each instance
211
+ def load_models():
212
+ return create_model_dict()'''
213
+
214
  ##SMY: flagged for deprecation
215
  ##SMY: marker prefer default artifact dictionary (marker.models.create_model_dict) instead of overridding
216
  #def get_extraction_converter(self, chat_fn):
converters/pdf_to_md.py CHANGED
@@ -100,6 +100,7 @@ def init_worker(#self,
100
  api_token, #: str,
101
  openai_base_url, #: str = "https://router.huggingface.co/v1",
102
  openai_image_format, #: str | None = "webp",
 
103
  max_retries, #: int | None = 2,
104
  output_format, #: str = "markdown",
105
  output_dir, #: Union | None = "output_dir",
 
100
  api_token, #: str,
101
  openai_base_url, #: str = "https://router.huggingface.co/v1",
102
  openai_image_format, #: str | None = "webp",
103
+ max_workers, #: int | None = 1,
104
  max_retries, #: int | None = 2,
105
  output_format, #: str = "markdown",
106
  output_dir, #: Union | None = "output_dir",
llm/llm_login.py CHANGED
@@ -38,12 +38,12 @@ def login_huggingface(token: Optional[str] = None):
38
  try:
39
  #if HfApi.whoami(): ##SMY requires 'self' = HfApi. Alternatively HfApi().whoami()
40
  if whoami(): ##SMY: Call HF API to know "whoami".
41
- logger.info("✔️ hf_login already", extra={"mode": "HF Oauth"})
42
  #return True
43
  else:
44
  login() ##SMY: Not visible/interactive to users on HF Space. #limitation
45
  sleep(5) ##SMY pause for login. Helpful: pool async opex
46
- logger.info("✔️ hf_login already", extra={"mode": "cli"})
47
  #return True
48
  except Exception as exc:
49
  # Respect common env var names; prefer explicit token arg when provided
 
38
  try:
39
  #if HfApi.whoami(): ##SMY requires 'self' = HfApi. Alternatively HfApi().whoami()
40
  if whoami(): ##SMY: Call HF API to know "whoami".
41
+ logger.info("✔️ hf_login already: whoami()", extra={"mode": "HF Oauth"})
42
  #return True
43
  else:
44
  login() ##SMY: Not visible/interactive to users on HF Space. #limitation
45
  sleep(5) ##SMY pause for login. Helpful: pool async opex
46
+ logger.info("✔️ hf_login already: login()", extra={"mode": "cli"})
47
  #return True
48
  except Exception as exc:
49
  # Respect common env var names; prefer explicit token arg when provided
main.py CHANGED
@@ -9,6 +9,8 @@ setup_logging() ## set logging
9
  #logger = get_logger("pypdfmd")
10
  logger = get_logger("parserpdf")
11
 
 
 
12
  if __name__ == "__main__":
13
  # Ensure the working directory is clean
14
  #os.chdir(os.path.dirname(__file__))
@@ -19,4 +21,4 @@ if __name__ == "__main__":
19
 
20
  demo = build_interface()
21
  #demo.launch(debug=True, show_error=True ,ssr_mode=True) #(share=True) # share=True for public link; remove in production
22
- demo.launch(debug=True, show_error=True, ssr_mode=False)
 
9
  #logger = get_logger("pypdfmd")
10
  logger = get_logger("parserpdf")
11
 
12
+
13
+
14
  if __name__ == "__main__":
15
  # Ensure the working directory is clean
16
  #os.chdir(os.path.dirname(__file__))
 
21
 
22
  demo = build_interface()
23
  #demo.launch(debug=True, show_error=True ,ssr_mode=True) #(share=True) # share=True for public link; remove in production
24
+ demo.launch(debug=True, show_error=True, ssr_mode=True) #ssr_mode=False
requirements.txt CHANGED
@@ -1,5 +1,5 @@
1
- gradio>=5.40.0
2
- marker-pdf[full]>=1.3.0 # pip install marker (GitHub: https://github.com/datalab-to/marker)
3
  weasyprint>=59.0 # optional fallback if pandoc is not available
4
  #pandoc==2.3 # for Markdown → PDF conversion
5
  python-magic==0.4.27 # file‑type detection
 
1
+ gradio>=5.44.0
2
+ marker-pdf[full]>=1.10.0 # pip install marker (GitHub: https://github.com/datalab-to/marker)
3
  weasyprint>=59.0 # optional fallback if pandoc is not available
4
  #pandoc==2.3 # for Markdown → PDF conversion
5
  python-magic==0.4.27 # file‑type detection
ui/gradio_ui.py CHANGED
@@ -33,6 +33,17 @@ pdf2md_converter = PdfToMarkdownConverter()
33
  #html2md_converter = HtmlToMarkdownConverter()
34
  #md2pdf_converter = MarkdownToPdfConverter()
35
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
  def get_login_token( api_token_arg, oauth_token: gr.OAuthToken | None=None,):
38
  """ Use user's supplied token or Get token from logged-in users, else from token stored on the machine. Return token"""
@@ -46,7 +57,8 @@ def get_login_token( api_token_arg, oauth_token: gr.OAuthToken | None=None,):
46
  return oauth_token.token ##token value
47
 
48
  # pool executor to convert files called by Gradio
49
- ##SMY: TODO: future: refactor to gradio_process.py
 
50
  def convert_batch(
51
  pdf_files, #: list[str],
52
  pdf_files_count: int,
 
33
  #html2md_converter = HtmlToMarkdownConverter()
34
  #md2pdf_converter = MarkdownToPdfConverter()
35
 
36
+
37
+ # User eXperience: Load Marker models ahead of time if not already loaded in reload mode
38
+ ## SMY: 29Sept2025 - Came across https://github.com/xiaoyao9184/docker-marker/tree/master/gradio
39
+ from converters.extraction_converter import load_models
40
+ try:
41
+ if 'model_dict' not in globals():
42
+ model_dict = load_models()
43
+ except Exception as exc:
44
+ #tb = traceback.format_exc() #exc.__traceback__
45
+ logger.exception(f"✗ Error loading models (reload): {exc}") #\n{tb}")
46
+ raise RuntimeError(f"✗ Error loading models (reload): {exc}") #\n{tb}")
47
 
48
  def get_login_token( api_token_arg, oauth_token: gr.OAuthToken | None=None,):
49
  """ Use user's supplied token or Get token from logged-in users, else from token stored on the machine. Return token"""
 
57
  return oauth_token.token ##token value
58
 
59
  # pool executor to convert files called by Gradio
60
+ ##SMY: TODO: future: refactor to gradio_process.py and
61
+ ## pull options to cli-options{"output_format":, "output_dir_string":, "use_llm":, "page_range":, "force_ocr":, "debug":, "strip_existing_ocr":, "disable_ocr_math":}
62
  def convert_batch(
63
  pdf_files, #: list[str],
64
  pdf_files_count: int,