Spaces:

semmyk
/

parserPDF

Sleeping

App Files Files Community

semmyk committed on Oct 3

Commit

ee6cd88

1 Parent(s): a0c450a

baseline08_beta0.3.9.1_03Oct25: fixing slow Marker: @spaces.GPU moved to convert_files starting at 180s

Browse files

Files changed (3) hide show

converters/pdf_to_md.py +7 -3
globals.py +1 -0
ui/gradio_ui.py +1 -0

converters/pdf_to_md.py CHANGED Viewed

@@ -6,6 +6,7 @@ import traceback  ## Extract, format and print information about Python stack tr
 import time
 import spaces
 from converters.extraction_converter import DocumentConverter  #, DocumentExtractor #as docextractor #ExtractionConverter  #get_extraction_converter  ## SMY: should disuse
 from file_handler.file_utils import write_markdown, dump_images, collect_pdf_paths, collect_html_paths, collect_markdown_paths, create_outputdir
@@ -21,7 +22,7 @@ docconverter: DocumentConverter = None
 converter = None  #DocumentConverter
 #converter:DocumentConverter.converter = None
-@spaces.GPU
 # Define docextractor in the pool as serialised object and passed to each worker process.
 # Note: DocumentConverter must be "picklable".
 def init_worker(#self,
@@ -115,7 +116,7 @@ class PdfToMarkdownConverter:
     #duration = 5.75 * pdf_files_count if pdf_files_count>=2 else 7
     #duration = 10
     #@spaces.GPU(duration=duration)   ## HF Spaces GPU support
-    @spaces.GPU
     ## moved from extraction_converter ( to standalone extract_to_md)
     #def extract(self, src_path: str, output_dir: str) -> Dict[str, int, Union[str, Path]]:
     def extract(self, src_path: str, output_dir: str):   #Dict:
@@ -126,7 +127,7 @@ class PdfToMarkdownConverter:
         Returns a dict with metadata, e.g. {"filename": <file.name>, "images": <count>, "filepath": <filepath>}.
         """
-        from globals import config_load_models
         try:
             ## SMY: TODO: convert htmls to PDF. Marker will by default attempt weasyprint, which typically raises a 'libgobject-2' error on Windows
             weasyprint_libpath = config_load_models.weasyprint_libpath if config_load_models.weasyprint_libpath else None
@@ -178,6 +179,9 @@ class PdfToMarkdownConverter:
         #return {"images": len(rendered.images), "file": md_file}  ##debug
         return {"file": md_file.name, "images": images_count, "filepath": md_file, "image_path": image_path}  ####SMY should be Dict[str, int, str]. Dicts are not necessarily ordered.
     #def convert_files(src_path: str, output_dir: str, max_retries: int = 2) -> str:
     #def convert_files(self, src_path: str, output_dir_string: str = None, max_retries: int = 2, progress = gr.Progress()) -> Union[Dict, str]:  #str:
     def convert_files(self, src_path: str, max_retries: int = 2) -> Union[Dict, str]:

 import time
 import spaces
+from globals import config_load_models
 from converters.extraction_converter import DocumentConverter  #, DocumentExtractor #as docextractor #ExtractionConverter  #get_extraction_converter  ## SMY: should disuse
 from file_handler.file_utils import write_markdown, dump_images, collect_pdf_paths, collect_html_paths, collect_markdown_paths, create_outputdir
 converter = None  #DocumentConverter
 #converter:DocumentConverter.converter = None
+#@spaces.GPU
 # Define docextractor in the pool as serialised object and passed to each worker process.
 # Note: DocumentConverter must be "picklable".
 def init_worker(#self,
     #duration = 5.75 * pdf_files_count if pdf_files_count>=2 else 7
     #duration = 10
     #@spaces.GPU(duration=duration)   ## HF Spaces GPU support
+    #@spaces.GPU
     ## moved from extraction_converter ( to standalone extract_to_md)
     #def extract(self, src_path: str, output_dir: str) -> Dict[str, int, Union[str, Path]]:
     def extract(self, src_path: str, output_dir: str):   #Dict:
         Returns a dict with metadata, e.g. {"filename": <file.name>, "images": <count>, "filepath": <filepath>}.
         """
+        #from globals import config_load_models   ##SMY: moved to top-level import
         try:
             ## SMY: TODO: convert htmls to PDF. Marker will by default attempt weasyprint, which typically raises a 'libgobject-2' error on Windows
             weasyprint_libpath = config_load_models.weasyprint_libpath if config_load_models.weasyprint_libpath else None
         #return {"images": len(rendered.images), "file": md_file}  ##debug
         return {"file": md_file.name, "images": images_count, "filepath": md_file, "image_path": image_path}  ####SMY should be Dict[str, int, str]. Dicts are not necessarily ordered.
+    #duration = 5.75 * pdf_files_count if pdf_files_count>=2 else 7
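+    duration = 20*config_load_models.pdf_files_count if config_load_models.pdf_files_count>=10 else 180  ## sec. NOTE(review): evaluated once when the class body runs, not per call; Config initialises pdf_files_count to 0, so this likely always yields 180 -- confirm (spaces.GPU may accept a callable duration for per-call values).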
+    @spaces.GPU(duration=duration)   ## HF Spaces GPU support
     #def convert_files(src_path: str, output_dir: str, max_retries: int = 2) -> str:
     #def convert_files(self, src_path: str, output_dir_string: str = None, max_retries: int = 2, progress = gr.Progress()) -> Union[Dict, str]:  #str:
     def convert_files(self, src_path: str, max_retries: int = 2) -> Union[Dict, str]:

globals.py CHANGED Viewed

@@ -8,6 +8,7 @@ class Config:
         self.model_dict = {}
         self.weasyprint_libpath = ""
         self.config_ini = "utils\\config.ini"
 # Create a single, shared instance of the Config class
 # Other modules will import and use this instance.

         self.model_dict = {}
         self.weasyprint_libpath = ""
         self.config_ini = "utils\\config.ini"
+        self.pdf_files_count = 0
 # Create a single, shared instance of the Config class
 # Other modules will import and use this instance.

ui/gradio_ui.py CHANGED Viewed

@@ -161,6 +161,7 @@ def convert_batch(
     page_range = page_range if page_range else get_config_value(config_file,"MARKER_CAP", "PAGE_RANGE")
     weasyprint_dll_directories= weasyprint_dll_directories if weasyprint_dll_directories else None
     config_load_models.weasyprint_libpath = weasyprint_dll_directories  ## Assign user's weasyprint path to Global var
     progress((3,16), desc=f"Retrieved configuration values")
     time.sleep(0.25)

     page_range = page_range if page_range else get_config_value(config_file,"MARKER_CAP", "PAGE_RANGE")
     weasyprint_dll_directories= weasyprint_dll_directories if weasyprint_dll_directories else None
     config_load_models.weasyprint_libpath = weasyprint_dll_directories  ## Assign user's weasyprint path to Global var
+    config_load_models.pdf_files_count = pdf_files_count
     progress((3,16), desc=f"Retrieved configuration values")
     time.sleep(0.25)