semmyk committed on
Commit
ee6cd88
·
1 Parent(s): a0c450a

baseline08_beta0.3.9.1_03Oct25: fix slow Marker conversion: @spaces.GPU moved from init_worker and extract to convert_files, with a GPU duration starting at 180s

Browse files
Files changed (3) hide show
  1. converters/pdf_to_md.py +7 -3
  2. globals.py +1 -0
  3. ui/gradio_ui.py +1 -0
converters/pdf_to_md.py CHANGED
@@ -6,6 +6,7 @@ import traceback ## Extract, format and print information about Python stack tr
6
  import time
7
 
8
  import spaces
 
9
 
10
  from converters.extraction_converter import DocumentConverter #, DocumentExtractor #as docextractor #ExtractionConverter #get_extraction_converter ## SMY: should disuse
11
  from file_handler.file_utils import write_markdown, dump_images, collect_pdf_paths, collect_html_paths, collect_markdown_paths, create_outputdir
@@ -21,7 +22,7 @@ docconverter: DocumentConverter = None
21
  converter = None #DocumentConverter
22
  #converter:DocumentConverter.converter = None
23
 
24
- @spaces.GPU
25
  # Define docextractor in the pool as a serialised object that is passed to each worker process.
26
  # Note: DocumentConverter must be "picklable".
27
  def init_worker(#self,
@@ -115,7 +116,7 @@ class PdfToMarkdownConverter:
115
  #duration = 5.75 * pdf_files_count if pdf_files_count>=2 else 7
116
  #duration = 10
117
  #@spaces.GPU(duration=duration) ## HF Spaces GPU support
118
- @spaces.GPU
119
  ## moved from extraction_converter ( to standalone extract_to_md)
120
  #def extract(self, src_path: str, output_dir: str) -> Dict[str, int, Union[str, Path]]:
121
  def extract(self, src_path: str, output_dir: str): #Dict:
@@ -126,7 +127,7 @@ class PdfToMarkdownConverter:
126
  Returns a dict with metadata, e.g. {"filename": <file.name>, "images": <count>, "filepath": <filepath>}.
127
  """
128
 
129
- from globals import config_load_models
130
  try:
131
  ## SMY: TODO: convert HTML files to PDF. Marker will by default attempt weasyprint, which typically raises a 'libgobject-2' error on Windows
132
  weasyprint_libpath = config_load_models.weasyprint_libpath if config_load_models.weasyprint_libpath else None
@@ -178,6 +179,9 @@ class PdfToMarkdownConverter:
178
  #return {"images": len(rendered.images), "file": md_file} ##debug
179
  return {"file": md_file.name, "images": images_count, "filepath": md_file, "image_path": image_path} ####SMY should be Dict[str, int, str]. Dicts are not necessarily ordered.
180
 
 
 
 
181
  #def convert_files(src_path: str, output_dir: str, max_retries: int = 2) -> str:
182
  #def convert_files(self, src_path: str, output_dir_string: str = None, max_retries: int = 2, progress = gr.Progress()) -> Union[Dict, str]: #str:
183
  def convert_files(self, src_path: str, max_retries: int = 2) -> Union[Dict, str]:
 
6
  import time
7
 
8
  import spaces
9
+ from globals import config_load_models
10
 
11
  from converters.extraction_converter import DocumentConverter #, DocumentExtractor #as docextractor #ExtractionConverter #get_extraction_converter ## SMY: should disuse
12
  from file_handler.file_utils import write_markdown, dump_images, collect_pdf_paths, collect_html_paths, collect_markdown_paths, create_outputdir
 
22
  converter = None #DocumentConverter
23
  #converter:DocumentConverter.converter = None
24
 
25
+ #@spaces.GPU
26
  # Define docextractor in the pool as a serialised object that is passed to each worker process.
27
  # Note: DocumentConverter must be "picklable".
28
  def init_worker(#self,
 
116
  #duration = 5.75 * pdf_files_count if pdf_files_count>=2 else 7
117
  #duration = 10
118
  #@spaces.GPU(duration=duration) ## HF Spaces GPU support
119
+ #@spaces.GPU
120
  ## moved from extraction_converter ( to standalone extract_to_md)
121
  #def extract(self, src_path: str, output_dir: str) -> Dict[str, int, Union[str, Path]]:
122
  def extract(self, src_path: str, output_dir: str): #Dict:
 
127
  Returns a dict with metadata, e.g. {"filename": <file.name>, "images": <count>, "filepath": <filepath>}.
128
  """
129
 
130
+ #from globals import config_load_models ##SMY: moved to top-level import
131
  try:
132
  ## SMY: TODO: convert HTML files to PDF. Marker will by default attempt weasyprint, which typically raises a 'libgobject-2' error on Windows
133
  weasyprint_libpath = config_load_models.weasyprint_libpath if config_load_models.weasyprint_libpath else None
 
179
  #return {"images": len(rendered.images), "file": md_file} ##debug
180
  return {"file": md_file.name, "images": images_count, "filepath": md_file, "image_path": image_path} ####SMY should be Dict[str, int, str]. Dicts are not necessarily ordered.
181
 
182
+ #duration = 5.75 * pdf_files_count if pdf_files_count>=2 else 7
183
+ duration = 20*config_load_models.pdf_files_count if config_load_models.pdf_files_count>=10 else 180 ## sec
184
+ @spaces.GPU(duration=duration) ## HF Spaces GPU support
185
  #def convert_files(src_path: str, output_dir: str, max_retries: int = 2) -> str:
186
  #def convert_files(self, src_path: str, output_dir_string: str = None, max_retries: int = 2, progress = gr.Progress()) -> Union[Dict, str]: #str:
187
  def convert_files(self, src_path: str, max_retries: int = 2) -> Union[Dict, str]:
globals.py CHANGED
@@ -8,6 +8,7 @@ class Config:
8
  self.model_dict = {}
9
  self.weasyprint_libpath = ""
10
  self.config_ini = "utils\\config.ini"
 
11
 
12
  # Create a single, shared instance of the Config class
13
  # Other modules will import and use this instance.
 
8
  self.model_dict = {}
9
  self.weasyprint_libpath = ""
10
  self.config_ini = "utils\\config.ini"
11
+ self.pdf_files_count = 0
12
 
13
  # Create a single, shared instance of the Config class
14
  # Other modules will import and use this instance.
ui/gradio_ui.py CHANGED
@@ -161,6 +161,7 @@ def convert_batch(
161
  page_range = page_range if page_range else get_config_value(config_file,"MARKER_CAP", "PAGE_RANGE")
162
  weasyprint_dll_directories= weasyprint_dll_directories if weasyprint_dll_directories else None
163
  config_load_models.weasyprint_libpath = weasyprint_dll_directories ## Assign user's weasyprint path to Global var
 
164
 
165
  progress((3,16), desc=f"Retrieved configuration values")
166
  time.sleep(0.25)
 
161
  page_range = page_range if page_range else get_config_value(config_file,"MARKER_CAP", "PAGE_RANGE")
162
  weasyprint_dll_directories= weasyprint_dll_directories if weasyprint_dll_directories else None
163
  config_load_models.weasyprint_libpath = weasyprint_dll_directories ## Assign user's weasyprint path to Global var
164
+ config_load_models.pdf_files_count = pdf_files_count
165
 
166
  progress((3,16), desc=f"Retrieved configuration values")
167
  time.sleep(0.25)