Spaces:
Sleeping
Sleeping
baseline08_beta0.3.9.1_03Oct25: fixing slow Marker: @spaces.GPU moved to convert_files starting at 180s
Browse files
- converters/pdf_to_md.py +7 -3
- globals.py +1 -0
- ui/gradio_ui.py +1 -0
converters/pdf_to_md.py
CHANGED
|
@@ -6,6 +6,7 @@ import traceback ## Extract, format and print information about Python stack tr
|
|
| 6 |
import time
|
| 7 |
|
| 8 |
import spaces
|
|
|
|
| 9 |
|
| 10 |
from converters.extraction_converter import DocumentConverter #, DocumentExtractor #as docextractor #ExtractionConverter #get_extraction_converter ## SMY: should disuse
|
| 11 |
from file_handler.file_utils import write_markdown, dump_images, collect_pdf_paths, collect_html_paths, collect_markdown_paths, create_outputdir
|
|
@@ -21,7 +22,7 @@ docconverter: DocumentConverter = None
|
|
| 21 |
converter = None #DocumentConverter
|
| 22 |
#converter:DocumentConverter.converter = None
|
| 23 |
|
| 24 |
-
|
| 25 |
# Define docextractor in the pool as a serialised object that is passed to each worker process.
|
| 26 |
# Note: DocumentConverter must be "picklable".
|
| 27 |
def init_worker(#self,
|
|
@@ -115,7 +116,7 @@ class PdfToMarkdownConverter:
|
|
| 115 |
#duration = 5.75 * pdf_files_count if pdf_files_count>=2 else 7
|
| 116 |
#duration = 10
|
| 117 |
#@spaces.GPU(duration=duration) ## HF Spaces GPU support
|
| 118 |
-
|
| 119 |
## moved from extraction_converter ( to standalone extract_to_md)
|
| 120 |
#def extract(self, src_path: str, output_dir: str) -> Dict[str, int, Union[str, Path]]:
|
| 121 |
def extract(self, src_path: str, output_dir: str): #Dict:
|
|
@@ -126,7 +127,7 @@ class PdfToMarkdownConverter:
|
|
| 126 |
Returns a dict with metadata, e.g. {"file": <file.name>, "images": <count>, "filepath": <filepath>, "image_path": <image_path>}.
|
| 127 |
"""
|
| 128 |
|
| 129 |
-
from globals import config_load_models
|
| 130 |
try:
|
| 131 |
## SMY: TODO: convert HTML files to PDF. By default Marker attempts weasyprint, which typically raises a 'libgobject-2' error on Windows
|
| 132 |
weasyprint_libpath = config_load_models.weasyprint_libpath if config_load_models.weasyprint_libpath else None
|
|
@@ -178,6 +179,9 @@ class PdfToMarkdownConverter:
|
|
| 178 |
#return {"images": len(rendered.images), "file": md_file} ##debug
|
| 179 |
return {"file": md_file.name, "images": images_count, "filepath": md_file, "image_path": image_path} ####SMY should be Dict[str, int, str]. Dicts are not necessarily ordered.
|
| 180 |
|
|
|
|
|
|
|
|
|
|
| 181 |
#def convert_files(src_path: str, output_dir: str, max_retries: int = 2) -> str:
|
| 182 |
#def convert_files(self, src_path: str, output_dir_string: str = None, max_retries: int = 2, progress = gr.Progress()) -> Union[Dict, str]: #str:
|
| 183 |
def convert_files(self, src_path: str, max_retries: int = 2) -> Union[Dict, str]:
|
|
|
|
| 6 |
import time
|
| 7 |
|
| 8 |
import spaces
|
| 9 |
+
from globals import config_load_models
|
| 10 |
|
| 11 |
from converters.extraction_converter import DocumentConverter #, DocumentExtractor #as docextractor #ExtractionConverter #get_extraction_converter ## SMY: should disuse
|
| 12 |
from file_handler.file_utils import write_markdown, dump_images, collect_pdf_paths, collect_html_paths, collect_markdown_paths, create_outputdir
|
|
|
|
| 22 |
converter = None #DocumentConverter
|
| 23 |
#converter:DocumentConverter.converter = None
|
| 24 |
|
| 25 |
+
#@spaces.GPU
|
| 26 |
# Define docextractor in the pool as a serialised object that is passed to each worker process.
|
| 27 |
# Note: DocumentConverter must be "picklable".
|
| 28 |
def init_worker(#self,
|
|
|
|
| 116 |
#duration = 5.75 * pdf_files_count if pdf_files_count>=2 else 7
|
| 117 |
#duration = 10
|
| 118 |
#@spaces.GPU(duration=duration) ## HF Spaces GPU support
|
| 119 |
+
#@spaces.GPU
|
| 120 |
## moved from extraction_converter ( to standalone extract_to_md)
|
| 121 |
#def extract(self, src_path: str, output_dir: str) -> Dict[str, int, Union[str, Path]]:
|
| 122 |
def extract(self, src_path: str, output_dir: str): #Dict:
|
|
|
|
| 127 |
Returns a dict with metadata, e.g. {"file": <file.name>, "images": <count>, "filepath": <filepath>, "image_path": <image_path>}.
|
| 128 |
"""
|
| 129 |
|
| 130 |
+
#from globals import config_load_models ##SMY: moved to top-level import
|
| 131 |
try:
|
| 132 |
## SMY: TODO: convert HTML files to PDF. By default Marker attempts weasyprint, which typically raises a 'libgobject-2' error on Windows
|
| 133 |
weasyprint_libpath = config_load_models.weasyprint_libpath if config_load_models.weasyprint_libpath else None
|
|
|
|
| 179 |
#return {"images": len(rendered.images), "file": md_file} ##debug
|
| 180 |
return {"file": md_file.name, "images": images_count, "filepath": md_file, "image_path": image_path} ####SMY should be Dict[str, int, str]. Dicts are not necessarily ordered.
|
| 181 |
|
| 182 |
+
#duration = 5.75 * pdf_files_count if pdf_files_count>=2 else 7
|
| 183 |
+
duration = 20*config_load_models.pdf_files_count if config_load_models.pdf_files_count>=10 else 180 ## sec
|
| 184 |
+
@spaces.GPU(duration=duration) ## HF Spaces GPU support
|
| 185 |
#def convert_files(src_path: str, output_dir: str, max_retries: int = 2) -> str:
|
| 186 |
#def convert_files(self, src_path: str, output_dir_string: str = None, max_retries: int = 2, progress = gr.Progress()) -> Union[Dict, str]: #str:
|
| 187 |
def convert_files(self, src_path: str, max_retries: int = 2) -> Union[Dict, str]:
|
globals.py
CHANGED
|
@@ -8,6 +8,7 @@ class Config:
|
|
| 8 |
self.model_dict = {}
|
| 9 |
self.weasyprint_libpath = ""
|
| 10 |
self.config_ini = "utils\\config.ini"
|
|
|
|
| 11 |
|
| 12 |
# Create a single, shared instance of the Config class
|
| 13 |
# Other modules will import and use this instance.
|
|
|
|
| 8 |
self.model_dict = {}
|
| 9 |
self.weasyprint_libpath = ""
|
| 10 |
self.config_ini = "utils\\config.ini"
|
| 11 |
+
self.pdf_files_count = 0
|
| 12 |
|
| 13 |
# Create a single, shared instance of the Config class
|
| 14 |
# Other modules will import and use this instance.
|
ui/gradio_ui.py
CHANGED
|
@@ -161,6 +161,7 @@ def convert_batch(
|
|
| 161 |
page_range = page_range if page_range else get_config_value(config_file,"MARKER_CAP", "PAGE_RANGE")
|
| 162 |
weasyprint_dll_directories= weasyprint_dll_directories if weasyprint_dll_directories else None
|
| 163 |
config_load_models.weasyprint_libpath = weasyprint_dll_directories ## Assign user's weasyprint path to Global var
|
|
|
|
| 164 |
|
| 165 |
progress((3,16), desc=f"Retrieved configuration values")
|
| 166 |
time.sleep(0.25)
|
|
|
|
| 161 |
page_range = page_range if page_range else get_config_value(config_file,"MARKER_CAP", "PAGE_RANGE")
|
| 162 |
weasyprint_dll_directories= weasyprint_dll_directories if weasyprint_dll_directories else None
|
| 163 |
config_load_models.weasyprint_libpath = weasyprint_dll_directories ## Assign user's weasyprint path to Global var
|
| 164 |
+
config_load_models.pdf_files_count = pdf_files_count
|
| 165 |
|
| 166 |
progress((3,16), desc=f"Retrieved configuration values")
|
| 167 |
time.sleep(0.25)
|