baseline08_beta0.2.0_29Sept25: fix oauth_token.token (convert_batch); fix models load; update README and requirements
Files changed:
- README.md +31 -3
- converters/extraction_converter.py +40 -13
- converters/pdf_to_md.py +1 -0
- llm/llm_login.py +2 -2
- main.py +3 -1
- requirements.txt +2 -2
- ui/gradio_ui.py +13 -1
README.md
CHANGED
```diff
@@ -4,17 +4,37 @@ emoji: 📚
 colorFrom: yellow
 colorTo: purple
 sdk: gradio
+sdk_version: 5.44.1
+python_version: 3.12
 command: python main.py
 app_file: main.py
 hf_oauth: true
 oauth_scopes: [read-access]
-python_version: 3.12
 license: mit
 pinned: true
 short_description: PDF & HTML parser to markdown
-models: [meta-llama/Llama-4-Maverick-17B-128E-Instruct, openai/gpt-oss-120b, openai/gpt-oss-20b]
+#models: [meta-llama/Llama-4-Maverick-17B-128E-Instruct, openai/gpt-oss-120b, openai/gpt-oss-20b, ]
+models:
+  - meta-llama/Llama-4-Maverick-17B-128E-Instruct
+  - openai/gpt-oss-120b, openai/gpt-oss-20b
+  - vikp/surya_det3
+  - vikp/surya_rec2
+  - vikp/surya_tablerec
+  - datalab-to/surya_layout
+  - datalab-to/surya_tablerec
+  - datalab-to/texify
+  - datalab-to/ocr_error_detection
+  - datalab-to/inline_math_det0
+  - datalab-to/line_detector0
+  - xiaoyao9184/surya_text_detection
+  - xiaoyao9184/surya_text_recognition
+  - xiaoyao9184/surya_table_recognition
+  - xiaoyao9184/surya_texify
+  - xiaoyao9184/surya_layout
+  - xiaoyao9184/surya_ocr_error_detection
+  - xiaoyao9184/surya_inline_math_detection]
 tags: [markdown, PDF, parser, converter, extractor]
-preload_from_hub: [https://huggingface.co/datalab-to/surya_layout, https://huggingface.co/datalab-to/surya_tablerec, huggingface.co/datalab-to/line_detector0, https://huggingface.co/tarun-menta/ocr_error_detection/blob/main/config.json]
+#preload_from_hub: [https://huggingface.co/datalab-to/surya_layout, https://huggingface.co/datalab-to/surya_tablerec, huggingface.co/datalab-to/line_detector0, https://huggingface.co/tarun-menta/ocr_error_detection/blob/main/config.json]
 owner: research-semmyk
 #---
 #
@@ -46,6 +66,14 @@ requires-python: ">=3.12"
 # - huggingface.co/datalab-to/line_detector0
 # - https://huggingface.co/tarun-menta/ocr_error_detection/blob/main/config.json
 #owner: research-semmyk
+## Model list
+#[
+#  "datalab/models/text_recognition/2025_08_29",
+#  "datalab/models/layout/2025_02_18",
+#  "datalab/models/table_recognition/2025_02_18",
+#  "datalab/models/text_detection/2025_05_07",
+#  "datalab/models/ocr_error_detection/2025_02_18",
+#]
 ---
 
 # parserPDF
```
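For context: the `models:` list declares the Hub repos this Space depends on, while `preload_from_hub` (left commented out here) asks the Space builder to download the listed repos into the image at build time. A rough local equivalent of that preload step, offered only as a sketch (it assumes `huggingface_hub` is installed; the repo ids come from the list above):

```python
# Sketch: cache the repos that preload_from_hub would fetch at build time,
# so the first conversion doesn't block on model downloads.
from huggingface_hub import hf_hub_download, snapshot_download

for repo_id in (
    "datalab-to/surya_layout",
    "datalab-to/surya_tablerec",
    "datalab-to/line_detector0",
):
    snapshot_download(repo_id)  # downloads the whole repo into the local HF cache

# the fourth entry points at a single file, so fetch just that file
hf_hub_download("tarun-menta/ocr_error_detection", filename="config.json")
```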
converters/extraction_converter.py
CHANGED
```diff
@@ -21,6 +21,10 @@ from utils.logger import get_logger
 
 logger = get_logger(__name__)
 
+# create/load models. Called to curtail reloading models at each instance
+def load_models():
+    return create_model_dict()
+
 # Full document converter
 class DocumentConverter:
     """
@@ -43,7 +47,7 @@ class DocumentConverter:
         api_token: str,
         openai_base_url: str = "https://router.huggingface.co/v1",
         openai_image_format: Optional[str] = "webp",
-
+        max_workers: Optional[str] =1, #4, for config_dict["pdftext_workers"]
         max_retries: Optional[int] = 2,
         output_format: str = "markdown",
         output_dir: Optional[Union[str, Path]] = "output_dir",
@@ -59,8 +63,9 @@ class DocumentConverter:
         self.top_p = top_p # self.client.top_p,
         self.llm_service = MarkerOpenAIService
         self.openai_image_format = openai_image_format #"png" #better compatibility
+        self.max_workers = max_workers ## pass to config_dict["pdftext_workers"]
         self.max_retries = max_retries ## pass to __call__
-        self.output_dir = output_dir
+        self.output_dir = output_dir ## "output_dir": settings.DEBUG_DATA_FOLDER if debug else output_dir,
         self.use_llm = use_llm[0] if isinstance(use_llm, tuple) else use_llm, #False, #True,
         #self.page_range = page_range[0] if isinstance(page_range, tuple) else page_range ##SMY: iterating twice because self.page casting as hint type tuple!
         self.page_range = page_range if page_range else None
@@ -117,18 +122,30 @@ class DocumentConverter:
             logger.exception(f"✗ Error parsing/processing custom config_dict: {exc}\n{tb}")
             raise RuntimeError(f"✗ Error parsing/processing custom config_dict: {exc}\n{tb}") #.with_traceback(tb)
 
-        # 3) Create the artifact dictionary and retrieve the LLM service.
+        # 3) Create the artifact dictionary and retrieve the LLM service. ##SMY: disused
         try:
-
-            self.artifact_dict: Dict[str, Type[BaseModel]] = create_model_dict() ##SMY: BaseModel for Any??
-
+            ##self.artifact_dict: Dict[str, Any] = self.get_create_model_dict ##SMY: Might have to eliminate function afterall
+            #self.artifact_dict: Dict[str, Type[BaseModel]] = create_model_dict() ##SMY: BaseModel for Any??
+            self.artifact_dict = {} ##dummy
+            ##logger.log(level=20, msg="✔️ Create artifact_dict and llm_service retrieved:", extra={"llm_service": self.llm_service})
 
         except Exception as exc:
             tb = traceback.format_exc() #exc.__traceback__
             logger.exception(f"✗ Error creating artifact_dict or retrieving LLM service: {exc}\n{tb}")
             raise RuntimeError(f"✗ Error creating artifact_dict or retrieving LLM service: {exc}\n{tb}") #.with_traceback(tb)
 
-        # 4)
+        # 4) Load models if not already loaded in reload mode
+        try:
+            if 'model_dict' not in globals():
+                #model_dict = self.load_models()
+                model_dict = load_models()
+        except Exception as exc:
+            tb = traceback.format_exc() #exc.__traceback__
+            logger.exception(f"✗ Error loading models (reload): {exc}\n{tb}")
+            raise RuntimeError(f"✗ Error loading models (reload): {exc}\n{tb}") #.with_traceback(tb)
+
+
+        # 5) Instantiate Marker's MarkerConverter (PdfConverter) with config managed by config_parser
         try:
             llm_service_str = str(self.llm_service).split("'")[1] ## SMY: split and slicing ##Gets the string value
 
@@ -136,13 +153,18 @@ class DocumentConverter:
             os.environ["OPENAI_API_KEY"] = api_token if api_token !='' or None else self.openai_api_key ## to handle Marker's assertion test on OpenAI
             logger.log(level=20, msg="self.converter: instantiating MarkerConverter:", extra={"llm_service_str": llm_service_str, "api_token": api_token}) ##debug
 
+            config_dict = config_parser.generate_config_dict()
+            #config_dict["pdftext_worker"] = self.max_workers #1 ##SMY: move to get_config_dicts()
+
             #self.converter: MarkerConverter = MarkerConverter(
             self.converter = MarkerConverter(
-
-                artifact_dict=create_model_dict(),
-
+                ##artifact_dict=self.artifact_dict,
+                #artifact_dict=create_model_dict(),
+                artifact_dict=model_dict,
+                config=config_dict,
+                #config=config_parser.generate_config_dict(),
                 #llm_service=self.llm_service ##SMY expecting str but self.llm_service, is service object marker.services of type BaseServices
-                llm_service=llm_service_str ##resolve
+                llm_service=llm_service_str, ##resolve
             )
 
             logger.log(level=20, msg="✔️ MarkerConverter instantiated successfully:", extra={"converter.config": str(self.converter.config.get("openai_base_url")), "use_llm":self.converter.use_llm})
@@ -160,7 +182,7 @@ class DocumentConverter:
         ## Enable higher quality processing with LLMs. ## See MarkerOpenAIService,
         #llm_service = llm_service.removeprefix("<class '").removesuffix("'>") # e.g <class 'marker.services.openai.OpenAIService'>
         llm_service = str(llm_service).split("'")[1] ## SMY: split and slicing
-        self.use_llm = self.use_llm[0]
+        self.use_llm = self.use_llm[0] if isinstance(self.use_llm, tuple) else self.use_llm
         self.page_range = self.page_range[0] if isinstance(self.page_range, tuple) else self.page_range #if isinstance(self.page_range, str) else None, ##SMY: passing as hint type tuple!
 
 
@@ -172,10 +194,11 @@ class DocumentConverter:
                 "temperature" : self.temperature, #self.client.temperature,
                 "top_p" : self.top_p, #self.client.top_p,
                 "openai_image_format": self.openai_image_format, #"webp", #"png" #better compatibility
+                "pdftext_workers": self.max_workers, ## number of workers to use for pdftext."
                 "max_retries" : self.max_retries, #3, ## pass to __call__
                 "output_dir" : self.output_dir,
                 "use_llm" : self.use_llm, #False, #True,
-                "page_range" : self.page_range,
+                "page_range" : self.page_range, ##debug #len(pdf_file)
             }
             return config_dict
         except Exception as exc:
@@ -184,6 +207,10 @@ class DocumentConverter:
             raise RuntimeError(f"✗ Error configuring custom config_dict: {exc}\n{tb}") #").with_traceback(tb)
             #raise
 
+    ''' # create/load models. Called to curtail reloading models at each instance
+    def load_models():
+        return create_model_dict()'''
+
     ##SMY: flagged for deprecation
     ##SMY: marker prefer default artifact dictionary (marker.models.create_model_dict) instead of overridding
     #def get_extraction_converter(self, chat_fn):
```
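The substance of the "fix models load" part of this commit: `create_model_dict()` loads every surya/marker model, so calling it inside each `DocumentConverter.__init__` reloaded the models on every conversion. The diff hoists it into a module-level `load_models()` guarded by a `globals()` check. A minimal sketch of the same idea using an explicit module-level sentinel instead of the `globals()` lookup (an alternative guard, not the committed code):

```python
# Sketch: build marker's artifact/model dict once per process and reuse it.
from marker.models import create_model_dict

_MODEL_DICT = None  # process-wide cache


def load_models():
    """Return the shared model dict, creating it on first use only."""
    global _MODEL_DICT
    if _MODEL_DICT is None:
        _MODEL_DICT = create_model_dict()  # heavy: loads detection/recognition/layout models
    return _MODEL_DICT
```

Either guard works because module state persists for the life of the process; the `globals()` form in the diff is aimed at Gradio's reload mode, where the module body may be re-executed.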
converters/pdf_to_md.py
CHANGED
```diff
@@ -100,6 +100,7 @@ def init_worker(#self,
     api_token, #: str,
     openai_base_url, #: str = "https://router.huggingface.co/v1",
     openai_image_format, #: str | None = "webp",
+    max_workers, #: int | None = 1,
     max_retries, #: int | None = 2,
     output_format, #: str = "markdown",
     output_dir, #: Union | None = "output_dir",
```
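`init_worker` now also receives `max_workers`, which `DocumentConverter` forwards as `pdftext_workers`. The diff does not show how `init_worker` is attached to the pool; a hypothetical sketch of the usual wiring (the executor setup and `WORKER_CONFIG` name are illustrative, not from the repo):

```python
# Hypothetical sketch: a ProcessPoolExecutor initializer runs once per worker
# process, letting each worker stash shared settings before converting files.
from concurrent.futures import ProcessPoolExecutor

WORKER_CONFIG = {}  # per-process storage filled in by the initializer


def init_worker(api_token, openai_base_url, max_workers, max_retries):
    global WORKER_CONFIG
    WORKER_CONFIG = {
        "api_token": api_token,
        "openai_base_url": openai_base_url,
        "pdftext_workers": max_workers,  # threaded through to marker's config_dict
        "max_retries": max_retries,
    }


executor = ProcessPoolExecutor(
    max_workers=2,
    initializer=init_worker,
    initargs=("hf_token_here", "https://router.huggingface.co/v1", 1, 2),  # placeholder token
)
```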
llm/llm_login.py
CHANGED
```diff
@@ -38,12 +38,12 @@ def login_huggingface(token: Optional[str] = None):
     try:
         #if HfApi.whoami(): ##SMY requires 'self' = HfApi. Alternatively HfApi().whoami()
         if whoami(): ##SMY: Call HF API to know "whoami".
-            logger.info("✔️ hf_login already", extra={"mode": "HF Oauth"})
+            logger.info("✔️ hf_login already: whoami()", extra={"mode": "HF Oauth"})
             #return True
         else:
             login() ##SMY: Not visible/interactive to users onH Space. #limitation
             sleep(5) ##SMY pause for login. Helpful: pool async opex
-            logger.info("✔️ hf_login already", extra={"mode": "cli"})
+            logger.info("✔️ hf_login already: login()", extra={"mode": "cli"})
             #return True
     except Exception as exc:
         # Respect common env var names; prefer explicit token arg when provided
```
main.py
CHANGED
```diff
@@ -9,6 +9,8 @@ setup_logging() ## set logging
 #logger = get_logger("pypdfmd")
 logger = get_logger("parserpdf")
 
+
+
 if __name__ == "__main__":
     # Ensure the working directory is clean
     #os.chdir(os.path.dirname(__file__))
@@ -19,4 +21,4 @@ if __name__ == "__main__":
 
     demo = build_interface()
     #demo.launch(debug=True, show_error=True ,ssr_mode=True) #(share=True) # share=True for public link; remove in production
-    demo.launch(debug=True, show_error=True, ssr_mode=False)
+    demo.launch(debug=True, show_error=True, ssr_mode=True) #ssr_mode=False
```
requirements.txt
CHANGED
```diff
@@ -1,5 +1,5 @@
-gradio>=5.
-marker-pdf[full]>=1.
+gradio>=5.44.0
+marker-pdf[full]>=1.10.0 # pip install marker (GitHub: https://github.com/datalab-to/marker)
 weasyprint>=59.0 # optional fallback if pandoc is not available
 #pandoc==2.3 # for Markdown → PDF conversion
 python-magic==0.4.27 # file‑type detection
```
ui/gradio_ui.py
CHANGED
```diff
@@ -33,6 +33,17 @@ pdf2md_converter = PdfToMarkdownConverter()
 #html2md_converter = HtmlToMarkdownConverter()
 #md2pdf_converter = MarkdownToPdfConverter()
 
+
+# User eXperience: Load Marker models ahead of time if not already loaded in reload mode
+## SMY: 29Sept2025 - Came across https://github.com/xiaoyao9184/docker-marker/tree/master/gradio
+from converters.extraction_converter import load_models
+try:
+    if 'model_dict' not in globals():
+        model_dict = load_models()
+except Exception as exc:
+    #tb = traceback.format_exc() #exc.__traceback__
+    logger.exception(f"✗ Error loading models (reload): {exc}") #\n{tb}")
+    raise RuntimeError(f"✗ Error loading models (reload): {exc}") #\n{tb}")
 
 def get_login_token( api_token_arg, oauth_token: gr.OAuthToken | None=None,):
     """ Use user's supplied token or Get token from logged-in users, else from token stored on the machine. Return token"""
@@ -46,7 +57,8 @@ def get_login_token( api_token_arg, oauth_token: gr.OAuthToken | None=None,):
         return oauth_token.token ##token value
 
 # pool executor to convert files called by Gradio
-##SMY: TODO: future: refactor to gradio_process.py
+##SMY: TODO: future: refactor to gradio_process.py and
+## pull options to cli-options{"output_format":, "output_dir_string":, "use_llm":, "page_range":, "force_ocr":, "debug":, "strip_existing_ocr":, "disable_ocr_math""}
 def convert_batch(
     pdf_files, #: list[str],
     pdf_files_count: int,
```
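The commit message's headline fix, `oauth_token.token` in `convert_batch`, concerns the resolution order that `get_login_token`'s docstring describes: an explicitly supplied token wins, then the Gradio OAuth token's `.token` string (not the `gr.OAuthToken` object itself), then whatever credential is cached on the machine. A sketch of that order (`get_token` is `huggingface_hub`'s accessor for the locally stored token; the real `get_login_token` above may differ in detail):

```python
# Sketch: resolve an API token in the order the docstring describes.
import gradio as gr
from huggingface_hub import get_token


def resolve_token(api_token_arg, oauth_token: gr.OAuthToken | None = None):
    if api_token_arg:                # 1) user-supplied token wins
        return api_token_arg
    if oauth_token is not None:      # 2) logged-in Space user
        return oauth_token.token     #    the fix: pass the .token string onward
    return get_token()               # 3) token cached on the machine, if any
```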
|