baseline08_beta0.3.1_01Oct25: refactor progress feedback, gradio ui tweak, weasyprint dll path
Files changed:
- README.md (+2 -1)
- converters/pdf_to_md.py (+5 -6)
- file_handler/file_utils.py (+2 -2)
- globals.py (+2 -0)
- ui/gradio_ui.py (+64 -22)
- utils/get_config.py (+1 -0)
- utils/lib_loader.py (+8 -7)
README.md

@@ -193,7 +193,8 @@ parserpdf/
 - Process: Outputs Markdown files with extracted text/images to `output_dir`.
 
 ## Configuration
-- Edit `utils/config.
+- Edit `utils/config.ini` or `utils/config.py` for defaults (e.g., model ID, output dir).
+- On windows, set weasyprint's GTK path: e.g. "C:\\Dat\\dev\\gtk3-runtime\\bin" or "C:\\msys64\\mingw64\\bin"
 - UI overrides: Adjust sliders for max tokens, temperature, workers, etc.
 
 ## LLM Providers
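A note on that new Configuration line: on Windows, Python 3.8+ resolves native DLLs only from directories registered explicitly, so putting the GTK runtime on PATH is not enough by itself. A minimal sketch of the setup the README describes, assuming an MSYS2 or gtk3-runtime install (the paths are examples, not requirements):

```python
# Hypothetical setup snippet for the README's Windows note; adjust GTK_BIN
# to wherever your GTK3 runtime actually lives.
import os
import sys

GTK_BIN = r"C:\msys64\mingw64\bin"  # or r"C:\Dat\dev\gtk3-runtime\bin"

if sys.platform == "win32" and os.path.isdir(GTK_BIN):
    # Python 3.8+ only searches directories registered via add_dll_directory,
    # so PATH alone will not surface libgobject-2.0 and the other GTK DLLs.
    os.add_dll_directory(GTK_BIN)

import weasyprint  # should now find libgobject-2.0-0.dll and friends
```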
converters/pdf_to_md.py

@@ -5,11 +5,9 @@ from typing import List, Dict, Union, Optional
 import traceback ## Extract, format and print information about Python stack traces.
 import time
 
-from ui.gradio_ui import gr
 from converters.extraction_converter import DocumentConverter #, DocumentExtractor #as docextractor #ExtractionConverter #get_extraction_converter ## SMY: should disuse
 from file_handler.file_utils import write_markdown, dump_images, collect_pdf_paths, collect_html_paths, collect_markdown_paths, create_outputdir
 
-
 from utils import config
 from utils.lib_loader import set_weasyprint_library
 from utils.logger import get_logger
@@ -45,9 +43,7 @@ def init_worker(#self,
     use_llm: bool, #: bool | None = False,
     force_ocr: bool,
     page_range: str, #: str | None = None
-    progress: gr.Progress = gr.Progress(),
 ):
-
     #'''
     """
     instantiate DocumentConverter/DocumentExtractor for use in each pool worker
@@ -64,6 +60,7 @@ def init_worker(#self,
     #'''
     # 1) Instantiate the DocumentConverter
     logger.log(level=20, msg="initialising docconverter:", extra={"model_id": model_id, "hf_provider": hf_provider}) ##debug
+
     try:
         docconverter = DocumentConverter(
             model_id, #: str,
@@ -121,10 +118,12 @@ class PdfToMarkdownConverter:
         Returns a dict with metadata, e.g. {"filename": <file.name>, "images": <count>, "filepath": <filepath>}.
         """
 
+        from globals import config_load_models
         try:
             ## SMY: TODO: convert htmls to PDF. Marker will by default attempt weasyprint which typically raise 'libgobject-2' error on Win
+            weasyprint_libpath = config_load_models.weasyprint_libpath if config_load_models.weasyprint_libpath else None
             # Set a new environment variable
-            set_weasyprint_library() ##utils.lib_loader.set_weasyprint_library()
+            set_weasyprint_library(weasyprint_libpath) ##utils.lib_loader.set_weasyprint_library()
         except Exception as exc:
             tb = traceback.format_exc()
             logger.exception(f"Error loading weasyprint backend dependency → {exc}\n{tb}", exc_info=True) # Log the full traceback
@@ -173,7 +172,7 @@ class PdfToMarkdownConverter:
 
     #def convert_files(src_path: str, output_dir: str, max_retries: int = 2) -> str:
     #def convert_files(self, src_path: str, output_dir_string: str = None, max_retries: int = 2, progress = gr.Progress()) -> Union[Dict, str]: #str:
-    def convert_files(self, src_path: str, max_retries: int = 2
+    def convert_files(self, src_path: str, max_retries: int = 2) -> Union[Dict, str]:
         #def convert_files(self, src_path: str) -> str:
         """
         Worker task: use `extractor` to convert file with retry/backoff.
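The net effect of these hunks is to decouple the worker module from the UI: `gr.Progress` no longer travels into `init_worker`, and the weasyprint path is pulled from the shared `globals` instance at conversion time. For context, a minimal sketch of the pool-initializer pattern the module is built around; the names here are illustrative stand-ins, not the repo's actual classes:

```python
# Each pool process runs init_worker once and keeps its own converter in a
# module-level global, so nothing unpicklable (UI handles, loaded models)
# has to cross the process boundary with every task.
from concurrent.futures import ProcessPoolExecutor

_converter = None  # per-process state, populated by the initializer

def init_worker(model_id: str) -> None:
    """Heavy one-time setup, executed inside every worker process."""
    global _converter
    _converter = {"model_id": model_id}  # stand-in for DocumentConverter(...)

def convert_one(path: str) -> dict:
    """Worker task: relies on the converter init_worker prepared."""
    return {"filename": path, "model": _converter["model_id"]}

if __name__ == "__main__":
    with ProcessPoolExecutor(max_workers=2,
                             initializer=init_worker,
                             initargs=("some/model-id",)) as pool:
        for result in pool.map(convert_one, ["a.pdf", "b.pdf"]):
            print(result)
```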
file_handler/file_utils.py

@@ -479,8 +479,8 @@ def write_markdown(
     #md_path = Path("data") / output_dir / f"{src.stem}" / md_name ##debug
     md_path = Path(output_dir) / f"{src.stem}" / md_name ##debug
     ##SMY: [resolved] Permission Errno13 - https://stackoverflow.com/a/57454275
-
-    md_path.parent.mkdir(parents=True, exist_ok=True) ##SMY: md_path now resides in Temp
+    md_path.parent.mkdir(mode=0o2755, parents=True, exist_ok=True) ##SMY: create nested md_path if not exists
+    #md_path.parent.mkdir(parents=True, exist_ok=True) ##SMY: md_path now resides in Temp
     md_path.parent.chmod(0)
 
     try:
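One caveat on the new `mkdir` call: the `mode` argument is filtered through the process umask and ignored on Windows, so `0o2755` is a request rather than a guarantee. A small sketch of the pattern (the path segments are placeholders):

```python
from pathlib import Path

md_path = Path("output") / "doc_stem" / "doc.md"
# parents=True creates the nested chain; exist_ok=True makes the call
# idempotent; mode is advisory (masked by umask, ignored on Windows).
md_path.parent.mkdir(mode=0o2755, parents=True, exist_ok=True)
md_path.write_text("# converted markdown\n", encoding="utf-8")
```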
globals.py

@@ -6,6 +6,8 @@ class Config:
     """ Single model_dict use across the app"""
     def __init__(self):
         self.model_dict = {}
+        self.weasyprint_libpath = ""
+        self.config_ini = "utils\\config.ini"
 
 # Create a single, shared instance of the Config class
 # Other modules will import and use this instance.
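These two attributes make the shared `Config` instance the hand-off point between the UI (which writes `weasyprint_libpath`) and the loader (which reads it). A sketch of the pattern with illustrative values:

```python
# globals.py-style shared state: import the instance, never re-instantiate.
class Config:
    """Single shared state object used across the app."""
    def __init__(self):
        self.model_dict = {}
        self.weasyprint_libpath = ""
        self.config_ini = "utils\\config.ini"

config_load_models = Config()

# in another module: from globals import config_load_models
config_load_models.weasyprint_libpath = r"C:\msys64\mingw64\bin"
print(config_load_models.weasyprint_libpath)
```

Worth keeping in mind: worker processes started with the default spawn method (always the case on Windows) re-import `globals` fresh, so a value assigned in the parent after import is not automatically visible inside pool workers; the fallback path through `config.ini` covers that case.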
ui/gradio_ui.py

@@ -2,7 +2,8 @@
 from ast import Interactive
 import gradio as gr
 from concurrent.futures import ProcessPoolExecutor, as_completed
-import asyncio
+import asyncio ##future
+import time
 
 from pathlib import Path, WindowsPath
 from typing import Optional, Union #, Dict, List, Any, Tuple
@@ -83,9 +84,12 @@ def convert_batch(
     use_llm: bool = False, #Optional[bool] = False, #True,
     force_ocr: bool = True, #Optional[bool] = False,
     page_range: str = None, #Optional[str] = None,
+    weasyprint_dll_directories: str = None,
     tz_hours: str = None,
     oauth_token: gr.OAuthToken | None=None,
-    progress: gr.Progress = gr.Progress(), #Progress tracker to keep tab on pool queue executor
+    progress: gr.Progress = gr.Progress(track_tqdm=True), #Progress tracker to keep tab on pool queue executor
+    progress1: gr.Progress = gr.Progress(),
+    #progress2: gr.Progress = gr.Progress(track_tqdm=True),
 ): #-> str:
     """
     Handles the conversion process using multiprocessing.
@@ -98,6 +102,7 @@ def convert_batch(
     # [template]: #outputs=[process_button, log_output, files_individual_JSON, files_individual_downloads],
     yield gr.update(interactive=False), f"Commencing Processing ... Getting login", {"process": "Commencing Processing"}, f"dummy_log.log"
     progress((0,16), f"Commencing Processing ...")
+    time.sleep(0.25)
 
     # get token from logged-in user:
     api_token = get_login_token(api_token_arg=api_token_gr, oauth_token=oauth_token)
@@ -126,6 +131,7 @@ def convert_batch(
         return [gr.update(interactive=True), f"✗ An error occurred during login_huggingface → {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"dummy_log.log"] # return the exception message
     '''
     progress((1,16), desc=f"Log in: {is_loggedin_huggingface(api_token)}")
+    time.sleep(0.25)
     ## debug
     #logger.log(level=30, msg="pdf_files_inputs", extra={"input_arg[0]:": pdf_files[0]})
 
@@ -136,8 +142,11 @@ def convert_batch(
         return [gr.update(interactive=True), "Initialising ProcessPool: No files uploaded.", {"Upload":"No files uploaded"}, f"dummy_log.log"]
 
     progress((2,16), desc=f"Getting configuration values")
+    time.sleep(0.25)
     # Get config values if not provided
-    config_file = find_file("config.ini") ##from file_handler.file_utils ##takes a bit of time to process. #NeedOptimise
+    #config_file = find_file("config.ini") ##from file_handler.file_utils ##takes a bit of time to process. #NeedOptimise
+
+    config_file = Path("utils") / "config.ini" ##SMY: speed up sacrificing flexibility
     model_id = model_id if model_id else get_config_value(config_file, "MARKER_CAP", "MODEL_ID")
     openai_base_url = openai_base_url if openai_base_url else get_config_value(config_file, "MARKER_CAP", "OPENAI_BASE_URL")
     openai_image_format = openai_image_format if openai_image_format else get_config_value(config_file, "MARKER_CAP", "OPENAI_IMAGE_FORMAT")
@@ -147,12 +156,16 @@ def convert_batch(
     output_dir_string = output_dir_string if output_dir_string else str(get_config_value(config_file, "MARKER_CAP", "OUTPUT_DIR"))
     use_llm = use_llm if use_llm else get_config_value(config_file, "MARKER_CAP", "USE_LLM")
     page_range = page_range if page_range else get_config_value(config_file,"MARKER_CAP", "PAGE_RANGE")
+    weasyprint_dll_directories= weasyprint_dll_directories if weasyprint_dll_directories else None
+    config_load_models.weasyprint_libpath = weasyprint_dll_directories ## Assign user's weasyprint path to Global var
 
     progress((3,16), desc=f"Retrieved configuration values")
+    time.sleep(0.25)
 
     # Create the initargs tuple from the Gradio inputs: # 'files' is an iterable, and handled separately.
-    progress((4,16), desc=f"Initialiasing init_args")
     yield gr.update(interactive=False), f"Initialising init_args", {"process": "Processing files ..."}, f"dummy_log.log"
+    progress((4,16), desc=f"Initialiasing init_args")
+    time.sleep(0.25)
     init_args = (
         provider,
         model_id,
@@ -175,13 +188,15 @@ def convert_batch(
         use_llm,
         force_ocr,
         page_range,
+        #progress,
     )
 
     # create output_dir
     try:
         yield gr.update(interactive=False), f"Creating output_dir ...", {"process": "Processing files ..."}, f"dummy_log.log"
         progress((5,16), desc=f"ProcessPoolExecutor: Creating output_dir")
-
+        time.sleep(0.25)
+
         #pdf2md_converter.output_dir_string = output_dir_string ##SMY: attempt setting directly to resolve pool.map iterable
 
         # Create Marker output_dir in temporary directory where Gradio can access it.
@@ -191,6 +206,7 @@ def convert_batch(
         logger.info(f"✓ output_dir created: ", extra={"output_dir": pdf2md_converter.output_dir.name, "in": str(pdf2md_converter.output_dir.parent)})
         yield gr.update(interactive=False), f"Created output_dir ...", {"process": "Processing files ..."}, f"dummy_log.log"
         progress((6,16), desc=f"✓ Created output_dir.")
+        time.sleep(0.25)
     except Exception as exc:
         tb = traceback.format_exc()
         tbp = traceback.print_exc() # Print the exception traceback
@@ -206,6 +222,7 @@ def convert_batch(
     logger.log(level=30, msg="Initialising ProcessPoolExecutor: pool:", extra={"pdf_files": pdf_files, "files_len": len(pdf_files), "model_id": model_id, "output_dir": output_dir_string}) #pdf_files_count
     yield gr.update(interactive=False), f"Initialising ProcessPoolExecutor: Processing Files ...", {"process": "Processing files ..."}, f"dummy_log.log"
     progress((7,16), desc=f"Initialising ProcessPoolExecutor: Processing Files ...")
+    time.sleep(0.25)
 
     # Create a pool with init_worker initialiser
     with ProcessPoolExecutor(
@@ -215,6 +232,7 @@ def convert_batch(
     ) as pool:
         logger.log(level=30, msg="Initialising ProcessPoolExecutor: pool:", extra={"pdf_files": pdf_files, "files_len": len(pdf_files), "model_id": model_id, "output_dir": output_dir_string}) #pdf_files_count
         progress((8,16), desc=f"Starting ProcessPool queue: Processing Files ...")
+        time.sleep(0.25)
 
         # Map the files (pdf_files) to the conversion function (pdf2md_converter.convert_file)
         # The 'docconverter' argument is implicitly handled by the initialiser
@@ -223,21 +241,27 @@ def convert_batch(
         #futures = [pool.submit(pdf2md_converter.convert_files, file) for file in pdf_files]
         #logs = [f.result() for f in futures]
         try:
-            yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion ...", {"process": "Processing files ..."}, f"dummy_log.log"
+            #yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion ...", {"process": "Processing files ..."}, f"dummy_log.log"
             progress((9,16), desc=f"ProcessPoolExecutor: Pooling file conversion ...")
+            time.sleep(0.25)
+            yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion ...", {"process": "Processing files ..."}, f"dummy_log.log"
 
             # Use progress.tqdm to integrate with the executor map
             #results = pool.map(pdf2md_converter.convert_files, pdf_files) ##SMY iterables #max_retries #output_dir_string)
             for result_interim in progress.tqdm(
-                iterable=pool.map(pdf2md_converter.convert_files, pdf_files) #, max_retries), total=len(pdf_files)
-            ):
+                iterable=pool.map(pdf2md_converter.convert_files, pdf_files), #, max_retries), total=len(pdf_files)
+                desc="ProcessPoolExecutor: Pooling file conversion ..."):
                 results.append(result_interim)
+
                 # Update the Gradio UI to improve user-friendly eXperience
-                yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]", {"process": "Processing files ..."}, f"dummy_log.log"
-                progress((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
+                #yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]", {"process": "Processing files ..."}, f"dummy_log.log"
+                #progress((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
+                #progress2((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
+                #time.sleep(0.25)
 
             yield gr.update(interactive=True), f"ProcessPoolExecutor: Got Results from files conversion: [{str(result_interim)}[:20]]", {"process": "Processing files ..."}, f"dummy_log.log"
             progress((11,16), desc=f"ProcessPoolExecutor: Got Results from files conversion")
+            time.sleep(0.25)
         except Exception as exc:
             # Raise the exception to stop the Gradio app: exception to halt execution
             logger.exception("Error during pooling file conversion", exc_info=True) # Log the full traceback
@@ -248,8 +272,9 @@ def convert_batch(
 
     # Process file conversion results
     try:
-        progress((12,16), desc="Processing results from files conversion") ##rekickin
         logger.log(level=20, msg="ProcessPoolExecutor pool result:", extra={"results": str(results)})
+        progress((12,16), desc="Processing results from files conversion") ##rekickin
+        time.sleep(0.25)
 
         logs = []
         logs_files_images = []
@@ -265,10 +290,11 @@ def convert_batch(
         for i, log in enumerate(logs):
            logs_files_images.append(log.get("filepath") if is_dict(log) or is_list_of_dicts(logs) else "Error or no file_path") # isinstance(log, (dict, str))
            logs_files_images.extend(list(image for image in log.get("image_path", "Error or no image_path")))
-
+           i_image_count = log.get("images", 0)
            # Update the Gradio UI to improve user-friendly eXperience
            #yield gr.update(interactive=False), f"Processing files: {logs_files_images[logs_count]}", {"process": "Processing files"}, f"dummy_log.log"
-
+           progress1(0.7, desc=f"Processing result log {i}: {str(log)}")
+           logs_count = i+i_image_count
     except Exception as exc:
         tbp = traceback.print_exc() # Print the exception traceback
         logger.exception("Error during processing results logs → {exc}\n{tbp}", exc_info=True) # Log the full traceback
@@ -283,11 +309,14 @@ def convert_batch(
     # Zip Processed Files and images. Insert to first index
     try: ##from file_handler.file_utils
         progress((13,16), desc="Zipping processed files and images")
+        time.sleep(0.25)
         zipped_processed_files = zip_processed_files(root_dir=f"{output_dir}", file_paths=logs_files_images, tz_hours=tz_hours, date_format='%d%b%Y_%H-%M-%S') #date_format='%d%b%Y'
         logs_files_images.insert(0, zipped_processed_files)
 
-
+
         #yield gr.update(interactive=False), f"Processing zip and files: {logs_files_images}", {"process": "Processing files"}, f"dummy_log.log"
+        progress((14,16), desc="Zipped processed files and images")
+        time.sleep(0.25)
 
     except Exception as exc:
         tb = traceback.format_exc()
@@ -300,6 +329,7 @@ def convert_batch(
     # Return processed files log
     try:
         progress((15,16), desc="Formatting processed log results")
+        time.sleep(0.25)
 
         ## # Convert logs list of dicts to formatted json string
         logs_return_formatted_json_string = file_handler.file_utils.process_dicts_data(logs) #"\n".join(log for log in logs) ##SMY outputs to gr.JSON component with no need for json.dumps(data, indent=)
@@ -310,6 +340,7 @@ def convert_batch(
         logger.log(level=20, msg="File conversion complete. Sending outcome to Gradio:", extra={"logs_files_image_return": str(logs_files_images_return)}) ## debug: FileNotFoundError: [WinError 2] The system cannot find the file specified: 'Error or no image_path'
 
         progress((16,16), desc="Complete processing and formatting file processing results")
+        time.sleep(0.25)
         # [templates]
         #outputs=[process_button, log_output, files_individual_JSON, files_individual_downloads],
         #return "\n".join(logs), "\n".join(logs_files_images) #"\n".join(logs_files)
@@ -581,7 +612,8 @@ def build_interface() -> gr.Blocks:
         gr.Markdown(f"#### **Marker Configuration**")
         with gr.Row():
             openai_base_url_tb = gr.Textbox(
-                label="OpenAI Base URL
+                label="OpenAI Base URL",
+                info = "default HuggingFace",
                 value="https://router.huggingface.co/v1",
                 lines=1,
                 max_lines=1,
@@ -624,15 +656,24 @@ def build_interface() -> gr.Blocks:
                 value=False
             )
             force_ocr_cb = gr.Checkbox(
-                label="
+                label="Force OCR on all pages",
                 value=True,
             )
-
-
-
-
-
-
+            with gr.Column():
+                page_range_tb = gr.Textbox(
+                    label="Page Range (Optional)",
+                    value=0,
+                    placeholder="Example: 0,1-5,8,12-15 ~(default: first page)",
+                    lines=1,
+                    max_lines=1,
+                )
+                weasyprint_dll_directories_tb = gr.Textbox(
+                    label="Path to weasyprint DLL libraries",
+                    info='"C:\\Dat\\dev\\gtk3-runtime\\bin" or "C:\\msys64\\mingw64\\bin"',
+                    placeholder="C:\\msys64\\mingw64\\bin",
+                    lines=1,
+                    max_lines=1,
+                )
 
 
         with gr.Accordion("🤗 HuggingFace Client Logout", open=True): #, open=False):
@@ -952,6 +993,7 @@ def build_interface() -> gr.Blocks:
             use_llm_cb,
             force_ocr_cb,
             page_range_tb,
+            weasyprint_dll_directories_tb,
             tz_hours_num, #state_tz_hours
         ]
 
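Most of the churn above is the new progress plumbing: `gr.Progress(track_tqdm=True)` plus `progress.tqdm(...)` wrapped around the executor's lazy `pool.map` iterator, with short sleeps so each of the sixteen steps stays visible. A self-contained sketch of that pattern, assuming Gradio 4.x; the task function is a stand-in for a file conversion:

```python
import time
from concurrent.futures import ProcessPoolExecutor
import gradio as gr

def work(x: int) -> int:
    time.sleep(0.5)  # stand-in for converting one file
    return x * x

def run(n, progress: gr.Progress = gr.Progress(track_tqdm=True)):
    progress((0, 2), desc="Starting pool ...")
    results = []
    with ProcessPoolExecutor(max_workers=2) as pool:
        # pool.map yields lazily, so the bar advances as each result lands
        for r in progress.tqdm(pool.map(work, range(int(n))), desc="Converting ..."):
            results.append(r)
    progress((2, 2), desc="Done")
    return str(results)

with gr.Blocks() as demo:
    n = gr.Number(value=4, label="Tasks")
    out = gr.Textbox(label="Results")
    gr.Button("Run").click(run, inputs=n, outputs=out)

if __name__ == "__main__":
    demo.launch()
```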
utils/get_config.py

@@ -35,6 +35,7 @@ def get_config_value(config_file:Path, section_key:str, parameter:str, fallback:
     try:
         #config_file = find_config(config_file)
         cfg = config()
+        config_file = config_file if isinstance(config_file, Path) else Path(config_file)
         if config_file.is_file():
             cfg.read(config_file)
             param_value = cfg[section_key].get(option=parameter, fallback=fallback) #"C:\\Dat\\dev\\gtk3-runtime\\bin")
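The added coercion guards against callers handing over a plain string (as `globals.config_ini` now does) where the function expects a `Path`. A sketch of the hardened lookup; section and option names mirror the repo's `config.ini`:

```python
from configparser import ConfigParser
from pathlib import Path
from typing import Union

def get_config_value(config_file: Union[str, Path], section_key: str,
                     parameter: str, fallback: str = None) -> str:
    cfg = ConfigParser()
    # Accept str or Path: normalise before touching the filesystem.
    config_file = config_file if isinstance(config_file, Path) else Path(config_file)
    if config_file.is_file():
        cfg.read(config_file)
        if cfg.has_section(section_key):
            return cfg[section_key].get(option=parameter, fallback=fallback)
    return fallback

print(get_config_value("utils/config.ini", "LIBRARIES_CAP",
                       "WEASYPRINT_DLL_DIRECTORIES",
                       fallback=r"C:\msys64\mingw64\bin"))
```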
utils/lib_loader.py

@@ -20,13 +20,14 @@ def set_weasyprint_library(libpath: Union[str, Path] = None, config_file: Union[
 
     #libgobject_path = #"/path/to/your/custom/glib/install/lib/libgobject-2.0.so.0"
     if not libpath:
-
-
-
-
-        from
-        config_file =
-
+        #from file_handler.file_utils import find_file
+        #config_file = find_file("config.ini") ##from file_handler.file_utils
+
+        ## Alternate to speed up while sacrificing
+        from globals import config_load_models
+        config_file = config_load_models.config_ini
+
+    lib_path = get_config_value(Path(config_file), "LIBRARIES_CAP", "WEASYPRINT_DLL_DIRECTORIES") if not libpath else "C:\\msys64\\mingw64\\bin"
 
     # Check if the file exists before attempting to load it
     #if not os.path.exists(libobject):
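The rewritten branch trades the slow `find_file` scan for the path cached on the shared `globals` instance, falling back to the `WEASYPRINT_DLL_DIRECTORIES` key in `config.ini`. For reference, a hedged sketch of what the loader ultimately has to achieve on Windows; the DLL name matches the MSYS2 build, the env-var handoff assumes a reasonably recent WeasyPrint, and the paths are examples:

```python
import ctypes
import os
import sys
from pathlib import Path

def set_weasyprint_library(libpath: str = None) -> None:
    """Make WeasyPrint's GTK dependencies loadable before importing it."""
    if sys.platform != "win32":
        return  # POSIX loaders find the shared libraries on their own
    libpath = libpath or r"C:\msys64\mingw64\bin"
    if not Path(libpath).is_dir():
        raise FileNotFoundError(f"GTK runtime directory not found: {libpath}")
    os.add_dll_directory(libpath)            # Python 3.8+ DLL search entry
    os.environ["WEASYPRINT_DLL_DIRECTORIES"] = libpath  # WeasyPrint's own hook
    # Smoke test: the DLL whose absence raises the 'libgobject-2' error.
    ctypes.CDLL("libgobject-2.0-0.dll")

if __name__ == "__main__":
    set_weasyprint_library()
```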
|