semmyk commited on
Commit
bfbdd1d
·
1 Parent(s): d82ee51

baseline08_beta0.2.0_29Sept25: fix oauth_token.token (convert_batch). - fixing models load. - update README, requirements

Browse files
README.md CHANGED
@@ -4,17 +4,37 @@ emoji: 📚
4
  colorFrom: yellow
5
  colorTo: purple
6
  sdk: gradio
 
 
7
  command: python main.py
8
  app_file: main.py
9
  hf_oauth: true
10
  oauth_scopes: [read-access]
11
- python_version: 3.12
12
  license: mit
13
  pinned: true
14
  short_description: PDF & HTML parser to markdown
15
- models: [meta-llama/Llama-4-Maverick-17B-128E-Instruct, openai/gpt-oss-120b, openai/gpt-oss-20b]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  tags: [markdown, PDF, parser, converter, extractor]
17
- preload_from_hub: [https://huggingface.co/datalab-to/surya_layout, https://huggingface.co/datalab-to/surya_tablerec, huggingface.co/datalab-to/line_detector0, https://huggingface.co/tarun-menta/ocr_error_detection/blob/main/config.json]
18
  owner: research-semmyk
19
  #---
20
  #
@@ -46,6 +66,14 @@ requires-python: ">=3.12"
46
  # - huggingface.co/datalab-to/line_detector0
47
  # - https://huggingface.co/tarun-menta/ocr_error_detection/blob/main/config.json
48
  #owner: research-semmyk
 
 
 
 
 
 
 
 
49
  ---
50
 
51
  # parserPDF
 
4
  colorFrom: yellow
5
  colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 5.44.1
8
+ python_version: 3.12
9
  command: python main.py
10
  app_file: main.py
11
  hf_oauth: true
12
  oauth_scopes: [read-access]
 
13
  license: mit
14
  pinned: true
15
  short_description: PDF & HTML parser to markdown
16
+ #models: [meta-llama/Llama-4-Maverick-17B-128E-Instruct, openai/gpt-oss-120b, openai/gpt-oss-20b, ]
17
+ models:
18
+ - meta-llama/Llama-4-Maverick-17B-128E-Instruct
19
+ - openai/gpt-oss-120b
+ - openai/gpt-oss-20b
20
+ - vikp/surya_det3
21
+ - vikp/surya_rec2
22
+ - vikp/surya_tablerec
23
+ - datalab-to/surya_layout
24
+ - datalab-to/surya_tablerec
25
+ - datalab-to/texify
26
+ - datalab-to/ocr_error_detection
27
+ - datalab-to/inline_math_det0
28
+ - datalab-to/line_detector0
29
+ - xiaoyao9184/surya_text_detection
30
+ - xiaoyao9184/surya_text_recognition
31
+ - xiaoyao9184/surya_table_recognition
32
+ - xiaoyao9184/surya_texify
33
+ - xiaoyao9184/surya_layout
34
+ - xiaoyao9184/surya_ocr_error_detection
35
+ - xiaoyao9184/surya_inline_math_detection
36
  tags: [markdown, PDF, parser, converter, extractor]
37
+ #preload_from_hub: [https://huggingface.co/datalab-to/surya_layout, https://huggingface.co/datalab-to/surya_tablerec, huggingface.co/datalab-to/line_detector0, https://huggingface.co/tarun-menta/ocr_error_detection/blob/main/config.json]
38
  owner: research-semmyk
39
  #---
40
  #
 
66
  # - huggingface.co/datalab-to/line_detector0
67
  # - https://huggingface.co/tarun-menta/ocr_error_detection/blob/main/config.json
68
  #owner: research-semmyk
69
+ ## Model list
70
+ #[
71
+ # "datalab/models/text_recognition/2025_08_29",
72
+ # "datalab/models/layout/2025_02_18",
73
+ # "datalab/models/table_recognition/2025_02_18",
74
+ # "datalab/models/text_detection/2025_05_07",
75
+ # "datalab/models/ocr_error_detection/2025_02_18",
76
+ #]
77
  ---
78
 
79
  # parserPDF
converters/extraction_converter.py CHANGED
@@ -21,6 +21,10 @@ from utils.logger import get_logger
21
 
22
  logger = get_logger(__name__)
23
 
 
 
 
 
24
  # Full document converter
25
  class DocumentConverter:
26
  """
@@ -43,7 +47,7 @@ class DocumentConverter:
43
  api_token: str,
44
  openai_base_url: str = "https://router.huggingface.co/v1",
45
  openai_image_format: Optional[str] = "webp",
46
- #max_workers: Optional[str] = 4,
47
  max_retries: Optional[int] = 2,
48
  output_format: str = "markdown",
49
  output_dir: Optional[Union[str, Path]] = "output_dir",
@@ -59,8 +63,9 @@ class DocumentConverter:
59
  self.top_p = top_p # self.client.top_p,
60
  self.llm_service = MarkerOpenAIService
61
  self.openai_image_format = openai_image_format #"png" #better compatibility
 
62
  self.max_retries = max_retries ## pass to __call__
63
- self.output_dir = output_dir
64
  self.use_llm = use_llm[0] if isinstance(use_llm, tuple) else use_llm, #False, #True,
65
  #self.page_range = page_range[0] if isinstance(page_range, tuple) else page_range ##SMY: iterating twice because self.page casting as hint type tuple!
66
  self.page_range = page_range if page_range else None
@@ -117,18 +122,30 @@ class DocumentConverter:
117
  logger.exception(f"✗ Error parsing/processing custom config_dict: {exc}\n{tb}")
118
  raise RuntimeError(f"✗ Error parsing/processing custom config_dict: {exc}\n{tb}") #.with_traceback(tb)
119
 
120
- # 3) Create the artifact dictionary and retrieve the LLM service.
121
  try:
122
- #self.artifact_dict: Dict[str, Any] = self.get_create_model_dict ##SMY: Might have to eliminate function afterall
123
- self.artifact_dict: Dict[str, Type[BaseModel]] = create_model_dict() ##SMY: BaseModel for Any??
124
- #logger.log(level=20, msg="✔️ Create artifact_dict and llm_service retrieved:", extra={"llm_service": self.llm_service})
 
125
 
126
  except Exception as exc:
127
  tb = traceback.format_exc() #exc.__traceback__
128
  logger.exception(f"✗ Error creating artifact_dict or retrieving LLM service: {exc}\n{tb}")
129
  raise RuntimeError(f"✗ Error creating artifact_dict or retrieving LLM service: {exc}\n{tb}") #.with_traceback(tb)
130
 
131
- # 4) Instantiate Marker's MarkerConverter (PdfConverter) with config managed by config_parser
 
 
 
 
 
 
 
 
 
 
 
132
  try:
133
  llm_service_str = str(self.llm_service).split("'")[1] ## SMY: split and slicing ##Gets the string value
134
 
@@ -136,13 +153,18 @@ class DocumentConverter:
136
  os.environ["OPENAI_API_KEY"] = api_token if api_token !='' or None else self.openai_api_key ## to handle Marker's assertion test on OpenAI
137
  logger.log(level=20, msg="self.converter: instantiating MarkerConverter:", extra={"llm_service_str": llm_service_str, "api_token": api_token}) ##debug
138
 
 
 
 
139
  #self.converter: MarkerConverter = MarkerConverter(
140
  self.converter = MarkerConverter(
141
- #artifact_dict=self.artifact_dict,
142
- artifact_dict=create_model_dict(),
143
- config=config_parser.generate_config_dict(),
 
 
144
  #llm_service=self.llm_service ##SMY expecting str but self.llm_service, is service object marker.services of type BaseServices
145
- llm_service=llm_service_str ##resolve
146
  )
147
 
148
  logger.log(level=20, msg="✔️ MarkerConverter instantiated successfully:", extra={"converter.config": str(self.converter.config.get("openai_base_url")), "use_llm":self.converter.use_llm})
@@ -160,7 +182,7 @@ class DocumentConverter:
160
  ## Enable higher quality processing with LLMs. ## See MarkerOpenAIService,
161
  #llm_service = llm_service.removeprefix("<class '").removesuffix("'>") # e.g <class 'marker.services.openai.OpenAIService'>
162
  llm_service = str(llm_service).split("'")[1] ## SMY: split and slicing
163
- self.use_llm = self.use_llm[0]
164
  self.page_range = self.page_range[0] if isinstance(self.page_range, tuple) else self.page_range #if isinstance(self.page_range, str) else None, ##SMY: passing as hint type tuple!
165
 
166
 
@@ -172,10 +194,11 @@ class DocumentConverter:
172
  "temperature" : self.temperature, #self.client.temperature,
173
  "top_p" : self.top_p, #self.client.top_p,
174
  "openai_image_format": self.openai_image_format, #"webp", #"png" #better compatibility
 
175
  "max_retries" : self.max_retries, #3, ## pass to __call__
176
  "output_dir" : self.output_dir,
177
  "use_llm" : self.use_llm, #False, #True,
178
- "page_range" : self.page_range, #]debug #len(pdf_file)
179
  }
180
  return config_dict
181
  except Exception as exc:
@@ -184,6 +207,10 @@ class DocumentConverter:
184
  raise RuntimeError(f"✗ Error configuring custom config_dict: {exc}\n{tb}") #").with_traceback(tb)
185
  #raise
186
 
 
 
 
 
187
  ##SMY: flagged for deprecation
188
  ##SMY: marker prefer default artifact dictionary (marker.models.create_model_dict) instead of overridding
189
  #def get_extraction_converter(self, chat_fn):
 
21
 
22
  logger = get_logger(__name__)
23
 
24
+ # create/load models. Called to curtail reloading models at each instance
25
+ def load_models():
26
+ return create_model_dict()
27
+
28
  # Full document converter
29
  class DocumentConverter:
30
  """
 
47
  api_token: str,
48
  openai_base_url: str = "https://router.huggingface.co/v1",
49
  openai_image_format: Optional[str] = "webp",
50
+ max_workers: Optional[str] =1, #4, for config_dict["pdftext_workers"]
51
  max_retries: Optional[int] = 2,
52
  output_format: str = "markdown",
53
  output_dir: Optional[Union[str, Path]] = "output_dir",
 
63
  self.top_p = top_p # self.client.top_p,
64
  self.llm_service = MarkerOpenAIService
65
  self.openai_image_format = openai_image_format #"png" #better compatibility
66
+ self.max_workers = max_workers ## pass to config_dict["pdftext_workers"]
67
  self.max_retries = max_retries ## pass to __call__
68
+ self.output_dir = output_dir ## "output_dir": settings.DEBUG_DATA_FOLDER if debug else output_dir,
69
  self.use_llm = use_llm[0] if isinstance(use_llm, tuple) else use_llm, #False, #True,
70
  #self.page_range = page_range[0] if isinstance(page_range, tuple) else page_range ##SMY: iterating twice because self.page casting as hint type tuple!
71
  self.page_range = page_range if page_range else None
 
122
  logger.exception(f"✗ Error parsing/processing custom config_dict: {exc}\n{tb}")
123
  raise RuntimeError(f"✗ Error parsing/processing custom config_dict: {exc}\n{tb}") #.with_traceback(tb)
124
 
125
+ # 3) Create the artifact dictionary and retrieve the LLM service. ##SMY: disused
126
  try:
127
+ ##self.artifact_dict: Dict[str, Any] = self.get_create_model_dict ##SMY: Might have to eliminate function afterall
128
+ #self.artifact_dict: Dict[str, Type[BaseModel]] = create_model_dict() ##SMY: BaseModel for Any??
129
+ self.artifact_dict = {} ##dummy
130
+ ##logger.log(level=20, msg="✔️ Create artifact_dict and llm_service retrieved:", extra={"llm_service": self.llm_service})
131
 
132
  except Exception as exc:
133
  tb = traceback.format_exc() #exc.__traceback__
134
  logger.exception(f"✗ Error creating artifact_dict or retrieving LLM service: {exc}\n{tb}")
135
  raise RuntimeError(f"✗ Error creating artifact_dict or retrieving LLM service: {exc}\n{tb}") #.with_traceback(tb)
136
 
137
+ # 4) Load models if not already loaded in reload mode
138
+ try:
139
+ if 'model_dict' not in globals():
140
+ #model_dict = self.load_models()
141
+ model_dict = load_models()
142
+ except Exception as exc:
143
+ tb = traceback.format_exc() #exc.__traceback__
144
+ logger.exception(f"✗ Error loading models (reload): {exc}\n{tb}")
145
+ raise RuntimeError(f"✗ Error loading models (reload): {exc}\n{tb}") #.with_traceback(tb)
146
+
147
+
148
+ # 5) Instantiate Marker's MarkerConverter (PdfConverter) with config managed by config_parser
149
  try:
150
  llm_service_str = str(self.llm_service).split("'")[1] ## SMY: split and slicing ##Gets the string value
151
 
 
153
  os.environ["OPENAI_API_KEY"] = api_token if api_token !='' or None else self.openai_api_key ## to handle Marker's assertion test on OpenAI
154
  logger.log(level=20, msg="self.converter: instantiating MarkerConverter:", extra={"llm_service_str": llm_service_str, "api_token": api_token}) ##debug
155
 
156
+ config_dict = config_parser.generate_config_dict()
157
+ #config_dict["pdftext_workers"] = self.max_workers #1 ##SMY: move to get_config_dicts()
158
+
159
  #self.converter: MarkerConverter = MarkerConverter(
160
  self.converter = MarkerConverter(
161
+ ##artifact_dict=self.artifact_dict,
162
+ #artifact_dict=create_model_dict(),
163
+ artifact_dict=model_dict,
164
+ config=config_dict,
165
+ #config=config_parser.generate_config_dict(),
166
  #llm_service=self.llm_service ##SMY expecting str but self.llm_service, is service object marker.services of type BaseServices
167
+ llm_service=llm_service_str, ##resolve
168
  )
169
 
170
  logger.log(level=20, msg="✔️ MarkerConverter instantiated successfully:", extra={"converter.config": str(self.converter.config.get("openai_base_url")), "use_llm":self.converter.use_llm})
 
182
  ## Enable higher quality processing with LLMs. ## See MarkerOpenAIService,
183
  #llm_service = llm_service.removeprefix("<class '").removesuffix("'>") # e.g <class 'marker.services.openai.OpenAIService'>
184
  llm_service = str(llm_service).split("'")[1] ## SMY: split and slicing
185
+ self.use_llm = self.use_llm[0] if isinstance(self.use_llm, tuple) else self.use_llm
186
  self.page_range = self.page_range[0] if isinstance(self.page_range, tuple) else self.page_range #if isinstance(self.page_range, str) else None, ##SMY: passing as hint type tuple!
187
 
188
 
 
194
  "temperature" : self.temperature, #self.client.temperature,
195
  "top_p" : self.top_p, #self.client.top_p,
196
  "openai_image_format": self.openai_image_format, #"webp", #"png" #better compatibility
197
+ "pdftext_workers": self.max_workers, ## number of workers to use for pdftext.
198
  "max_retries" : self.max_retries, #3, ## pass to __call__
199
  "output_dir" : self.output_dir,
200
  "use_llm" : self.use_llm, #False, #True,
201
+ "page_range" : self.page_range, ##debug #len(pdf_file)
202
  }
203
  return config_dict
204
  except Exception as exc:
 
207
  raise RuntimeError(f"✗ Error configuring custom config_dict: {exc}\n{tb}") #").with_traceback(tb)
208
  #raise
209
 
210
+ ''' # create/load models. Called to curtail reloading models at each instance
211
+ def load_models():
212
+ return create_model_dict()'''
213
+
214
  ##SMY: flagged for deprecation
215
  ##SMY: marker prefer default artifact dictionary (marker.models.create_model_dict) instead of overridding
216
  #def get_extraction_converter(self, chat_fn):
converters/pdf_to_md.py CHANGED
@@ -100,6 +100,7 @@ def init_worker(#self,
100
  api_token, #: str,
101
  openai_base_url, #: str = "https://router.huggingface.co/v1",
102
  openai_image_format, #: str | None = "webp",
 
103
  max_retries, #: int | None = 2,
104
  output_format, #: str = "markdown",
105
  output_dir, #: Union | None = "output_dir",
 
100
  api_token, #: str,
101
  openai_base_url, #: str = "https://router.huggingface.co/v1",
102
  openai_image_format, #: str | None = "webp",
103
+ max_workers, #: int | None = 1,
104
  max_retries, #: int | None = 2,
105
  output_format, #: str = "markdown",
106
  output_dir, #: Union | None = "output_dir",
llm/llm_login.py CHANGED
@@ -38,12 +38,12 @@ def login_huggingface(token: Optional[str] = None):
38
  try:
39
  #if HfApi.whoami(): ##SMY requires 'self' = HfApi. Alternatively HfApi().whoami()
40
  if whoami(): ##SMY: Call HF API to know "whoami".
41
- logger.info("✔️ hf_login already", extra={"mode": "HF Oauth"})
42
  #return True
43
  else:
44
  login() ##SMY: Not visible/interactive to users on HF Space. #limitation
45
  sleep(5) ##SMY pause for login. Helpful: pool async opex
46
- logger.info("✔️ hf_login already", extra={"mode": "cli"})
47
  #return True
48
  except Exception as exc:
49
  # Respect common env var names; prefer explicit token arg when provided
 
38
  try:
39
  #if HfApi.whoami(): ##SMY requires 'self' = HfApi. Alternatively HfApi().whoami()
40
  if whoami(): ##SMY: Call HF API to know "whoami".
41
+ logger.info("✔️ hf_login already: whoami()", extra={"mode": "HF Oauth"})
42
  #return True
43
  else:
44
  login() ##SMY: Not visible/interactive to users on HF Space. #limitation
45
  sleep(5) ##SMY pause for login. Helpful: pool async opex
46
+ logger.info("✔️ hf_login already: login()", extra={"mode": "cli"})
47
  #return True
48
  except Exception as exc:
49
  # Respect common env var names; prefer explicit token arg when provided
main.py CHANGED
@@ -9,6 +9,8 @@ setup_logging() ## set logging
9
  #logger = get_logger("pypdfmd")
10
  logger = get_logger("parserpdf")
11
 
 
 
12
  if __name__ == "__main__":
13
  # Ensure the working directory is clean
14
  #os.chdir(os.path.dirname(__file__))
@@ -19,4 +21,4 @@ if __name__ == "__main__":
19
 
20
  demo = build_interface()
21
  #demo.launch(debug=True, show_error=True ,ssr_mode=True) #(share=True) # share=True for public link; remove in production
22
- demo.launch(debug=True, show_error=True, ssr_mode=False)
 
9
  #logger = get_logger("pypdfmd")
10
  logger = get_logger("parserpdf")
11
 
12
+
13
+
14
  if __name__ == "__main__":
15
  # Ensure the working directory is clean
16
  #os.chdir(os.path.dirname(__file__))
 
21
 
22
  demo = build_interface()
23
  #demo.launch(debug=True, show_error=True ,ssr_mode=True) #(share=True) # share=True for public link; remove in production
24
+ demo.launch(debug=True, show_error=True, ssr_mode=True) #ssr_mode=False
requirements.txt CHANGED
@@ -1,5 +1,5 @@
1
- gradio>=5.40.0
2
- marker-pdf[full]>=1.3.0 # pip install marker (GitHub: https://github.com/datalab-to/marker)
3
  weasyprint>=59.0 # optional fallback if pandoc is not available
4
  #pandoc==2.3 # for Markdown → PDF conversion
5
  python-magic==0.4.27 # file‑type detection
 
1
+ gradio>=5.44.0
2
+ marker-pdf[full]>=1.10.0 # pip install marker (GitHub: https://github.com/datalab-to/marker)
3
  weasyprint>=59.0 # optional fallback if pandoc is not available
4
  #pandoc==2.3 # for Markdown → PDF conversion
5
  python-magic==0.4.27 # file‑type detection
ui/gradio_ui.py CHANGED
@@ -33,6 +33,17 @@ pdf2md_converter = PdfToMarkdownConverter()
33
  #html2md_converter = HtmlToMarkdownConverter()
34
  #md2pdf_converter = MarkdownToPdfConverter()
35
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
  def get_login_token( api_token_arg, oauth_token: gr.OAuthToken | None=None,):
38
  """ Use user's supplied token or Get token from logged-in users, else from token stored on the machine. Return token"""
@@ -46,7 +57,8 @@ def get_login_token( api_token_arg, oauth_token: gr.OAuthToken | None=None,):
46
  return oauth_token.token ##token value
47
 
48
  # pool executor to convert files called by Gradio
49
- ##SMY: TODO: future: refactor to gradio_process.py
 
50
  def convert_batch(
51
  pdf_files, #: list[str],
52
  pdf_files_count: int,
 
33
  #html2md_converter = HtmlToMarkdownConverter()
34
  #md2pdf_converter = MarkdownToPdfConverter()
35
 
36
+
37
+ # User eXperience: Load Marker models ahead of time if not already loaded in reload mode
38
+ ## SMY: 29Sept2025 - Came across https://github.com/xiaoyao9184/docker-marker/tree/master/gradio
39
+ from converters.extraction_converter import load_models
40
+ try:
41
+ if 'model_dict' not in globals():
42
+ model_dict = load_models()
43
+ except Exception as exc:
44
+ #tb = traceback.format_exc() #exc.__traceback__
45
+ logger.exception(f"✗ Error loading models (reload): {exc}") #\n{tb}")
46
+ raise RuntimeError(f"✗ Error loading models (reload): {exc}") #\n{tb}")
47
 
48
  def get_login_token( api_token_arg, oauth_token: gr.OAuthToken | None=None,):
49
  """ Use user's supplied token or Get token from logged-in users, else from token stored on the machine. Return token"""
 
57
  return oauth_token.token ##token value
58
 
59
  # pool executor to convert files called by Gradio
60
+ ##SMY: TODO: future: refactor to gradio_process.py and
61
+ ## pull options to cli-options{"output_format":, "output_dir_string":, "use_llm":, "page_range":, "force_ocr":, "debug":, "strip_existing_ocr":, "disable_ocr_math":}
62
  def convert_batch(
63
  pdf_files, #: list[str],
64
  pdf_files_count: int,