Spaces:
Sleeping
Sleeping
baseline08_beta01.5_28Sept25: fix oauth_token.token (convert_batch), log in
Browse files- README.md +2 -7
- converters/extraction_converter.py +1 -1
- llm/llm_login.py +1 -1
- requirements.txt +1 -1
- ui/gradio_ui.py +57 -28
README.md
CHANGED
|
@@ -7,13 +7,14 @@ sdk: gradio
|
|
| 7 |
command: python main.py
|
| 8 |
app_file: main.py
|
| 9 |
hf_oauth: true
|
|
|
|
| 10 |
python_version: 3.12
|
| 11 |
license: mit
|
| 12 |
pinned: true
|
| 13 |
short_description: PDF & HTML parser to markdown
|
| 14 |
models: [meta-llama/Llama-4-Maverick-17B-128E-Instruct, openai/gpt-oss-120b, openai/gpt-oss-20b]
|
| 15 |
tags: [markdown, PDF, parser, converter, extractor]
|
| 16 |
-
|
| 17 |
owner: research-semmyk
|
| 18 |
#---
|
| 19 |
#
|
|
@@ -39,18 +40,12 @@ version: 0.1.0
|
|
| 39 |
readme: README.md
|
| 40 |
requires-python: ">=3.12"
|
| 41 |
#dependencies: []
|
| 42 |
-
#owner: research-semmyk
|
| 43 |
#preload_from_hub:
|
| 44 |
# - https://huggingface.co/datalab-to/surya_layout
|
| 45 |
# - https://huggingface.co/datalab-to/surya_tablerec
|
| 46 |
# - huggingface.co/datalab-to/line_detector0
|
| 47 |
# - https://huggingface.co/tarun-menta/ocr_error_detection/blob/main/config.json
|
| 48 |
#owner: research-semmyk
|
| 49 |
-
#preload_from_hub:
|
| 50 |
-
# - https://huggingface.co/datalab-to/surya_layout
|
| 51 |
-
# - https://huggingface.co/datalab-to/surya_tablerec
|
| 52 |
-
# - huggingface.co/datalab-to/line_detector0
|
| 53 |
-
# - https://huggingface.co/tarun-menta/ocr_error_detection/blob/main/config.json
|
| 54 |
---
|
| 55 |
|
| 56 |
# parserPDF
|
|
|
|
| 7 |
command: python main.py
|
| 8 |
app_file: main.py
|
| 9 |
hf_oauth: true
|
| 10 |
+
oauth_scopes: [read-access]
|
| 11 |
python_version: 3.12
|
| 12 |
license: mit
|
| 13 |
pinned: true
|
| 14 |
short_description: PDF & HTML parser to markdown
|
| 15 |
models: [meta-llama/Llama-4-Maverick-17B-128E-Instruct, openai/gpt-oss-120b, openai/gpt-oss-20b]
|
| 16 |
tags: [markdown, PDF, parser, converter, extractor]
|
| 17 |
+
preload_from_hub: [https://huggingface.co/datalab-to/surya_layout, https://huggingface.co/datalab-to/surya_tablerec, huggingface.co/datalab-to/line_detector0, https://huggingface.co/tarun-menta/ocr_error_detection/blob/main/config.json]
|
| 18 |
owner: research-semmyk
|
| 19 |
#---
|
| 20 |
#
|
|
|
|
| 40 |
readme: README.md
|
| 41 |
requires-python: ">=3.12"
|
| 42 |
#dependencies: []
|
|
|
|
| 43 |
#preload_from_hub:
|
| 44 |
# - https://huggingface.co/datalab-to/surya_layout
|
| 45 |
# - https://huggingface.co/datalab-to/surya_tablerec
|
| 46 |
# - huggingface.co/datalab-to/line_detector0
|
| 47 |
# - https://huggingface.co/tarun-menta/ocr_error_detection/blob/main/config.json
|
| 48 |
#owner: research-semmyk
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
---
|
| 50 |
|
| 51 |
# parserPDF
|
converters/extraction_converter.py
CHANGED
|
@@ -133,7 +133,7 @@ class DocumentConverter:
|
|
| 133 |
llm_service_str = str(self.llm_service).split("'")[1] ## SMY: split and slicing ##Gets the string value
|
| 134 |
|
| 135 |
# sets api_key required by Marker
|
| 136 |
-
os.environ["OPENAI_API_KEY"] = self.openai_api_key
|
| 137 |
logger.log(level=20, msg="self.converter: instantiating MarkerConverter:", extra={"llm_service_str": llm_service_str, "api_token": api_token}) ##debug
|
| 138 |
|
| 139 |
#self.converter: MarkerConverter = MarkerConverter(
|
|
|
|
| 133 |
llm_service_str = str(self.llm_service).split("'")[1] ## SMY: split and slicing ##Gets the string value
|
| 134 |
|
| 135 |
# sets api_key required by Marker
|
| 136 |
+
os.environ["OPENAI_API_KEY"] = api_token if api_token !='' or None else self.openai_api_key ## to handle Marker's assertion test on OpenAI
|
| 137 |
logger.log(level=20, msg="self.converter: instantiating MarkerConverter:", extra={"llm_service_str": llm_service_str, "api_token": api_token}) ##debug
|
| 138 |
|
| 139 |
#self.converter: MarkerConverter = MarkerConverter(
|
llm/llm_login.py
CHANGED
|
@@ -47,7 +47,7 @@ def login_huggingface(token: Optional[str] = None):
|
|
| 47 |
#return True
|
| 48 |
except Exception as exc:
|
| 49 |
# Respect common env var names; prefer explicit token arg when provided
|
| 50 |
-
fallback_token = token or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
| 51 |
if fallback_token:
|
| 52 |
try:
|
| 53 |
login(token=fallback_token)
|
|
|
|
| 47 |
#return True
|
| 48 |
except Exception as exc:
|
| 49 |
# Respect common env var names; prefer explicit token arg when provided
|
| 50 |
+
fallback_token = token if token else get_token() or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
| 51 |
if fallback_token:
|
| 52 |
try:
|
| 53 |
login(token=fallback_token)
|
requirements.txt
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
gradio>=
|
| 2 |
marker-pdf[full]>=1.3.0 # pip install marker (GitHub: https://github.com/datalab-to/marker)
|
| 3 |
weasyprint>=59.0 # optional fallback if pandoc is not available
|
| 4 |
#pandoc==2.3 # for Markdown → PDF conversion
|
|
|
|
| 1 |
+
gradio>=5.40.0
|
| 2 |
marker-pdf[full]>=1.3.0 # pip install marker (GitHub: https://github.com/datalab-to/marker)
|
| 3 |
weasyprint>=59.0 # optional fallback if pandoc is not available
|
| 4 |
#pandoc==2.3 # for Markdown → PDF conversion
|
ui/gradio_ui.py
CHANGED
|
@@ -43,7 +43,7 @@ def get_login_token( api_token_arg, oauth_token: gr.OAuthToken | None=None,):
|
|
| 43 |
oauth_token = oauth_token
|
| 44 |
else: get_token()
|
| 45 |
|
| 46 |
-
return oauth_token
|
| 47 |
|
| 48 |
# pool executor to convert files called by Gradio
|
| 49 |
##SMY: TODO: future: refactor to gradio_process.py
|
|
@@ -74,6 +74,7 @@ def convert_batch(
|
|
| 74 |
use_llm: bool = False, #Optional[bool] = False, #True,
|
| 75 |
page_range: str = None, #Optional[str] = None,
|
| 76 |
tz_hours: str = None,
|
|
|
|
| 77 |
): #-> str:
|
| 78 |
"""
|
| 79 |
Handles the conversion process using multiprocessing.
|
|
@@ -86,7 +87,7 @@ def convert_batch(
|
|
| 86 |
yield gr.update(interactive=False), f"Commencing Processing ... Getting login", {"process": "Commencing Processing"}, f"__init__.py"
|
| 87 |
|
| 88 |
# get token from logged-in user:
|
| 89 |
-
api_token = get_login_token(api_token_gr)
|
| 90 |
##SMY: Strictly debug. Must not be live
|
| 91 |
logger.log(level=30, msg="Commencing: get_login_token", extra={"api_token]": api_token, "api_token_gr": api_token_gr})
|
| 92 |
|
|
@@ -262,8 +263,9 @@ def convert_batch(
|
|
| 262 |
#return "\n".join(logs), "\n".join(logs_files_images) #"\n".join(logs_files)
|
| 263 |
#return logs_return_formatted_json_string, logs_return_formatted_json_string, logs_files_images_return
|
| 264 |
#return gr.update(interactive=True), gr.update(value=logs_return_formatted_json_string), gr.update(value=logs_return_formatted_json_string, visible=True), gr.update(value=logs_files_images_return, visible=True)
|
| 265 |
-
yield gr.update(interactive=True), gr.update(), gr.update(visible=True), gr.update(visible=True)
|
| 266 |
-
yield gr.update(interactive=True), logs_return_formatted_json_string, logs_return_formatted_json_string, logs_files_images_return
|
|
|
|
| 267 |
|
| 268 |
except Exception as exc:
|
| 269 |
tb = traceback.format_exc()
|
|
@@ -450,12 +452,6 @@ def build_interface() -> gr.Blocks:
|
|
| 450 |
|
| 451 |
return updated_files, message
|
| 452 |
|
| 453 |
-
def clear_state():
|
| 454 |
-
"""
|
| 455 |
-
Clears the accumulated state of uloaded file list, output textbox, files and directory upload.
|
| 456 |
-
"""
|
| 457 |
-
return [], "Files list cleared.", [], []
|
| 458 |
-
|
| 459 |
# with gr.Blocks(title=TITLE) as demo
|
| 460 |
with gr.Blocks(title=TITLE, css=custom_css) as demo:
|
| 461 |
gr.Markdown(f"## {DESCRIPTION}")
|
|
@@ -584,11 +580,12 @@ def build_interface() -> gr.Blocks:
|
|
| 584 |
|
| 585 |
with gr.Accordion("🤗 HuggingFace Client Logout", open=True): #, open=False):
|
| 586 |
# Logout controls
|
| 587 |
-
|
| 588 |
-
logout_status = gr.Markdown(visible=True) #visible=False)
|
| 589 |
with gr.Row():
|
| 590 |
-
hf_login_logout_btn = gr.LoginButton(value="Sign in to HuggingFace 🤗", logout_value="Clear Session & Logout of HF: ({})", variant="huggingface")
|
| 591 |
-
|
|
|
|
|
|
|
|
|
|
| 592 |
|
| 593 |
# The gr.State component to hold the accumulated list of files
|
| 594 |
uploaded_file_list = gr.State([]) ##NB: initial value of `gr.State` must be able to be deepcopied
|
|
@@ -759,7 +756,35 @@ def build_interface() -> gr.Blocks:
|
|
| 759 |
)
|
| 760 |
hf_provider_dd.change(on_provider_change, inputs=hf_provider_dd, outputs=None)
|
| 761 |
|
|
|
|
| 762 |
# HuggingFace Client Logout
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 763 |
def do_logout_hf():
|
| 764 |
try:
|
| 765 |
ok = docconverter.client.logout()
|
|
@@ -772,18 +797,9 @@ def build_interface() -> gr.Blocks:
|
|
| 772 |
msg = "⚠️ Logout. No HF session"
|
| 773 |
return msg
|
| 774 |
#yield msg ## generator for string
|
| 775 |
-
'''def get_login_token(state_api_token_arg, oauth_token: gr.OAuthToken | None=None):
|
| 776 |
-
#oauth_token = get_token() if oauth_token is not None else state_api_token
|
| 777 |
-
#oauth_token = oauth_token if oauth_token else state_api_token_arg
|
| 778 |
-
if oauth_token:
|
| 779 |
-
print(oauth_token)
|
| 780 |
-
return oauth_token
|
| 781 |
-
else:
|
| 782 |
-
oauth_token = get_token()
|
| 783 |
-
print(oauth_token)
|
| 784 |
-
return oauth_token'''
|
| 785 |
|
| 786 |
-
def custom_do_logout(hf_login_logout_btn_arg: gr.LoginButton, state_api_token_arg: gr.State):
|
|
|
|
| 787 |
#global state_api_token
|
| 788 |
''' ##SMY: TO DELETE
|
| 789 |
try:
|
|
@@ -797,12 +813,25 @@ def build_interface() -> gr.Blocks:
|
|
| 797 |
msg = do_logout_hf()
|
| 798 |
##debug
|
| 799 |
#msg = "✅ Session Cleared. Remember to close browser." if "Clear Session & Logout of HF" in hf_login_logout_btn else "⚠️ Logout" # & Session Cleared"
|
| 800 |
-
return gr.update(value="Sign in to HuggingFace 🤗"), gr.update(value=""), gr.update(visible=True, value=msg)
|
| 801 |
#yield gr.update(value="Sign in to HuggingFace 🤗"), gr.update(value=""), gr.update(visible=True, value=msg)
|
| 802 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 803 |
#hf_login_logout_btn.click(fn=custom_do_logout, inputs=None, outputs=hf_login_logout_btn)
|
| 804 |
-
|
| 805 |
-
|
|
|
|
|
|
|
|
|
|
| 806 |
|
| 807 |
# --- PDF & HTML → Markdown tab ---
|
| 808 |
# Event handler for the multiple file upload button
|
|
|
|
| 43 |
oauth_token = oauth_token
|
| 44 |
else: get_token()
|
| 45 |
|
| 46 |
+
return oauth_token.token ##token value
|
| 47 |
|
| 48 |
# pool executor to convert files called by Gradio
|
| 49 |
##SMY: TODO: future: refactor to gradio_process.py
|
|
|
|
| 74 |
use_llm: bool = False, #Optional[bool] = False, #True,
|
| 75 |
page_range: str = None, #Optional[str] = None,
|
| 76 |
tz_hours: str = None,
|
| 77 |
+
oauth_token: gr.OAuthToken | None=None,
|
| 78 |
): #-> str:
|
| 79 |
"""
|
| 80 |
Handles the conversion process using multiprocessing.
|
|
|
|
| 87 |
yield gr.update(interactive=False), f"Commencing Processing ... Getting login", {"process": "Commencing Processing"}, f"__init__.py"
|
| 88 |
|
| 89 |
# get token from logged-in user:
|
| 90 |
+
api_token = get_login_token(api_token_arg=api_token_gr, oauth_token=oauth_token)
|
| 91 |
##SMY: Strictly debug. Must not be live
|
| 92 |
logger.log(level=30, msg="Commencing: get_login_token", extra={"api_token]": api_token, "api_token_gr": api_token_gr})
|
| 93 |
|
|
|
|
| 263 |
#return "\n".join(logs), "\n".join(logs_files_images) #"\n".join(logs_files)
|
| 264 |
#return logs_return_formatted_json_string, logs_return_formatted_json_string, logs_files_images_return
|
| 265 |
#return gr.update(interactive=True), gr.update(value=logs_return_formatted_json_string), gr.update(value=logs_return_formatted_json_string, visible=True), gr.update(value=logs_files_images_return, visible=True)
|
| 266 |
+
#yield gr.update(interactive=True), gr.update(), gr.update(visible=True), gr.update(visible=True)
|
| 267 |
+
#yield gr.update(interactive=True), logs_return_formatted_json_string, logs_return_formatted_json_string, logs_files_images_return
|
| 268 |
+
return gr.update(interactive=True), logs_return_formatted_json_string, logs_return_formatted_json_string, logs_files_images_return
|
| 269 |
|
| 270 |
except Exception as exc:
|
| 271 |
tb = traceback.format_exc()
|
|
|
|
| 452 |
|
| 453 |
return updated_files, message
|
| 454 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 455 |
# with gr.Blocks(title=TITLE) as demo
|
| 456 |
with gr.Blocks(title=TITLE, css=custom_css) as demo:
|
| 457 |
gr.Markdown(f"## {DESCRIPTION}")
|
|
|
|
| 580 |
|
| 581 |
with gr.Accordion("🤗 HuggingFace Client Logout", open=True): #, open=False):
|
| 582 |
# Logout controls
|
|
|
|
|
|
|
| 583 |
with gr.Row():
|
| 584 |
+
#hf_login_logout_btn = gr.LoginButton(value="Sign in to HuggingFace 🤗", logout_value="Clear Session & Logout of HF: ({})", variant="huggingface")
|
| 585 |
+
hf_login_logout_btn = gr.LoginButton(value="Sign in to HuggingFace 🤗", logout_value="Logout of HF: ({}) 🤗", variant="huggingface")
|
| 586 |
+
#logout_btn = gr.Button("Logout from session & HF (inference) Client", variant="stop", )
|
| 587 |
+
|
| 588 |
+
logout_status_md = gr.Markdown(visible=True) #visible=False)
|
| 589 |
|
| 590 |
# The gr.State component to hold the accumulated list of files
|
| 591 |
uploaded_file_list = gr.State([]) ##NB: initial value of `gr.State` must be able to be deepcopied
|
|
|
|
| 756 |
)
|
| 757 |
hf_provider_dd.change(on_provider_change, inputs=hf_provider_dd, outputs=None)
|
| 758 |
|
| 759 |
+
|
| 760 |
# HuggingFace Client Logout
|
| 761 |
+
'''def get_login_token(state_api_token_arg, oauth_token: gr.OAuthToken | None=None):
|
| 762 |
+
#oauth_token = get_token() if oauth_token is not None else state_api_token
|
| 763 |
+
#oauth_token = oauth_token if oauth_token else state_api_token_arg
|
| 764 |
+
if oauth_token:
|
| 765 |
+
print(oauth_token)
|
| 766 |
+
return oauth_token
|
| 767 |
+
else:
|
| 768 |
+
oauth_token = get_token()
|
| 769 |
+
print(oauth_token)
|
| 770 |
+
return oauth_token'''
|
| 771 |
+
#'''
|
| 772 |
+
def do_logout(): ##SMY: use with clear_state() as needed
|
| 773 |
+
try:
|
| 774 |
+
#ok = docextractor.client.logout()
|
| 775 |
+
ok = docconverter.client.logout()
|
| 776 |
+
# Reset token textbox on successful logout
|
| 777 |
+
#msg = "✅ Logged out of HuggingFace and cleared tokens. Remember to log out of HuggingFace completely." if ok else "⚠️ Logout failed."
|
| 778 |
+
msg = "✅ Session Cleared. Remember to close browser." if ok else "⚠️ HF client closing failed."
|
| 779 |
+
|
| 780 |
+
return msg
|
| 781 |
+
#return gr.update(value=""), gr.update(visible=True, value=msg), gr.update(value="Sign in to HuggingFace 🤗"), gr.update(value="Clear session")
|
| 782 |
+
except AttributeError:
|
| 783 |
+
msg = "⚠️ HF client closing failed."
|
| 784 |
+
|
| 785 |
+
return msg
|
| 786 |
+
#return gr.update(value=""), gr.update(visible=True, value=msg), gr.update(value="Sign in to HuggingFace 🤗"), gr.update(value="Clear session", interactive=False)
|
| 787 |
+
#'''
|
| 788 |
def do_logout_hf():
|
| 789 |
try:
|
| 790 |
ok = docconverter.client.logout()
|
|
|
|
| 797 |
msg = "⚠️ Logout. No HF session"
|
| 798 |
return msg
|
| 799 |
#yield msg ## generator for string
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 800 |
|
| 801 |
+
#def custom_do_logout(hf_login_logout_btn_arg: gr.LoginButton, state_api_token_arg: gr.State):
|
| 802 |
+
def custom_do_logout():
|
| 803 |
#global state_api_token
|
| 804 |
''' ##SMY: TO DELETE
|
| 805 |
try:
|
|
|
|
| 813 |
msg = do_logout_hf()
|
| 814 |
##debug
|
| 815 |
#msg = "✅ Session Cleared. Remember to close browser." if "Clear Session & Logout of HF" in hf_login_logout_btn else "⚠️ Logout" # & Session Cleared"
|
| 816 |
+
return gr.update(value="Sign in to HuggingFace 🤗"), gr.update(value=""), gr.update(visible=True, value=msg) #, state_api_token_arg
|
| 817 |
#yield gr.update(value="Sign in to HuggingFace 🤗"), gr.update(value=""), gr.update(visible=True, value=msg)
|
| 818 |
|
| 819 |
+
# Files, status, session clearing
|
| 820 |
+
def clear_state():
|
| 821 |
+
"""
|
| 822 |
+
Clears the accumulated state of uploaded file list, output textbox, files and directory upload.
|
| 823 |
+
"""
|
| 824 |
+
#msg = f"Files list cleared: {do_logout()}" ## use as needed
|
| 825 |
+
msg = f"Files list cleared."
|
| 826 |
+
yield [], msg, '', ''
|
| 827 |
+
#return [], f"Files list cleared.", [], []
|
| 828 |
+
|
| 829 |
#hf_login_logout_btn.click(fn=custom_do_logout, inputs=None, outputs=hf_login_logout_btn)
|
| 830 |
+
##unused
|
| 831 |
+
###hf_login_logout_btn.click(fn=custom_do_logout, inputs=[hf_login_logout_btn, state_api_token], outputs=[hf_login_logout_btn, api_token_tb, logout_status_md, state_api_token])
|
| 832 |
+
###logout_btn.click(fn=do_logout, inputs=None, outputs=[api_token_tb, logout_status_md, hf_login_logout_btn, logout_btn])
|
| 833 |
+
#logout_btn.click(fn=clear_state, inputs=None, outputs=[uploaded_file_list, output_textbox, log_output, api_token_tb])
|
| 834 |
+
hf_login_logout_btn.click(fn=custom_do_logout, inputs=None, outputs=[hf_login_logout_btn, api_token_tb, logout_status_md]) #, state_api_token])
|
| 835 |
|
| 836 |
# --- PDF & HTML → Markdown tab ---
|
| 837 |
# Event handler for the multiple file upload button
|