baseline08_beta01_27Sept25: zipped, yield output, log in

Changed files:
- file_handler/file_utils.py +151 -32
- llm/hf_client.py +3 -3
- llm/llm_login.py +15 -6
- llm/openai_client.py +8 -6
- ui/gradio_ui.py +149 -89
- utils/get_config.py +1 -1
- utils/logger.py +2 -2
- utils/utils.py +24 -1
file_handler/file_utils.py (CHANGED)

@@ -2,6 +2,9 @@
 #import os
 from pathlib import Path
 import sys
+import shutil
+import tempfile
+
 from itertools import chain
 from typing import List, Union, Any, Mapping
 from PIL import Image

@@ -95,8 +98,8 @@ def resolve_grandparent_object(gp_object:str):
         current_path = Path(sys.argv[0]).resolve()
     except IndexError:
         # Handle cases where sys.argv[0] might not exist (e.g., in some IDEs)
-
-        current_path = Path('.').resolve()
+        current_path = Path(__file__).resolve()
+        #current_path = Path('.').resolve() ##unreliable
 
 parent_dir = current_path.parent
 grandparent_dir = current_path.parent.parent

@@ -109,7 +112,7 @@ def resolve_grandparent_object(gp_object:str):
 #print(f"resolve: sys.path[1]: {sys.path[1]}") ##debug
 
 
-def check_create_logfile(filename: str, dir_path: Union[str, Path]="logs") -> Path:
+def check_create_logfile(filename: str, dir_path: Union[str, Path]="logs", tz_hours=None, date_format="%Y-%m-%d") -> Path:
     """
     check if log file exists, else create one and return the file path.
 

@@ -123,6 +126,7 @@ def check_create_logfile(filename: str, dir_path: Union[str, Path]="logs") -> Path:
     import datetime
     import warnings
     import tempfile
+    from utils.utils import get_time_now_str
 
     # 1. Get the path of the current script's parent directory (the project folder).
     # `__file__` is a special variable that holds the path to the current script.

@@ -151,7 +155,8 @@ def check_create_logfile(filename: str, dir_path: Union[str, Path]="logs") -> Path:
 
     # 4. Create log file with a timestamp inside the new logs directory.
     # This ensures a unique log file is created for the day the script runs.
-    timestamp = datetime.datetime.now().strftime("%Y-%m-%d") #.strftime("%Y-%m-%d_%H-%M-%S")
+    #timestamp = datetime.datetime.now().strftime("%Y-%m-%d") #.strftime("%Y-%m-%d_%H-%M-%S")
+    timestamp = get_time_now_str(tz_hours=tz_hours, date_format="%Y-%m-%d")
     log_file = logs_dir / f"{Path(filename).stem}_{timestamp}.log"
 
     # 5. Check if the file exists (it won't, if it's not the same day).

@@ -171,8 +176,33 @@ resolve_grandparent_object("file_handler")
 print(f'file: {check_create_logfile("app_logging.log")}')
 '''
 
-##SMY: to revisit. Make generic for any file apart from log files
-def check_create_file(filename: str, dir_path: Union[str, Path]="logs") -> Path:
+##SMY:DONE - to revisit. Make generic for any file apart from log files
+def check_create_dir(dir_name: Union[str, Path]) -> Path:
+    """
+    check if directory exists, else create one and return the directory path.
+
+    Args:
+        directory_path (str): The path to the directory.
+        filename (str): The name of the directory to check/create.
+    Returns:
+        The pathlib.Path object for the directory
+    """
+
+    import warnings
+
+    try:
+        dir_path = Path(dir_name)
+        #if dir_path.is_dir():
+        #    dir_path.mkdir(parents=True, exist_ok=True) #, mode=0o2755)
+        dir_path.mkdir(parents=True, exist_ok=True) #, mode=0o2755)
+    except PermissionError: ##[Errno 13] Permission denied: '/home/user/app/logs/app_logging_2025-09-18.log'
+        warnings.warn("[Errno 13] Permission denied, possibly insufficient permission or Persistent Storage not enable: attempting chmod 0o2644")
+        dir_path.mkdir(mode=0o2755, parents=True, exist_ok=True)
+        dir_path.chmod(0o2755)
+
+    return dir_path
+
+def check_create_file(filename: Union[str, Path]) -> Path:
     """
     check if File exists, else create one and return the file path.
 

@@ -182,35 +212,124 @@ def check_create_file(filename: str, dir_path: Union[str, Path]="logs") -> Path:
     Returns:
         The pathlib.Path object for the file
     """
-
-    # `exist_ok=True` prevents an error if the directory already exists.
-    dir_path = project_root / dir_path
-    if not dir_path.is_dir():
-        dir_path.mkdir(parents=True, exist_ok=True, mode=0o2755) #, mode=0o2644)
-        #dir_path.chmod(0)
-
-    #'''
-    if not file_path.exists(): # check if file doesn't exist
-        file_path.touch(exist_ok=True) #, mode=0o2664) # Creates an empty file if it doesn't exists
-        #file_dir.touch(mode=0o2644, exist_ok=True) #, parents=True) ##SMY: Note Permission Errno13 - https://stackoverflow.com/a/57454275
-        #file_dir.chmod(0)
-    #'''
-
+
+    import warnings
+
+    try:
+        filename_path = Path(filename)
+        filename_path.touch(exist_ok=True) #, mode=0o2755)
+    except PermissionError: ##[Errno 13] Permission denied: '/home/user/app/logs/app_logging_2025-09-18.log'
+        warnings.warn("[Errno 13] Permission denied, possibly insufficient permission or Persistent Storage not enable: attempting chmod 0o2644")
+        filename_path.touch(exist_ok=True, mode=0o2755) # Creates an empty file if it doesn't exists
+        filename_path.chmod(0)
 
+    return filename_path
+
+def zip_processed_files(root_dir: str, file_paths: list[str], tz_hours=None, date_format='%d%b%Y_%H-%M-%S') -> Path:
+    """
+    Creates a zip file from a list of file paths (strings) and returns the Path object.
+    It preserves the directory structure relative to the specified root directory.
+
+    Args:
+        root_dir (str): The root directory against which relative paths are calculated.
+        file_paths (list[str]): A list of string paths to the files to be zipped.
+
+    Returns:
+        str(Path): The string of the Path object of the newly created zip file.
+    """
+
+    import zipfile
+    from file_handler import file_utils
+    from utils import utils
+
+    root_path = Path(root_dir)
+    if not root_path.is_dir():
+        raise ValueError(f"Root directory does not exist: {root_path}")
+
+    # Create a temporary directory in a location where Gradio can access it.
+    gradio_output_dir = Path(tempfile.gettempdir()) / "gradio_temp_output"
+    #gradio_output_dir.mkdir(exist_ok=True)
+    file_utils.check_create_dir(gradio_output_dir)
+    final_zip_path = gradio_output_dir / f"outputs_processed_{utils.get_time_now_str(tz_hours=tz_hours, date_format=date_format)}.zip"
+
+    # Use a context manager to create the zip file: use zipfile() opposed to shutil.make_archive
+    # 'w' mode creates a new file, overwriting if it already exists.
+    zip_unprocessed = 0
+    with zipfile.ZipFile(final_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
+        for file_path_str in file_paths:
+            file_path = Path(file_path_str)
+            if file_path.exists() and file_path.is_file():
+                # Calculate the relative path from the root_dir.
+                # The `arcname` parameter tells `zipfile` what the path inside the zip file should be.
+                arcname = file_path.relative_to(root_path)
+                zipf.write(file_path, arcname=arcname)
+            else:
+                #print(f"Warning: Skipping {file_path_str}, as it is not a valid file.")
+                zip_processed_files += 1 ##SMY:future - to be implemented
+
+    #return final_zip_path
+    return str(final_zip_path)
+
+
+def process_and_zip(input_dir_path):
+    """
+    Finds dynamic directories, copies files from a source directory to a temporary directory, zips it,
+    and returns the path to the zip file.
+
+    Args:
+        input_dir_path (str): The path to the directory containing files to be processed.
+
+    Returns:
+        pathlib.Path: The path to the generated zip file.
+    """
+    # Convert the input path to a Path object
+    #input_path = Path(input_dir_path)
+    parent_input_path = Path(input_dir_path) #.parent
+
+    # Check if the input directory exists
+    if not parent_input_path.is_dir():
+        raise ValueError(f"Input directory does not exist: {parent_input_path}")
+
+    # Create a temporary directory using a context manager
+    with tempfile.TemporaryDirectory() as temp_dir_str:
+        temp_dir_path = Path(temp_dir_str)
+
+        # Define the path for the output structure inside the temporary directory
+        temp_output_path = temp_dir_path / "output_dir"
+
+        # Copy all extracted files to the temporary directory
+        # We use semantic accurate and performant .iterdir than more robust glob to get all files and folders
+
+        for input_subdir in parent_input_path.iterdir():
+            if input_subdir.is_dir():
+                # Create the corresponding subdirectory in the temp directory
+                temp_output_subdir = temp_output_path / input_subdir.name
+                #temp_output_subdir.mkdir(parents=True, exist_ok=True) #, mode=0o2755)
+                #file_handler.file_utils.check_create_dir(temp_output_subdir)
+                check_create_dir(temp_output_subdir)
+
+                # Copy the files from the source subdirectory to the temp subdirectory
+                #for item_path in input_path.glob('*'):
+                for item_path in input_subdir.iterdir():
+                    if item_path.is_dir():
+                        shutil.copytree(src=item_path, dst=temp_output_subdir / item_path.name)
+                    else:
+                        shutil.copy2(item_path, temp_output_subdir)
+
+        # Create the zip file from the temporary directory
+        zip_base_name = temp_dir_path / "outputs_processed_files"
+        zip_file_path = shutil.make_archive(
+            base_name=str(zip_base_name), ##zip file's name
+            format='zip',
+            root_dir=str(temp_output_path) #(temp_dir_path) ##exclude from the archive
+        )
+        # Manually move the completed zip file to the Gradio-managed temporary directory
+        final_zip_file_path = parent_input_path / Path(zip_file_path).name
+        shutil.move(src=zip_file_path, dst=final_zip_file_path)
+
+        # The shutil function returns a string, so we convert it back to a Path object in gr.File
+        return str(final_zip_file_path)
+
 
 def is_file_with_extension(path_obj: Path) -> bool:
     """
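Two notes on the new helpers above. `zip_processed_files` relies on `zipfile`'s `arcname` parameter to keep archive entries relative to `root_dir`. Its `else` branch, however, increments the function's own name (`zip_processed_files += 1`); inside the function body Python treats that name as an unbound local, so the first skipped path would raise `UnboundLocalError`, and the `zip_unprocessed` counter defined just above the loop looks like the intended target. A minimal standalone sketch of the same `arcname` technique with the counter corrected (helper and variable names here are illustrative, not the repo's API):

import zipfile
from pathlib import Path

def zip_relative(root_dir: str, file_paths: list[str], zip_path: str) -> str:
    root = Path(root_dir)
    skipped = 0  # the counter the commit presumably meant to increment
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for p in map(Path, file_paths):
            if p.is_file():
                # arcname sets the entry's path inside the archive, relative to root
                zf.write(p, arcname=p.relative_to(root))
            else:
                skipped += 1
    return zip_path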
llm/hf_client.py (CHANGED)

@@ -6,7 +6,7 @@ import time
 import traceback
 from huggingface_hub import InferenceClient, login, logout as hf_logout
 
-from llm.llm_login import login_huggingface, is_login_huggingface
+from llm.llm_login import login_huggingface, is_loggedin_huggingface #,is_login_huggingface
 
 from utils.logger import get_logger
 

@@ -101,9 +101,9 @@ class HFChatClient:
         #pass
         '''
 
-        login_huggingface(self.token) if not
+        login_huggingface(self.token) if not is_loggedin_huggingface() else logger.log(level=20, msg=f"You are logged in to HF Hub already") ## attempt login if not already logged in. NB: HF CLI login prompt would not display in Process Worker.
         ##SMY: TODO: Mapped with openai_client.py
-        #self.islogged_in =
+        #self.islogged_in = is_loggedin_huggingface()
 
     @staticmethod
     def _normalise_history(history: list, system_message: str, latest_user_message: str) -> list[dict]:
llm/llm_login.py (CHANGED)

@@ -1,4 +1,4 @@
-from huggingface_hub import HfApi, login, logout, get_token
+from huggingface_hub import HfApi, login, logout, get_token, whoami
 import os
 import traceback
 from time import sleep

@@ -9,6 +9,11 @@ from utils.logger import get_logger
 ## Get logger instance
 logger = get_logger(__name__)
 
+def disable_immplicit_token():
+    # Disable implicit token propagation for determinism
+    # Explicitly disable implicit token propagation; we rely on explicit auth or env var
+    os.environ["HF_HUB_DISABLE_IMPLICIT_TOKEN"] = "1"
+
 def login_huggingface(token: Optional[str] = None):
     """
     Login to Hugging Face account. Prioritize CLI login for privacy and determinism.

@@ -25,13 +30,15 @@ def login_huggingface(token: Optional[str] = None):
 
     # Disable implicit token propagation for determinism
     # Explicitly disable implicit token propagation; we rely on explicit auth or env var
-    os.environ["HF_HUB_DISABLE_IMPLICIT_TOKEN"] = "1"
+    #os.environ["HF_HUB_DISABLE_IMPLICIT_TOKEN"] = "1"
+    disable_immplicit_token()
 
     token = token
     # Privacy-first login: try interactive CLI first; fallback to provided/env token only if needed
     try:
-        if HfApi.whoami():
-
+        #if HfApi.whoami(): ##SMY requires 'self' = HfApi. Alternatively HfApi().whoami()
+        if whoami(): ##SMY: Call HF API to know "whoami".
+            logger.info("✔️ hf_login already", extra={"mode": "HF Oauth"})
             #return True
         else:
             login()

@@ -53,13 +60,15 @@ def login_huggingface(token: Optional[str] = None):
         # Silent fallback; client will still work if token is passed directly
         #pass
 
-def is_login_huggingface():
-
+#def is_login_huggingface():
+def is_loggedin_huggingface():
+    #from huggingface_hub import HfApi
     from huggingface_hub.utils import HfHubHTTPError
 
     try:
         HfApi().whoami()
         logger.log(level=20, msg=("✔️ You are logged in."), extra={"is_logged_in": True})
+        disable_immplicit_token()
         return True
     except HfHubHTTPError as exc:
         # A 401 status code indicates an authentication error.
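The renamed `is_loggedin_huggingface` leans on the fact that a `whoami()` call fails when no valid token is available, and the module-level `whoami` import also sidesteps the `HfApi.whoami` unbound-method mistake the commit's own comment points out. A minimal sketch of that check in isolation; `LocalTokenNotFoundError` is an assumption about how newer `huggingface_hub` versions signal a missing cached token:

from huggingface_hub import whoami
from huggingface_hub.utils import HfHubHTTPError, LocalTokenNotFoundError

def hf_logged_in() -> bool:
    # whoami() raises when the token is missing or invalid, so success means logged in
    try:
        whoami()
        return True
    except (HfHubHTTPError, LocalTokenNotFoundError):
        return False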
llm/openai_client.py (CHANGED)

@@ -7,7 +7,7 @@ from typing import Optional #Iterable, Literal
 import traceback
 #from huggingface_hub import InferenceClient, login, logout as hf_logout
 
-from llm.llm_login import login_huggingface, is_login_huggingface
+from llm.llm_login import login_huggingface, is_loggedin_huggingface #, is_login_huggingface
 
 import dotenv
 #dotenv.load_dotenv(".env")

@@ -42,15 +42,17 @@ class OpenAIChatClient:
         self.model_id = f"{model_id}:{hf_provider}" if hf_provider is not None else model_id ##concatenate so HF can pipe to Hf provider
         self.hf_provider = hf_provider
         self.base_url = base_url #"https://router.huggingface.co/v1" #%22" #HF API proxy
-
-        self.token = openai_api_key_env if openai_api_key_env else api_token #dotenv.get_key(".env", "OPENAI_API_KEY")
+        self.token = api_token if api_token else openai_api_key_env ##None ##debug
+        #self.token = openai_api_key_env if openai_api_key_env else api_token #dotenv.get_key(".env", "OPENAI_API_KEY")
         #self.token = token or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN") ## not preferred
-        login_huggingface(self.token) if not is_login_huggingface() else logger.log(level=20, msg=f"You are logged in to HF Hub already") ## attempt login if not already logged in. NB: HF CLI login prompt would not display in Process Worker.
         #self.fake_token = api_token or "a1b2c3" #or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
+
         self.openai_api_key = self.token #self.fake_token
         self.temperature = temperature
         self.top_p = top_p
-        self.islogged_in =
+        self.islogged_in = is_loggedin_huggingface()
+        ##SMY: log in now handled at higher entry level.
+        #login_huggingface(self.token) if not is_loggedin_huggingface() else logger.log(level=20, msg=f"You are logged in to HF Hub already") ## attempt login if not already logged in. NB: HF CLI login prompt would not display in Process Worker.
 
         logger.log(level=2, msg="initialised OpenAIChatClient:", extra={"base_url": self.base_url, "openai_api_key": self.openai_api_key})
 

@@ -60,7 +62,7 @@ class OpenAIChatClient:
             logger.exception(f'❌ OpenAI client_init_failed", extra={"error": str(exc)}\n{tb}', exc_info=True)
             raise RuntimeError(f"❌ Failed to initialise OpenAI client: {exc}\n{tb}")
 
-        #login_huggingface(self.token) if not
+        #login_huggingface(self.token) if not is_loggedin_huggingface() else logger.log(level=20, msg=f"logged in to HF Hub already") ## attempt login if not already logged in. NB: HF CLI login prompt would not display in Process Worker.
 
     ####IN PROGRESS
     #
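The token line flips precedence: an explicitly passed `api_token` now wins, where the old line preferred the environment-derived key. Reduced to its essentials (the `OPENAI_API_KEY` name is an assumption taken from the commented-out `dotenv.get_key` call):

import os

def resolve_token(api_token=None):
    # New precedence: explicit argument first, environment-derived key as fallback
    return api_token if api_token else os.getenv("OPENAI_API_KEY")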
ui/gradio_ui.py (CHANGED)

@@ -11,10 +11,10 @@ import file_handler
 import file_handler.file_utils
 from utils.config import TITLE, DESCRIPTION, DESCRIPTION_PDF_HTML, DESCRIPTION_PDF, DESCRIPTION_HTML, DESCRIPTION_MD
 from utils.utils import is_dict, is_list_of_dicts
-from file_handler.file_utils import process_dicts_data, collect_pdf_paths, collect_html_paths, collect_markdown_paths, create_outputdir ## should move to handling file
+from file_handler.file_utils import zip_processed_files, process_dicts_data, collect_pdf_paths, collect_html_paths, collect_markdown_paths, create_outputdir ## should move to handling file
 #from llm.hf_client import HFChatClient ## SMY: unused. See converters.extraction_converter
 from llm.provider_validator import is_valid_provider, suggest_providers
-from llm.llm_login import login_huggingface
+from llm.llm_login import is_loggedin_huggingface, login_huggingface
 
 from converters.extraction_converter import DocumentConverter as docconverter #DocumentExtractor #as docextractor
 from converters.pdf_to_md import PdfToMarkdownConverter, init_worker

@@ -34,6 +34,7 @@ pdf2md_converter = PdfToMarkdownConverter()
 #md2pdf_converter = MarkdownToPdfConverter()
 
 # pool executor to convert files called by Gradio
+##SMY: TODO: future: refactor to gradio_process.py
 def convert_batch(
     pdf_files, #: list[str],
     pdf_files_count: int,

@@ -59,8 +60,9 @@ def convert_batch(
     #output_dir: Optional[Union[str, Path]] = "output_dir",
     output_dir_string: str = "output_dir_default",
     use_llm: bool = False, #Optional[bool] = False, #True,
-    page_range: str = None, #Optional[str] = None,
-    ): #-> str:
+    page_range: str = None, #Optional[str] = None,
+    tz_hours: str = None,
+    ): #-> str:
     """
     Handles the conversion process using multiprocessing.
     Spins up a pool and converts all uploaded files in parallel.

@@ -71,13 +73,18 @@ def convert_batch(
     # explicitly wrap file object in a list
     #pdf_files = pdf_files_wrap(pdf_files) ##Flag: deprecation
 
+    # Update the Gradio UI to improve user-friendly eXperience
+    #outputs=[process_button, log_output, files_individual_JSON, files_individual_downloads],
+    yield gr.update(interactive=False), f"Processing files...", {"process": "Processing files"}, f"__init__.py"
+
     ## debug
     #logger.log(level=30, msg="pdf_files_inputs", extra={"input_arg[0]:": pdf_files[0]})
 
     #if not files:
     if not pdf_files or pdf_files is None: ## Check if files is None. This handles the case where no files are uploaded.
         logger.log(level=30, msg="Initialising ProcessPool: No files uploaded.", extra={"pdf_files": pdf_files, "files_len": pdf_files_count})
-
+        #outputs=[log_output, files_individual_JSON, files_individual_downloads],
+        return gr.update(interactive=True), "Initialising ProcessPool: No files uploaded.", {"Upload":"No files uploaded"}, f"__init__.py"
 
     # Get config values if not provided
     config_file = find_file("config.ini") ##from file_handler.file_utils

@@ -117,11 +124,20 @@ def convert_batch(
 
     #global docextractor ##SMY: deprecated.
     try:
+        ##SMY: might deprecate. To replace with oauth login from Gradio ui or integrate cleanly.
         login_huggingface(api_token) ## attempt login if not already logged in. NB: HF CLI login prompt would not display in Process Worker.
+
+        if is_loggedin_huggingface() and (api_token is None or api_token == ""):
+            api_token = get_token()
+        else:
+            login_huggingface()
+        # login: Update the Gradio UI to improve user-friendly eXperience
+        yield gr.update(interactive=False), f"login to HF: Processing files...", {"process": "Processing files"}, f"__init__.py"
+
     except Exception as exc: # Catch all exceptions
         tb = traceback.format_exc()
         logger.exception(f"❌ Error during login_huggingface → {exc}\n{tb}", exc_info=True) # Log the full traceback
-        return f"❌ An error occurred during login_huggingface → {exc}\n{tb}", f"Error: {exc}", f"
+        return gr.update(interactive=True), f"❌ An error occurred during login_huggingface → {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"__init__.py" # return the exception message
 
     try:
         # Create a pool with init_worker initialiser

@@ -146,38 +162,48 @@ def convert_batch(
             #result_convert = pool.map(pdf2md_converter.convert_files, pdf_files, max_retries)
             results = pool.map(pdf2md_converter.convert_files, pdf_files) ##SMY iterables #output_dir_string)
         except Exception as exc:
-            # Raise the exception to stop the Gradio app
-            #raise # Re-raise the exception to halt execution
+            # Raise the exception to stop the Gradio app: exception to halt execution
             logger.exception("Error during pooling file conversion", exc_info=True) # Log the full traceback
             traceback.print_exc() # Print the exception traceback
-            return f"An error occurred during pool.map: {str(exc)}", f"Error: {exc}", f"Error: {exc}" ## return the exception message
+            #return f"An error occurred during pool.map: {str(exc)}", f"Error: {exc}", f"Error: {exc}" ## return the exception message
+            yield gr.update(interactive=True), f"An error occurred during pool.map: {str(exc)}", {"Error":f"Error: {exc}"}, f"__init__.py" ## return the exception message
 
         #'''
-
-        #logs_files_images = logs_files.extend(logs_images) #zip(logs_files, logs_images) ##SMY: in progress
-        for log in logs:
-            #logs_files_images.append(log.get("filepath", "Error or No filepath")) # if all(isinstance(log, dict) for item in logs))
-            #logs_files_images.extend(list(image for image in log.get("image_path", "Error or no image_path")))
-
-            logs_files_images.append(log.get("filepath") if is_dict(logs) or isinstance(log, Path) else "Error or no image_path") # isinstance(log, (dict, str))
-            logs_files_images.extend(list(image for image in log.get("image_path", "Error or no image_path")))
-
-        #logs_files_images.append(logs_filepath) ## to del
-        #logs_files_images.extend(logs_images) ## to del
+        try:
+            logger.log(level=20, msg="ProcessPoolExecutor pool result:", extra={"results": str(results)})
+            logs = []
+            logs_files_images = []
+            #logs.extend(results) ## performant pythonic
+            #logs = list[results] ##
+            logs = [result for result in results] ## pythonic list comprehension
+            ## logs : [file , images , filepath, image_path]
+
+            #logs_files_images = logs_files.extend(logs_images) #zip(logs_files, logs_images) ##SMY: in progress
+            logs_count = 0
+            #for log in logs:
+            for i, log in enumerate(logs):
+                logs_files_images.append(log.get("filepath") if is_dict(log) or is_list_of_dicts(logs) else "Error or no file_path") # isinstance(log, (dict, str))
+                logs_files_images.extend(list(image for image in log.get("image_path", "Error or no image_path")))
+                i_image = log.get("images", 0)
+                # Update the Gradio UI to improve user-friendly eXperience
+                yield gr.update(interactive=False), f"Processing files: {logs_files_images[logs_count]}", {"process": "Processing files"}, f"__init__.py"
+                logs_count = i+i_image
+
+            #logs_files_images.append(logs_filepath) ## to del
+            #logs_files_images.extend(logs_images) ## to del
+        except Exception as exc:
+            logger.exception("Error during processing results logs → {exc}\n{tb}", exc_info=True) # Log the full traceback
+            traceback.print_exc() # Print the exception traceback
+            #return f"An error occurred during processing results logs: {str(exc)}\n{tb}", f"Error: {exc}", f"Error: {exc}" ## return the exception message
+            yield gr.update(interactive=True), f"An error occurred during processing results logs: {str(exc)}\n{tb}", {"Error":f"Error: {exc}"}, f"__init__.py" ## return the exception message
         #'''
     except Exception as exc:
         tb = traceback.format_exc()
         logger.exception(f"❌ Error during ProcessPoolExecutor → {exc}\n{tb}" , exc_info=True) # Log the full traceback
         #traceback.print_exc() # Print the exception traceback
-        return f"❌ An error occurred during ProcessPoolExecutor→ {exc}\n{tb}", f"Error: {exc}", f"Error: {exc}" # return the exception message
+        #return gr.update(interactive=True), f"❌ An error occurred during ProcessPoolExecutor→ {exc}\n{tb}", f"Error: {exc}", f"Error: {exc}" # return the exception message
+        yield gr.update(interactive=True), f"❌ An error occurred during ProcessPoolExecutor→ {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"__init__.py" # return the exception message
 
     '''
     logger.log(level=20, msg="ProcessPoolExecutor pool result:", extra={"results": str(results)})

@@ -187,8 +213,25 @@ def convert_batch(
     logs = [result for result in results] ## pythonic list comprehension
     '''
 
+    # Zip Processed md Files and images. Insert to first index
+    try: ##from file_handler.file_utils
+        zipped_processed_files = zip_processed_files(root_dir=f"data/{output_dir_string}", file_paths=logs_files_images, tz_hours=tz_hours, date_format='%d%b%Y')
+        logs_files_images.insert(0, zipped_processed_files)
+        #logs_files_images.insert(1, "====================")
+        yield gr.update(interactive=False), f"Processing zip and files: {logs_files_images}", {"process": "Processing files"}, f"__init__.py"
+
+    except Exception as exc:
+        tb = traceback.format_exc()
+        logger.exception(f"❌ Error during zipping processed files → {exc}\n{tb}" , exc_info=True) # Log the full traceback
+        #traceback.print_exc() # Print the exception traceback
+        #return gr.update(interactive=True), f"❌ An error occurred during zipping files → {exc}\n{tb}", f"Error: {exc}", f"Error: {exc}" # return the exception message
+        yield gr.update(interactive=True), f"❌ An error occurred during zipping files → {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"__init__.py" # return the exception message
+
+
+    # Return processed files log
     try:
-
+        ## # Convert logs list of dicts to formatted json string
+        logs_return_formatted_json_string = file_handler.file_utils.process_dicts_data(logs) #"\n".join(log for log in logs) ##SMY outputs to gr.JSON component with no need for json.dumps(data, indent=)
         #logs_files_images_return = "\n".join(path for path in logs_files_images) ##TypeError: sequence item 0: expected str instance, WindowsPath found
 
         ##convert the List of Path objects to List of string for gr.Files output

@@ -196,13 +239,20 @@ def convert_batch(
 
         ## # Convert any Path objects to strings, but leave strings as-is
         logs_files_images_return = list(str(path) if isinstance(path, Path) else path for path in logs_files_images)
-
+        logger.log(level=20, msg="File conversion complete. Sending outcome to Gradio:", extra={"logs_files_image_return": str(logs_files_images_return)}) ## debug: FileNotFoundError: [WinError 2] The system cannot find the file specified: 'Error or no image_path'
+
+        #outputs=[process_button, log_output, files_individual_JSON, files_individual_downloads],
         #return "\n".join(logs), "\n".join(logs_files_images) #"\n".join(logs_files)
+        #return logs_return_formatted_json_string, logs_return_formatted_json_string, logs_files_images_return
+        #return gr.update(interactive=True), gr.update(value=logs_return_formatted_json_string), gr.update(value=logs_return_formatted_json_string, visible=True), gr.update(value=logs_files_images_return, visible=True)
+        yield gr.update(interactive=True), gr.update(), gr.update(visible=True), gr.update(visible=True)
+        yield gr.update(interactive=True), logs_return_formatted_json_string, logs_return_formatted_json_string, logs_files_images_return
+
     except Exception as exc:
         tb = traceback.format_exc()
         logger.exception(f"❌ Error during returning result logs → {exc}\n{tb}" , exc_info=True) # Log the full traceback
         #traceback.print_exc() # Print the exception traceback
-        return f"❌ An error occurred during returning result logs→ {exc}\n{tb}", f"Error: {exc}", f"
+        return gr.update(interactive=True), f"❌ An error occurred during returning result logs→ {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"__init__.py" # return the exception message
 
 
     #return "\n".join(log for log in logs), "\n".join(str(path) for path in logs_files_images)

@@ -346,6 +396,7 @@ def build_interface() -> gr.Blocks:
     }
     """
 
+    ##SMY: flagged; to move to file_handler.file_utils
     def is_file_with_extension(path_obj: Path) -> bool:
         """
         Checks if a pathlib.Path object is a file and has a non-empty extension.

@@ -353,6 +404,7 @@ def build_interface() -> gr.Blocks:
         path_obj = path_obj if isinstance(path_obj, Path) else Path(path_obj) if isinstance(path_obj, str) else None
         return path_obj.is_file() and bool(path_obj.suffix)
 
+    ##SMY: flagged; to move to file_handler.file_utils
     def accumulate_files(uploaded_files, current_state):
         """
         Accumulates newly uploaded files with the existing state.

@@ -396,18 +448,18 @@ def build_interface() -> gr.Blocks:
             gr.Markdown(f"#### **Backend Configuration**")
             system_message = gr.Textbox(
                 label="System Message",
-                lines=2
+                lines=2,
             )
             with gr.Row():
                 provider_dd = gr.Dropdown(
                     choices=["huggingface", "openai"],
                     label="Provider",
                     value="huggingface",
-                    #allow_custom_value=True
+                    #allow_custom_value=True,
                 )
                 backend_choice = gr.Dropdown(
                     choices=["model-id", "provider", "endpoint"],
-                    label="HF Backend Choice"
+                    label="HF Backend Choice",
                 ) ## SMY: ensure HFClient maps correctly
                 model_tb = gr.Textbox(
                     label="Model ID",

@@ -415,7 +467,7 @@ def build_interface() -> gr.Blocks:
                 )
                 endpoint_tb = gr.Textbox(
                     label="Endpoint",
-                    placeholder="Optional custom endpoint"
+                    placeholder="Optional custom endpoint",
                 )
             with gr.Row():
                 max_token_sl = gr.Slider(

@@ -423,26 +475,29 @@ def build_interface() -> gr.Blocks:
                     minimum=1,
                     maximum=131172, #65536, #32768, #16384, #8192,
                     value=1024, #512,
-                    step=1
+                    step=1,
                 )
                 temperature_sl = gr.Slider(
                     label="Temperature",
                     minimum=0.0,
                     maximum=1.0,
                     value=0.0,
-                    step=0.1 #0.01
+                    step=0.1, #0.01
                 )
                 top_p_sl = gr.Slider(
                     label="Top-p",
                     minimum=0.0,
                     maximum=1.0,
                     value=0.1,
-                    step=0.1 #0.01
-                )
-                stream_cb = gr.Checkbox(
-                    label="LLM Streaming",
-                    value=False
+                    step=0.1, #0.01
                 )
+                with gr.Column():
+                    stream_cb = gr.Checkbox(
+                        label="LLM Streaming",
+                        value=False,
+                    )
+                    #tz_hours_tb = gr.Textbox(value=None, label="TZ Hours", placeholder="Timezone in numbers", max_lines=1,)
+                    tz_hours_num = gr.Number(label="TZ Hours", placeholder="Timezone in numbers", min_width=5,)
             with gr.Row():
                 api_token_tb = gr.Textbox(
                     label="API Token [OPTIONAL]",

@@ -524,6 +579,7 @@ def build_interface() -> gr.Blocks:
             # Initialise gr.State
             state_max_workers = gr.State(4) #max_workers_sl,
             state_max_retries = gr.State(2) #max_retries_sl,
+            state_tz_hours = gr.State(value=None)
 
             def update_state_stored_value(new_component_input):
                 """ Updates stored state: use for max_workers and max_retries """

@@ -532,30 +588,51 @@ def build_interface() -> gr.Blocks:
             # Update gr.State values on slider components change. NB: initial value of `gr.State` must be able to be deepcopied
             max_workers_sl.change(update_state_stored_value, inputs=max_workers_sl, outputs=state_max_workers)
             max_retries_sl.change(update_state_stored_value, inputs=max_retries_sl, outputs=state_max_retries)
+            tz_hours_num.change(update_state_stored_value, inputs=tz_hours_num, outputs=state_tz_hours)
 
 
             with gr.Accordion("🤗 HuggingFace Client Logout", open=True): #, open=False):
                 # Logout controls
-                def do_logout():
+                '''def do_logout():
                     try:
                         #ok = docextractor.client.logout()
                         ok = docconverter.client.logout()
                         # Reset token textbox on successful logout
-                        msg = "✅ Logged out of HuggingFace and cleared tokens. Remember to log out of HuggingFace completely." if ok else "⚠️ Logout failed."
-
+                        #msg = "✅ Logged out of HuggingFace and cleared tokens. Remember to log out of HuggingFace completely." if ok else "⚠️ Logout failed."
+                        msg = "✅ Session Cleared. Remember to browser." if ok else "⚠️ Logout failed."
+                        return gr.update(value=""), gr.update(visible=True, value=msg), gr.update(value="Sign in to HuggingFace 🤗"), gr.update(value="Clear session")
                     except AttributeError:
-
-
+                        msg = "⚠️ Logout."
+                        return gr.update(value=""), gr.update(visible=True, value=msg), gr.update(value="Sign in to HuggingFace 🤗"), gr.update(value="Clear session", interactive=False)
+                '''
+                def do_logout_hf():
+                    try:
+                        ok = docconverter.client.logout()
+                        # Reset token textbox on successful logout
+                        msg = "✅ Session Cleared. Remember to close browser." if ok else "⚠️ Logout & Session Cleared"
+                        #return gr.update(value=""), gr.update(visible=True, value=msg), gr.update(value="Sign in to HuggingFace 🤗"), gr.update(value="Clear session", interactive=False)
+                        #return msg
+                        yield msg
+                    except AttributeError:
+                        msg = "⚠️ Logout. No HF session"
+                        #return msg
+                        yield msg
+
                 def custom_do_logout():
-
+                    #do_logout()
+                    #return gr.update(value="Sign in to HuggingFace 🤗")
+                    msg = do_logout_hf()
+                    #return gr.update(value="Sign in to HuggingFace 🤗"), gr.update(value=""), gr.update(visible=True, value=msg)
+                    yield gr.update(value="Sign in to HuggingFace 🤗"), gr.update(value=""), gr.update(visible=True, value=msg)
 
                 logout_status = gr.Markdown(visible=False)
                 with gr.Row():
-                    hf_login_logout_btn = gr.LoginButton(value="Sign in to HuggingFace 🤗", logout_value="Logout of HF: ({})", variant="huggingface")
-                    logout_btn = gr.Button("Logout from session and Hugging Face (inference) Client", variant="stop", )
+                    hf_login_logout_btn = gr.LoginButton(value="Sign in to HuggingFace 🤗", logout_value="Clear Session & Logout of HF: ({})", variant="huggingface")
+                    #logout_btn = gr.Button("Logout from session and Hugging Face (inference) Client", variant="stop", )
 
-                hf_login_logout_btn.click(fn=custom_do_logout, inputs=None, outputs=hf_login_logout_btn)
-
+                #hf_login_logout_btn.click(fn=custom_do_logout, inputs=None, outputs=hf_login_logout_btn)
+                hf_login_logout_btn.click(fn=custom_do_logout, inputs=None, outputs=[hf_login_logout_btn, api_token_tb, logout_status])
+                #logout_btn.click(fn=do_logout, inputs=None, outputs=[api_token_tb, logout_status, hf_login_logout_btn, logout_btn])
 
 
             # The gr.State component to hold the accumulated list of files

@@ -695,18 +772,26 @@ def build_interface() -> gr.Blocks:
             '''
 
             # A Files component to display individual processed files as download links
-            with gr.Accordion("⬇️ View and Download processed files", open=False):
+            with gr.Accordion("⬇️ View and Download processed files", open=True): #, open=False
+                processed_file_state = gr.State([])
+
+                ##SMY: future
+                zip_btn = gr.DownloadButton("Download Zip file of all processed files", visible=False) #.Button()
+
+                # Placeholder to download zip file of processed files
+                download_zip_file = gr.File(label="Download processed Files (ZIP)", interactive=False, visible=False) #, height="1"
+
                 with gr.Row():
-                    files_individual_JSON = gr.JSON(label="Serialised JSON list", max_height=250)
-                    files_individual_downloads = gr.Files(label="Individual Processed Files")
+                    files_individual_JSON = gr.JSON(label="Serialised JSON list", max_height=250, visible=False)
+                    files_individual_downloads = gr.Files(label="Individual Processed Files", visible=False)
 
             ## Displays processed file paths
-            with gr.Accordion("View processing log", open=False):
+            with gr.Accordion("View processing log", open=True): #open=False):
                 log_output = gr.Textbox(
                     label="Conversion Logs",
                     lines=5,
                     #max_lines=25,
-                    interactive=False
+                    #interactive=False
                 )
 
             # file inputs

@@ -745,6 +830,7 @@ def build_interface() -> gr.Blocks:
                 output_dir_tb,
                 use_llm_cb,
                 page_range_tb,
+                tz_hours_num,
             ]
 
             ## debug

@@ -756,12 +842,14 @@ def build_interface() -> gr.Blocks:
                     #pdf_files.upload(
                     fn=convert_batch,
                     inputs=inputs_arg,
-                    outputs=[log_output, files_individual_JSON, files_individual_downloads],
+                    outputs=[process_button, log_output, files_individual_JSON, files_individual_downloads],
                 )
             except Exception as exc:
                 tb = traceback.format_exc()
                 logger.exception(f"❌ Error during process_button.click → {exc}\n{tb}", exc_info=True)
-
+                msg = "❌ An error occurred during process_button.click" # →
+                #return f"❌ An error occurred during process_button.click → {exc}\n{tb}"
+                return gr.update(interactive=True), f"{msg} → {exc}\n{tb}", f"{msg} → {exc}", f"{msg} → {exc}"
 
             ##gr.File .upload() event, fire only after a file has been uploaded
             # Event handler for the pdf file upload button

@@ -774,37 +862,9 @@ def build_interface() -> gr.Blocks:
             btn_pdf_convert.click(
                 #pdf_files.upload(
                 fn=convert_batch,
-                outputs=[log_output, files_individual_downloads],
+                outputs=[btn_pdf_convert, log_output, files_individual_JSON, files_individual_downloads],
                 inputs=inputs_arg,
-            )
-            '''
-            inputs = [
-                pdf_files,
-                #pdf_files_wrap(pdf_files), # wrap pdf_files in a list (if not already)
-                pdf_files_count,
-                provider_dd,
-                model_tb,
-                hf_provider_dd,
-                endpoint_tb,
-                backend_choice,
-                system_message,
-                max_token_sl,
-                temperature_sl,
-                top_p_sl,
-                stream_cb,
-                api_token_tb,
-                #gr.State(4), # max_workers
-                #gr.State(3), # max_retries
-                openai_base_url_tb,
-                openai_image_format_dd,
-                state_max_workers, #gr.State(max_workers_sl), #max_workers_sl,
-                state_max_retries, #gr.State(max_retries_sl), #max_retries_sl,
-                output_format_dd,
-                output_dir_tb,
-                use_llm_cb,
-                page_range_tb,
-            ],
-            '''
+            )
             # )
 
             # reuse the same business logic for HTML tab

@@ -818,7 +878,7 @@ def build_interface() -> gr.Blocks:
             btn_html_convert.click(
                 fn=convert_batch,
                 inputs=inputs_arg,
-                outputs=[log_output, files_individual_downloads]
+                outputs=[btn_html_convert,log_output, files_individual_JSON, files_individual_downloads]
             )
 
             def get_file_count(file_list):
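The central change here is that `convert_batch` becomes a generator: Gradio treats a generator event handler as a streaming update, mapping each `yield` positionally onto the `outputs` list, which is what lets the handler grey out the triggering button while the pool runs and re-enable it at the end. A few caveats worth flagging: a bare `return value` inside a generator only sets `StopIteration.value` and never reaches the UI, so the error paths that still `return gr.update(...)` tuples would end the stream without displaying them; `get_token` is called in the login block without a visible import in this diff (it may come from elsewhere in the module); and `custom_do_logout` binds `msg = do_logout_hf()`, which is now a generator object rather than the message string, since `do_logout_hf` yields. A minimal sketch of the yield pattern itself, assuming only `gradio` is installed:

import time
import gradio as gr

def process():
    # Each yield maps positionally onto outputs=[btn, log]
    yield gr.update(interactive=False), "Processing files..."
    time.sleep(2)  # stand-in for the multiprocessing pool work
    yield gr.update(interactive=True), "Done."

with gr.Blocks() as demo:
    btn = gr.Button("Process")
    log = gr.Textbox(label="Conversion Logs")
    btn.click(fn=process, inputs=None, outputs=[btn, log])

demo.launch()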
utils/get_config.py (CHANGED)

@@ -9,7 +9,7 @@ import traceback
 import sys
 from pathlib import Path
 #base_grandparent = Path(__file__).resolve().parent.parent
-grandparent_dir = Path('.').resolve() #.parent.parent
+grandparent_dir = Path('.').resolve() #.parent.parent ##unreliable
 sys.path.insert(0, f"{grandparent_dir}") #\\file_handler")
 ##end debug
 #'''
utils/logger.py (CHANGED)

@@ -45,7 +45,7 @@ class JsonFormatter(logging.Formatter):
         return json.dumps(payload, ensure_ascii=False)
 
 #def setup_logging(level: int = logging.INFO) -> None: ## Causing non-stop logging on HF spaces
-def setup_logging(level: int = None) -> None:
+def setup_logging(level: int = None, tz_hours=None, date_format:str="%d%b%Y") -> None: #'%Y-%m-%d
     """Configure root logger with JSON output to both stdout and file.
 
     Args:

@@ -66,7 +66,7 @@ def setup_logging(level: int = None) -> None:
     #file_handler = logging.FileHandler("logs/app_logging_scrap.log", mode="a", encoding="utf-8")
     #file_handler = logging.FileHandler("logs/app_logging.log", mode="a", encoding="utf-8")
     from file_handler.file_utils import check_create_logfile
-    file_handler = logging.FileHandler(check_create_logfile("app_logging.log"), mode="a", encoding="utf-8")
+    file_handler = logging.FileHandler(check_create_logfile(filename="app_logging.log", tz_hours=tz_hours, date_format=date_format), mode="a", encoding="utf-8")
     ## Getting filepermission error
 
     file_handler.setFormatter(JsonFormatter())
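`setup_logging` now threads the timezone offset and date format down into the log-file name. A usage sketch under the repo's module layout (the values shown are illustrative):

from utils.logger import setup_logging

# INFO level, UTC+1, day-month-year suffix: e.g. logs/app_logging_27Sep2025.log
# (%b abbreviates the month to three letters, so "Sep" rather than "Sept")
setup_logging(level=20, tz_hours=1, date_format="%d%b%Y")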
utils/utils.py (CHANGED)

@@ -1,3 +1,4 @@
+
 def is_dict(variable):
     """Checks if a variable is a dict."""
     if isinstance(variable, dict):

@@ -12,4 +13,26 @@ def is_list_of_dicts(variable):
     # Return True only if the list is empty or all elements are dicts.
     return all(isinstance(item, dict) for item in variable)
 
-    return False
+    return False
+
+
+def get_time_now_str(tz_hours=None, date_format:str='%Y-%m-%d'): #date_format:str='%d%b%Y'):
+    """Returns the current time in a specific format + local time: ("%Y-%m-%d %H:%M:%S.%f %Z")."""
+    from datetime import datetime, timezone, timedelta
+
+    # Get the current time or UTC time
+    if tz_hours is not None:
+        current_utc_time = datetime.now(tz=timezone.utc) + timedelta(hours=tz_hours)
+        current_time = current_utc_time
+    else:
+        current_time = datetime.now()
+
+    # Format the time as a string
+    #formatted_time = current_utc_time.strftime(date_format) #("%Y-%m-%d %H:%M:%S.%f %Z")
+    formatted_time = current_time.strftime(date_format) #("%Y-%m-%d %H:%M:%S.%f %Z")
+
+    #print(f"Current time: {formatted_time}") ##debug
+    return formatted_time
+
+#get_time_now_str() ##debug
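One subtlety in `get_time_now_str`: adding a `timedelta` to `datetime.now(tz=timezone.utc)` shifts the wall-clock reading but leaves `tzinfo` at UTC, so `%z`/`%Z` would still print the UTC offset. For the date-only formats used in this commit that is harmless; a variant built on a real fixed-offset `tzinfo`, sketched below, would keep time-of-day formats consistent too:

from datetime import datetime, timedelta, timezone

def get_time_now_str_fixed_offset(tz_hours=None, date_format="%Y-%m-%d"):
    # A fixed-offset tzinfo keeps %z/%Z in step with the shifted wall time
    if tz_hours is not None:
        now = datetime.now(tz=timezone(timedelta(hours=tz_hours)))
    else:
        now = datetime.now()
    return now.strftime(date_format)

print(get_time_now_str_fixed_offset(1, "%Y-%m-%d %H:%M %z"))  # e.g. 2025-09-27 14:30 +0100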