semmyk committed
Commit 962ef72 · 1 Parent(s): 33d9ec0

baseline08_beta01_27Sept25: zipped, yield output, log in
file_handler/file_utils.py CHANGED
@@ -2,6 +2,9 @@
 #import os
 from pathlib import Path
 import sys
 from itertools import chain
 from typing import List, Union, Any, Mapping
 from PIL import Image
@@ -95,8 +98,8 @@ def resolve_grandparent_object(gp_object:str):
         current_path = Path(sys.argv[0]).resolve()
     except IndexError:
         # Handle cases where sys.argv[0] might not exist (e.g., in some IDEs)
-        #current_path = Path(__file__).resolve()
-        current_path = Path('.').resolve()
 
     parent_dir = current_path.parent
     grandparent_dir = current_path.parent.parent
@@ -109,7 +112,7 @@ def resolve_grandparent_object(gp_object:str):
     #print(f"resolve: sys.path[1]: {sys.path[1]}") ##debug
 
 
-def check_create_logfile(filename: str, dir_path: Union[str, Path]="logs") -> Path:
     """
     check if log file exists, else create one and return the file path.
@@ -123,6 +126,7 @@ def check_create_logfile(filename: str, dir_path: Union[str, Path]="logs") -> Path:
     import datetime
     import warnings
     import tempfile
 
     # 1. Get the path of the current script's parent directory (the project folder).
     # `__file__` is a special variable that holds the path to the current script.
@@ -151,7 +155,8 @@ def check_create_logfile(filename: str, dir_path: Union[str, Path]="logs") -> Path:
 
     # 4. Create log file with a timestamp inside the new logs directory.
     # This ensures a unique log file is created for the day the script runs.
-    timestamp = datetime.datetime.now().strftime("%Y-%m-%d") #.strftime("%Y-%m-%d_%H-%M-%S")
     log_file = logs_dir / f"{Path(filename).stem}_{timestamp}.log"
 
     # 5. Check if the file exists (it won't, if it's not the same day).
@@ -171,8 +176,33 @@ resolve_grandparent_object("file_handler")
 print(f'file: {check_create_logfile("app_logging.log")}')
 '''
 
-##SMY: to revisit. Make generic for any file apart from log files
-def check_create_file(filename: str, dir_path: Union[str, Path]="logs") -> Path:
     """
     check if File exists, else create one and return the file path.
 
@@ -182,35 +212,124 @@ def check_create_file(filename: str, dir_path: Union[str, Path]="logs") -> Path:
     Returns:
         The pathlib.Path object for the file
     """
-    # Get project root
-    #project_root = Path(__file__).resolve().parent.parent ##SMY: `__file__` is a special variable pointing to current file`
-    project_root = Path(__file__).resolve().parents[1] ##SMY: leverages parents. Get 2nd level
-
-    #file_dir = Path("logs") / file_dir if not isinstance(file_dir, Path) else Path(file_dir)
-    dir_path = dir_path if isinstance(dir_path, Path) else Path(dir_path)
-
-    # Ensure the directory exists
-    # Create the file parent directory, relative to the project root, if it doesn't exist.
-    # `parents=True` creates any missing parent directories.
-    # `exist_ok=True` prevents an error if the directory already exists.
-    dir_path = project_root / dir_path
-    if not dir_path.is_dir():
-        dir_path.mkdir(parents=True, exist_ok=True, mode=0o2755) #, mode=0o2644)
-        #dir_path.chmod(0)
 
-    file_path = dir_path / filename # Concatenate directory and filename to get full path
-    #print(f"file_path: {file_path}") ##debug
 
-    #file_path.touch(exist_ok=True, mode=0o2664) # Creates an empty file if it doesn't exists
-
-    #'''
-    if not file_path.exists(): # check if file doesn't exist
-        file_path.touch(exist_ok=True) #, mode=0o2664) # Creates an empty file if it doesn't exists
-        #file_dir.touch(mode=0o2644, exist_ok=True) #, parents=True) ##SMY: Note Permission Errno13 - https://stackoverflow.com/a/57454275
-        #file_dir.chmod(0)
-    #'''
 
-    return file_path
 
 def is_file_with_extension(path_obj: Path) -> bool:
     """
 
 #import os
 from pathlib import Path
 import sys
+import shutil
+import tempfile
+
 from itertools import chain
 from typing import List, Union, Any, Mapping
 from PIL import Image
 
         current_path = Path(sys.argv[0]).resolve()
     except IndexError:
         # Handle cases where sys.argv[0] might not exist (e.g., in some IDEs)
+        current_path = Path(__file__).resolve()
+        #current_path = Path('.').resolve() ##unreliable
 
     parent_dir = current_path.parent
     grandparent_dir = current_path.parent.parent
 
     #print(f"resolve: sys.path[1]: {sys.path[1]}") ##debug
 
 
+def check_create_logfile(filename: str, dir_path: Union[str, Path]="logs", tz_hours=None, date_format="%Y-%m-%d") -> Path:
     """
     check if log file exists, else create one and return the file path.
 
     import datetime
     import warnings
     import tempfile
+    from utils.utils import get_time_now_str
 
     # 1. Get the path of the current script's parent directory (the project folder).
     # `__file__` is a special variable that holds the path to the current script.
 
     # 4. Create log file with a timestamp inside the new logs directory.
     # This ensures a unique log file is created for the day the script runs.
+    #timestamp = datetime.datetime.now().strftime("%Y-%m-%d") #.strftime("%Y-%m-%d_%H-%M-%S")
+    timestamp = get_time_now_str(tz_hours=tz_hours, date_format=date_format)
     log_file = logs_dir / f"{Path(filename).stem}_{timestamp}.log"
 
     # 5. Check if the file exists (it won't, if it's not the same day).
 
 print(f'file: {check_create_logfile("app_logging.log")}')
 '''
 
+##SMY:DONE - to revisit. Make generic for any file apart from log files
+def check_create_dir(dir_name: Union[str, Path]) -> Path:
+    """
+    check if directory exists, else create one and return the directory path.
+
+    Args:
+        dir_name (str | Path): The path of the directory to check/create.
+    Returns:
+        The pathlib.Path object for the directory
+    """
+
+    import warnings
+
+    try:
+        dir_path = Path(dir_name)
+        #if dir_path.is_dir():
+        #    dir_path.mkdir(parents=True, exist_ok=True) #, mode=0o2755)
+        dir_path.mkdir(parents=True, exist_ok=True) #, mode=0o2755)
+    except PermissionError: ##[Errno 13] Permission denied: '/home/user/app/logs/app_logging_2025-09-18.log'
+        warnings.warn("[Errno 13] Permission denied, possibly insufficient permission or Persistent Storage not enabled: attempting mkdir/chmod 0o2755")
+        dir_path.mkdir(mode=0o2755, parents=True, exist_ok=True)
+        dir_path.chmod(0o2755)
+
+    return dir_path
+
+def check_create_file(filename: Union[str, Path]) -> Path:
     """
     check if File exists, else create one and return the file path.
 
     Returns:
         The pathlib.Path object for the file
     """
+
+    import warnings
+
+    try:
+        filename_path = Path(filename)
+        filename_path.touch(exist_ok=True) #, mode=0o2755)
+    except PermissionError: ##[Errno 13] Permission denied: '/home/user/app/logs/app_logging_2025-09-18.log'
+        warnings.warn("[Errno 13] Permission denied, possibly insufficient permission or Persistent Storage not enabled: attempting chmod 0o2644")
+        filename_path.touch(exist_ok=True, mode=0o2755) # Creates an empty file if it doesn't exist
+        filename_path.chmod(0o2644)
 
+    return filename_path
+
+def zip_processed_files(root_dir: str, file_paths: list[str], tz_hours=None, date_format='%d%b%Y_%H-%M-%S') -> str:
+    """
+    Creates a zip file from a list of file paths (strings) and returns the path as a string.
+    It preserves the directory structure relative to the specified root directory.
+
+    Args:
+        root_dir (str): The root directory against which relative paths are calculated.
+        file_paths (list[str]): A list of string paths to the files to be zipped.
+
+    Returns:
+        str: The string form of the Path object of the newly created zip file.
+    """
+
+    import zipfile
+    from file_handler import file_utils
+    from utils import utils
+
+    root_path = Path(root_dir)
+    if not root_path.is_dir():
+        raise ValueError(f"Root directory does not exist: {root_path}")
+
+    # Create a temporary directory in a location where Gradio can access it.
+    gradio_output_dir = Path(tempfile.gettempdir()) / "gradio_temp_output"
+    #gradio_output_dir.mkdir(exist_ok=True)
+    file_utils.check_create_dir(gradio_output_dir)
+    final_zip_path = gradio_output_dir / f"outputs_processed_{utils.get_time_now_str(tz_hours=tz_hours, date_format=date_format)}.zip"
+
+    # Use a context manager to create the zip file: use zipfile as opposed to shutil.make_archive
+    # 'w' mode creates a new file, overwriting if it already exists.
+    zip_unprocessed = 0
+    with zipfile.ZipFile(final_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
+        for file_path_str in file_paths:
+            file_path = Path(file_path_str)
+            if file_path.exists() and file_path.is_file():
+                # Calculate the relative path from the root_dir.
+                # The `arcname` parameter tells `zipfile` what the path inside the zip file should be.
+                arcname = file_path.relative_to(root_path)
+                zipf.write(file_path, arcname=arcname)
+            else:
+                #print(f"Warning: Skipping {file_path_str}, as it is not a valid file.")
+                zip_unprocessed += 1 ##SMY:future - to be implemented
+
+    #return final_zip_path
+    return str(final_zip_path)
+
+
+def process_and_zip(input_dir_path):
+    """
+    Finds dynamic directories, copies files from a source directory to a temporary directory, zips it,
+    and returns the path to the zip file.
 
+    Args:
+        input_dir_path (str): The path to the directory containing files to be processed.
 
+    Returns:
+        str: The path to the generated zip file.
+    """
+    # Convert the input path to a Path object
+    #input_path = Path(input_dir_path)
+    parent_input_path = Path(input_dir_path) #.parent
+
+    # Check if the input directory exists
+    if not parent_input_path.is_dir():
+        raise ValueError(f"Input directory does not exist: {parent_input_path}")
+
+    # Create a temporary directory using a context manager
+    with tempfile.TemporaryDirectory() as temp_dir_str:
+        temp_dir_path = Path(temp_dir_str)
+
+        # Define the path for the output structure inside the temporary directory
+        temp_output_path = temp_dir_path / "output_dir"
+
+        # Copy all extracted files to the temporary directory
+        # We use the semantically accurate and performant .iterdir() rather than the more robust glob to get all files and folders
+        for input_subdir in parent_input_path.iterdir():
+            if input_subdir.is_dir():
+                # Create the corresponding subdirectory in the temp directory
+                temp_output_subdir = temp_output_path / input_subdir.name
+                #temp_output_subdir.mkdir(parents=True, exist_ok=True) #, mode=0o2755)
+                #file_handler.file_utils.check_create_dir(temp_output_subdir)
+                check_create_dir(temp_output_subdir)
+
+                # Copy the files from the source subdirectory to the temp subdirectory
+                #for item_path in input_path.glob('*'):
+                for item_path in input_subdir.iterdir():
+                    if item_path.is_dir():
+                        shutil.copytree(src=item_path, dst=temp_output_subdir / item_path.name)
+                    else:
+                        shutil.copy2(item_path, temp_output_subdir)
+
+        # Create the zip file from the temporary directory
+        zip_base_name = temp_dir_path / "outputs_processed_files"
+        zip_file_path = shutil.make_archive(
+            base_name=str(zip_base_name), ##zip file's name
+            format='zip',
+            root_dir=str(temp_output_path) #(temp_dir_path) ##exclude from the archive
+        )
+        # Move the completed zip file out of the temporary directory before it is deleted
+        final_zip_file_path = parent_input_path / Path(zip_file_path).name
+        shutil.move(src=zip_file_path, dst=final_zip_file_path)
+
+        # shutil.make_archive returns a string path; return it as a string for gr.File
+        return str(final_zip_file_path)
 
 
 def is_file_with_extension(path_obj: Path) -> bool:
     """
llm/hf_client.py CHANGED
@@ -6,7 +6,7 @@ import time
 import traceback
 from huggingface_hub import InferenceClient, login, logout as hf_logout
 
-from llm.llm_login import login_huggingface, is_login_huggingface
 
 from utils.logger import get_logger
 
@@ -101,9 +101,9 @@ class HFChatClient:
         #pass
         '''
 
-        login_huggingface(self.token) if not is_login_huggingface() else logger.log(level=20, msg=f"You are logged in to HF Hub already") ## attempt login if not already logged in. NB: HF CLI login prompt would not display in Process Worker.
         ##SMY: TODO: Mapped with openai_client.py
-        #self.islogged_in = is_login_huggingface()
 
     @staticmethod
     def _normalise_history(history: list, system_message: str, latest_user_message: str) -> list[dict]:
 
 import traceback
 from huggingface_hub import InferenceClient, login, logout as hf_logout
 
+from llm.llm_login import login_huggingface, is_loggedin_huggingface #,is_login_huggingface
 
 from utils.logger import get_logger
 
         #pass
         '''
 
+        login_huggingface(self.token) if not is_loggedin_huggingface() else logger.log(level=20, msg="You are logged in to HF Hub already") ## attempt login if not already logged in. NB: HF CLI login prompt would not display in Process Worker.
         ##SMY: TODO: Mapped with openai_client.py
+        #self.islogged_in = is_loggedin_huggingface()
 
     @staticmethod
     def _normalise_history(history: list, system_message: str, latest_user_message: str) -> list[dict]:
llm/llm_login.py CHANGED
@@ -1,4 +1,4 @@
-from huggingface_hub import HfApi, login, logout, get_token
 import os
 import traceback
 from time import sleep
@@ -9,6 +9,11 @@ from utils.logger import get_logger
 ## Get logger instance
 logger = get_logger(__name__)
 
 def login_huggingface(token: Optional[str] = None):
     """
     Login to Hugging Face account. Prioritize CLI login for privacy and determinism.
@@ -25,13 +30,15 @@ def login_huggingface(token: Optional[str] = None):
 
     # Disable implicit token propagation for determinism
     # Explicitly disable implicit token propagation; we rely on explicit auth or env var
-    os.environ["HF_HUB_DISABLE_IMPLICIT_TOKEN"] = "1"
 
     token = token
     # Privacy-first login: try interactive CLI first; fallback to provided/env token only if needed
     try:
-        if HfApi.whoami():
-            logger.info("✔️ hf_login already", extra={"mode": "cli"})
             #return True
         else:
             login()
@@ -53,13 +60,15 @@ def login_huggingface(token: Optional[str] = None):
     # Silent fallback; client will still work if token is passed directly
     #pass
 
-def is_login_huggingface():
-    from huggingface_hub import HfApi
     from huggingface_hub.utils import HfHubHTTPError
 
     try:
         HfApi().whoami()
         logger.log(level=20, msg=("✔️ You are logged in."), extra={"is_logged_in": True})
         return True
     except HfHubHTTPError as exc:
         # A 401 status code indicates an authentication error.
 
+from huggingface_hub import HfApi, login, logout, get_token, whoami
 import os
 import traceback
 from time import sleep
 
 ## Get logger instance
 logger = get_logger(__name__)
 
+def disable_implicit_token():
+    # Disable implicit token propagation for determinism
+    # Explicitly disable implicit token propagation; we rely on explicit auth or env var
+    os.environ["HF_HUB_DISABLE_IMPLICIT_TOKEN"] = "1"
+
 def login_huggingface(token: Optional[str] = None):
     """
     Login to Hugging Face account. Prioritize CLI login for privacy and determinism.
 
     # Disable implicit token propagation for determinism
     # Explicitly disable implicit token propagation; we rely on explicit auth or env var
+    #os.environ["HF_HUB_DISABLE_IMPLICIT_TOKEN"] = "1"
+    disable_implicit_token()
 
     token = token
     # Privacy-first login: try interactive CLI first; fallback to provided/env token only if needed
     try:
+        #if HfApi.whoami(): ##SMY requires 'self' = HfApi. Alternatively HfApi().whoami()
+        if whoami(): ##SMY: Call HF API to know "whoami".
+            logger.info("✔️ hf_login already", extra={"mode": "HF Oauth"})
             #return True
         else:
             login()
 
     # Silent fallback; client will still work if token is passed directly
     #pass
 
+#def is_login_huggingface():
+def is_loggedin_huggingface():
+    #from huggingface_hub import HfApi
     from huggingface_hub.utils import HfHubHTTPError
 
     try:
         HfApi().whoami()
         logger.log(level=20, msg=("✔️ You are logged in."), extra={"is_logged_in": True})
+        disable_implicit_token()
         return True
     except HfHubHTTPError as exc:
         # A 401 status code indicates an authentication error.
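
A caveat on the new whoami() check: huggingface_hub's whoami() raises (e.g. a LocalTokenNotFoundError or an HTTP error) rather than returning a falsy value when no valid token is available, so the else: login() branch above is effectively reached through the surrounding exception handling, not the if. A defensive sketch of the intended check (illustrative helper, not part of the repo):

    from huggingface_hub import login, whoami

    def ensure_hf_login() -> bool:
        try:
            whoami()        # raises when there is no valid token/session
            return True     # already logged in
        except Exception:
            login()         # fall back to interactive/CLI login
            return False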
llm/openai_client.py CHANGED
@@ -7,7 +7,7 @@ from typing import Optional #Iterable, Literal
 import traceback
 #from huggingface_hub import InferenceClient, login, logout as hf_logout
 
-from llm.llm_login import login_huggingface, is_login_huggingface
 
 import dotenv
 #dotenv.load_dotenv(".env")
@@ -42,15 +42,17 @@ class OpenAIChatClient:
         self.model_id = f"{model_id}:{hf_provider}" if hf_provider is not None else model_id ##concatenate so HF can pipe to Hf provider
         self.hf_provider = hf_provider
         self.base_url = base_url #"https://router.huggingface.co/v1" #%22" #HF API proxy
-        #self.token = api_token if api_token else None ##debug
-        self.token = openai_api_key_env if openai_api_key_env else api_token #dotenv.get_key(".env", "OPENAI_API_KEY")
         #self.token = token or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN") ## not preferred
-        login_huggingface(self.token) if not is_login_huggingface() else logger.log(level=20, msg=f"You are logged in to HF Hub already") ## attempt login if not already logged in. NB: HF CLI login prompt would not display in Process Worker.
         #self.fake_token = api_token or "a1b2c3" #or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
         self.openai_api_key = self.token #self.fake_token
         self.temperature = temperature
         self.top_p = top_p
-        self.islogged_in = is_login_huggingface()
 
         logger.log(level=2, msg="initialised OpenAIChatClient:", extra={"base_url": self.base_url, "openai_api_key": self.openai_api_key})
 
@@ -60,7 +62,7 @@ class OpenAIChatClient:
         logger.exception(f'✗ OpenAI client_init_failed", extra={"error": str(exc)}\n{tb}', exc_info=True)
         raise RuntimeError(f"✗ Failed to initialise OpenAI client: {exc}\n{tb}")
 
-        #login_huggingface(self.token) if not is_login_huggingface() else logger.log(level=20, msg=f"logged in to HF Hub already") ## attempt login if not already logged in. NB: HF CLI login prompt would not display in Process Worker.
 
 ####IN PROGRESS
 #
 
 import traceback
 #from huggingface_hub import InferenceClient, login, logout as hf_logout
 
+from llm.llm_login import login_huggingface, is_loggedin_huggingface #, is_login_huggingface
 
 import dotenv
 #dotenv.load_dotenv(".env")
 
         self.model_id = f"{model_id}:{hf_provider}" if hf_provider is not None else model_id ##concatenate so HF can pipe to Hf provider
         self.hf_provider = hf_provider
         self.base_url = base_url #"https://router.huggingface.co/v1" #%22" #HF API proxy
+        self.token = api_token if api_token else openai_api_key_env ##None ##debug
+        #self.token = openai_api_key_env if openai_api_key_env else api_token #dotenv.get_key(".env", "OPENAI_API_KEY")
         #self.token = token or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN") ## not preferred
 
         #self.fake_token = api_token or "a1b2c3" #or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
+
         self.openai_api_key = self.token #self.fake_token
         self.temperature = temperature
         self.top_p = top_p
+        self.islogged_in = is_loggedin_huggingface()
+        ##SMY: log in now handled at higher entry level.
+        #login_huggingface(self.token) if not is_loggedin_huggingface() else logger.log(level=20, msg=f"You are logged in to HF Hub already") ## attempt login if not already logged in. NB: HF CLI login prompt would not display in Process Worker.
 
         logger.log(level=2, msg="initialised OpenAIChatClient:", extra={"base_url": self.base_url, "openai_api_key": self.openai_api_key})
 
         logger.exception(f'✗ OpenAI client_init_failed", extra={"error": str(exc)}\n{tb}', exc_info=True)
         raise RuntimeError(f"✗ Failed to initialise OpenAI client: {exc}\n{tb}")
 
+        #login_huggingface(self.token) if not is_loggedin_huggingface() else logger.log(level=20, msg=f"logged in to HF Hub already") ## attempt login if not already logged in. NB: HF CLI login prompt would not display in Process Worker.
 
 ####IN PROGRESS
 #
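
The flipped ternary reverses token precedence: an explicitly passed api_token now wins over the environment-derived openai_api_key_env, instead of the other way round. An equivalent, slightly more idiomatic form (sketch only; equivalent when the unused value is None or empty):

    # explicit argument first, then environment fallback
    self.token = api_token or openai_api_key_env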
ui/gradio_ui.py CHANGED
@@ -11,10 +11,10 @@ import file_handler
 import file_handler.file_utils
 from utils.config import TITLE, DESCRIPTION, DESCRIPTION_PDF_HTML, DESCRIPTION_PDF, DESCRIPTION_HTML, DESCRIPTION_MD
 from utils.utils import is_dict, is_list_of_dicts
-from file_handler.file_utils import process_dicts_data, collect_pdf_paths, collect_html_paths, collect_markdown_paths, create_outputdir ## should move to handling file
 #from llm.hf_client import HFChatClient ## SMY: unused. See converters.extraction_converter
 from llm.provider_validator import is_valid_provider, suggest_providers
-from llm.llm_login import login_huggingface
 
 from converters.extraction_converter import DocumentConverter as docconverter #DocumentExtractor #as docextractor
 from converters.pdf_to_md import PdfToMarkdownConverter, init_worker
@@ -34,6 +34,7 @@ pdf2md_converter = PdfToMarkdownConverter()
 #md2pdf_converter = MarkdownToPdfConverter()
 
 # pool executor to convert files called by Gradio
 def convert_batch(
     pdf_files, #: list[str],
     pdf_files_count: int,
@@ -59,8 +60,9 @@ def convert_batch(
     #output_dir: Optional[Union[str, Path]] = "output_dir",
     output_dir_string: str = "output_dir_default",
     use_llm: bool = False, #Optional[bool] = False, #True,
-    page_range: str = None, #Optional[str] = None,
-    ) -> str:
     """
     Handles the conversion process using multiprocessing.
     Spins up a pool and converts all uploaded files in parallel.
@@ -71,13 +73,18 @@ def convert_batch(
     # explicitly wrap file object in a list
     #pdf_files = pdf_files_wrap(pdf_files) ##Flag: deprecation
 
     ## debug
     #logger.log(level=30, msg="pdf_files_inputs", extra={"input_arg[0]:": pdf_files[0]})
 
     #if not files:
     if not pdf_files or pdf_files is None: ## Check if files is None. This handles the case where no files are uploaded.
         logger.log(level=30, msg="Initialising ProcessPool: No files uploaded.", extra={"pdf_files": pdf_files, "files_len": pdf_files_count})
-        return "Initialising ProcessPool: No files uploaded."
 
     # Get config values if not provided
     config_file = find_file("config.ini") ##from file_handler.file_utils
@@ -117,11 +124,20 @@ def convert_batch(
 
     #global docextractor ##SMY: deprecated.
     try:
         login_huggingface(api_token) ## attempt login if not already logged in. NB: HF CLI login prompt would not display in Process Worker.
     except Exception as exc: # Catch all exceptions
         tb = traceback.format_exc()
         logger.exception(f"✗ Error during login_huggingface → {exc}\n{tb}", exc_info=True) # Log the full traceback
-        return f"✗ An error occurred during login_huggingface → {exc}\n{tb}", f"Error: {exc}", f"Error: {exc}" # return the exception message
 
     try:
         # Create a pool with init_worker initialiser
@@ -146,38 +162,48 @@ def convert_batch(
         #result_convert = pool.map(pdf2md_converter.convert_files, pdf_files, max_retries)
         results = pool.map(pdf2md_converter.convert_files, pdf_files) ##SMY iterables #output_dir_string)
     except Exception as exc:
-        # Raise the exception to stop the Gradio app
-        #raise # Re-raise the exception to halt execution
         logger.exception("Error during pooling file conversion", exc_info=True) # Log the full traceback
         traceback.print_exc() # Print the exception traceback
-        return f"An error occurred during pool.map: {str(exc)}", f"Error: {exc}", f"Error: {exc}" ## return the exception message
 
     #'''
-    logger.log(level=20, msg="ProcessPoolExecutor pool result:", extra={"results": str(results)})
-    logs = []
-    logs_files_images = []
-    #logs.extend(results) ## performant pythonic
-    #logs = list[results] ##
-    logs = [result for result in results] ## pythonic list comprehension
-    ## logs : [file , images , filepath, image_path]
-
-    #logs_files_images = logs_files.extend(logs_images) #zip(logs_files, logs_images) ##SMY: in progress
-    for log in logs:
-        #logs_files_images.append(log.get("filepath", "Error or No filepath")) # if all(isinstance(log, dict) for item in logs))
-        #logs_files_images.extend(list(image for image in log.get("image_path", "Error or no image_path")))
-
-        logs_files_images.append(log.get("filepath") if is_dict(logs) or isinstance(log, Path) else "Error or no image_path") # isinstance(log, (dict, str))
-        logs_files_images.extend(list(image for image in log.get("image_path", "Error or no image_path")))
 
-    #logs_files_images.append(logs_filepath) ## to del
-    #logs_files_images.extend(logs_images) ## to del
     #'''
     except Exception as exc:
         tb = traceback.format_exc()
         logger.exception(f"✗ Error during ProcessPoolExecutor → {exc}\n{tb}" , exc_info=True) # Log the full traceback
         #traceback.print_exc() # Print the exception traceback
-        return f"✗ An error occurred during ProcessPoolExecutor → {exc}\n{tb}", f"Error: {exc}", f"Error: {exc}" # return the exception message
 
     '''
     logger.log(level=20, msg="ProcessPoolExecutor pool result:", extra={"results": str(results)})
@@ -187,8 +213,25 @@ def convert_batch(
     logs = [result for result in results] ## pythonic list comprehension
     '''
 
     try:
-        logs_return = file_handler.file_utils.process_dicts_data(logs) #"\n".join(log for log in logs) ##SMY outputs to gr.JSON component with no need for json.dumps(data, indent=)
         #logs_files_images_return = "\n".join(path for path in logs_files_images) ##TypeError: sequence item 0: expected str instance, WindowsPath found
 
         ##convert the List of Path objects to List of string for gr.Files output
@@ -196,13 +239,20 @@ def convert_batch(
 
         ## # Convert any Path objects to strings, but leave strings as-is
         logs_files_images_return = list(str(path) if isinstance(path, Path) else path for path in logs_files_images)
-        return logs_return, logs_return, logs_files_images_return
         #return "\n".join(logs), "\n".join(logs_files_images) #"\n".join(logs_files)
     except Exception as exc:
         tb = traceback.format_exc()
         logger.exception(f"✗ Error during returning result logs → {exc}\n{tb}" , exc_info=True) # Log the full traceback
         #traceback.print_exc() # Print the exception traceback
-        return f"✗ An error occurred during returning result logs → {exc}\n{tb}", f"Error: {exc}", f"Error: {exc}" # return the exception message
 
 
     #return "\n".join(log for log in logs), "\n".join(str(path) for path in logs_files_images)
@@ -346,6 +396,7 @@ def build_interface() -> gr.Blocks:
     }
     """
 
     def is_file_with_extension(path_obj: Path) -> bool:
         """
         Checks if a pathlib.Path object is a file and has a non-empty extension.
@@ -353,6 +404,7 @@ def build_interface() -> gr.Blocks:
         path_obj = path_obj if isinstance(path_obj, Path) else Path(path_obj) if isinstance(path_obj, str) else None
         return path_obj.is_file() and bool(path_obj.suffix)
 
     def accumulate_files(uploaded_files, current_state):
         """
         Accumulates newly uploaded files with the existing state.
@@ -396,18 +448,18 @@ def build_interface() -> gr.Blocks:
         gr.Markdown(f"#### **Backend Configuration**")
         system_message = gr.Textbox(
             label="System Message",
-            lines=2
         )
         with gr.Row():
             provider_dd = gr.Dropdown(
                 choices=["huggingface", "openai"],
                 label="Provider",
                 value="huggingface",
-                #allow_custom_value=True
             )
             backend_choice = gr.Dropdown(
                 choices=["model-id", "provider", "endpoint"],
-                label="HF Backend Choice"
             ) ## SMY: ensure HFClient maps correctly
             model_tb = gr.Textbox(
                 label="Model ID",
@@ -415,7 +467,7 @@ def build_interface() -> gr.Blocks:
             )
             endpoint_tb = gr.Textbox(
                 label="Endpoint",
-                placeholder="Optional custom endpoint"
             )
         with gr.Row():
             max_token_sl = gr.Slider(
@@ -423,26 +475,29 @@ def build_interface() -> gr.Blocks:
                 minimum=1,
                 maximum=131172, #65536, #32768, #16384, #8192,
                 value=1024, #512,
-                step=1
             )
             temperature_sl = gr.Slider(
                 label="Temperature",
                 minimum=0.0,
                 maximum=1.0,
                 value=0.0,
-                step=0.1 #0.01
             )
             top_p_sl = gr.Slider(
                 label="Top-p",
                 minimum=0.0,
                 maximum=1.0,
                 value=0.1,
-                step=0.1 #0.01
-            )
-            stream_cb = gr.Checkbox(
-                label="LLM Streaming",
-                value=False
             )
         with gr.Row():
             api_token_tb = gr.Textbox(
                 label="API Token [OPTIONAL]",
@@ -524,6 +579,7 @@ def build_interface() -> gr.Blocks:
     # Initialise gr.State
     state_max_workers = gr.State(4) #max_workers_sl,
     state_max_retries = gr.State(2) #max_retries_sl,
 
     def update_state_stored_value(new_component_input):
         """ Updates stored state: use for max_workers and max_retries """
@@ -532,30 +588,51 @@ def build_interface() -> gr.Blocks:
     # Update gr.State values on slider components change. NB: initial value of `gr.State` must be able to be deepcopied
     max_workers_sl.change(update_state_stored_value, inputs=max_workers_sl, outputs=state_max_workers)
     max_retries_sl.change(update_state_stored_value, inputs=max_retries_sl, outputs=state_max_retries)
 
     with gr.Accordion("🤗 HuggingFace Client Logout", open=True): #, open=False):
         # Logout controls
-        def do_logout():
             try:
                 #ok = docextractor.client.logout()
                 ok = docconverter.client.logout()
                 # Reset token textbox on successful logout
-                msg = "✅ Logged out of HuggingFace and cleared tokens. Remember to log out of HuggingFace completely." if ok else "⚠️ Logout failed."
-                return gr.update(value=""), gr.update(visible=True, value=msg), gr.update(value="Sign in to HuggingFace 🤗")
             except AttributeError:
-                return gr.update(value=""), gr.update(visible=True, value=msg), gr.update(value="Sign in to HuggingFace 🤗")
-
         def custom_do_logout():
-            return gr.update(value="Sign in to HuggingFace 🤗")
 
         logout_status = gr.Markdown(visible=False)
         with gr.Row():
-            hf_login_logout_btn = gr.LoginButton(value="Sign in to HuggingFace 🤗", logout_value="Logout of HF: ({})", variant="huggingface")
-            logout_btn = gr.Button("Logout from session and Hugging Face (inference) Client", variant="stop", )
 
-        hf_login_logout_btn.click(fn=custom_do_logout, inputs=None, outputs=hf_login_logout_btn)
-        logout_btn.click(fn=do_logout, inputs=None, outputs=[api_token_tb, logout_status, hf_login_logout_btn])
 
 
     # The gr.State component to hold the accumulated list of files
@@ -695,18 +772,26 @@ def build_interface() -> gr.Blocks:
     '''
 
     # A Files component to display individual processed files as download links
-    with gr.Accordion("⏬ View and Download processed files", open=False):
         with gr.Row():
-            files_individual_JSON = gr.JSON(label="Serialised JSON list", max_height=250)
-            files_individual_downloads = gr.Files(label="Individual Processed Files")
 
     ## Displays processed file paths
-    with gr.Accordion("View processing log", open=False):
         log_output = gr.Textbox(
             label="Conversion Logs",
             lines=5,
             #max_lines=25,
-            interactive=False
         )
 
     # file inputs
@@ -745,6 +830,7 @@ def build_interface() -> gr.Blocks:
         output_dir_tb,
         use_llm_cb,
         page_range_tb,
     ]
 
     ## debug
@@ -756,12 +842,14 @@ def build_interface() -> gr.Blocks:
         #pdf_files.upload(
         fn=convert_batch,
         inputs=inputs_arg,
-        outputs=[log_output, files_individual_JSON, files_individual_downloads],
     )
     except Exception as exc:
         tb = traceback.format_exc()
         logger.exception(f"✗ Error during process_button.click → {exc}\n{tb}", exc_info=True)
-        return f"✗ An error occurred during process_button.click → {exc}\n{tb}"
 
     ##gr.File .upload() event, fire only after a file has been uploaded
     # Event handler for the pdf file upload button
@@ -774,37 +862,9 @@ def build_interface() -> gr.Blocks:
     btn_pdf_convert.click(
         #pdf_files.upload(
         fn=convert_batch,
-        outputs=[log_output, files_individual_downloads],
         inputs=inputs_arg,
-    )
-    '''
-    inputs = [
-        pdf_files,
-        #pdf_files_wrap(pdf_files), # wrap pdf_files in a list (if not already)
-        pdf_files_count,
-        provider_dd,
-        model_tb,
-        hf_provider_dd,
-        endpoint_tb,
-        backend_choice,
-        system_message,
-        max_token_sl,
-        temperature_sl,
-        top_p_sl,
-        stream_cb,
-        api_token_tb,
-        #gr.State(4), # max_workers
-        #gr.State(3), # max_retries
-        openai_base_url_tb,
-        openai_image_format_dd,
-        state_max_workers, #gr.State(max_workers_sl), #max_workers_sl,
-        state_max_retries, #gr.State(max_retries_sl), #max_retries_sl,
-        output_format_dd,
-        output_dir_tb,
-        use_llm_cb,
-        page_range_tb,
-    ],
-    '''
     # )
 
     # reuse the same business logic for HTML tab
@@ -818,7 +878,7 @@ def build_interface() -> gr.Blocks:
     btn_html_convert.click(
         fn=convert_batch,
         inputs=inputs_arg,
-        outputs=[log_output, files_individual_downloads]
     )
 
     def get_file_count(file_list):
 
 import file_handler.file_utils
 from utils.config import TITLE, DESCRIPTION, DESCRIPTION_PDF_HTML, DESCRIPTION_PDF, DESCRIPTION_HTML, DESCRIPTION_MD
 from utils.utils import is_dict, is_list_of_dicts
+from file_handler.file_utils import zip_processed_files, process_dicts_data, collect_pdf_paths, collect_html_paths, collect_markdown_paths, create_outputdir ## should move to handling file
 #from llm.hf_client import HFChatClient ## SMY: unused. See converters.extraction_converter
 from llm.provider_validator import is_valid_provider, suggest_providers
+from llm.llm_login import is_loggedin_huggingface, login_huggingface
+from huggingface_hub import get_token ## needed for get_token() below when a session token already exists
 
 from converters.extraction_converter import DocumentConverter as docconverter #DocumentExtractor #as docextractor
 from converters.pdf_to_md import PdfToMarkdownConverter, init_worker
 
 #md2pdf_converter = MarkdownToPdfConverter()
 
 # pool executor to convert files called by Gradio
+##SMY: TODO: future: refactor to gradio_process.py
 def convert_batch(
     pdf_files, #: list[str],
     pdf_files_count: int,
 
     #output_dir: Optional[Union[str, Path]] = "output_dir",
     output_dir_string: str = "output_dir_default",
     use_llm: bool = False, #Optional[bool] = False, #True,
+    page_range: str = None, #Optional[str] = None,
+    tz_hours: str = None,
+    ): #-> str:
     """
     Handles the conversion process using multiprocessing.
     Spins up a pool and converts all uploaded files in parallel.
 
     # explicitly wrap file object in a list
     #pdf_files = pdf_files_wrap(pdf_files) ##Flag: deprecation
 
+    # Update the Gradio UI to improve user-friendly eXperience
+    #outputs=[process_button, log_output, files_individual_JSON, files_individual_downloads],
+    yield gr.update(interactive=False), "Processing files...", {"process": "Processing files"}, "__init__.py"
+
     ## debug
     #logger.log(level=30, msg="pdf_files_inputs", extra={"input_arg[0]:": pdf_files[0]})
 
     #if not files:
     if not pdf_files or pdf_files is None: ## Check if files is None. This handles the case where no files are uploaded.
         logger.log(level=30, msg="Initialising ProcessPool: No files uploaded.", extra={"pdf_files": pdf_files, "files_len": pdf_files_count})
+        #outputs=[log_output, files_individual_JSON, files_individual_downloads],
+        return gr.update(interactive=True), "Initialising ProcessPool: No files uploaded.", {"Upload": "No files uploaded"}, "__init__.py"
 
     # Get config values if not provided
     config_file = find_file("config.ini") ##from file_handler.file_utils
 
     #global docextractor ##SMY: deprecated.
     try:
+        ##SMY: might deprecate. To replace with oauth login from Gradio ui or integrate cleanly.
         login_huggingface(api_token) ## attempt login if not already logged in. NB: HF CLI login prompt would not display in Process Worker.
+
+        if is_loggedin_huggingface() and (api_token is None or api_token == ""):
+            api_token = get_token()
+        else:
+            login_huggingface()
+        # login: Update the Gradio UI to improve user-friendly eXperience
+        yield gr.update(interactive=False), "login to HF: Processing files...", {"process": "Processing files"}, "__init__.py"
+
     except Exception as exc: # Catch all exceptions
         tb = traceback.format_exc()
         logger.exception(f"✗ Error during login_huggingface → {exc}\n{tb}", exc_info=True) # Log the full traceback
+        return gr.update(interactive=True), f"✗ An error occurred during login_huggingface → {exc}\n{tb}", {"Error": f"Error: {exc}"}, "__init__.py" # return the exception message
 
     try:
         # Create a pool with init_worker initialiser
 
         #result_convert = pool.map(pdf2md_converter.convert_files, pdf_files, max_retries)
         results = pool.map(pdf2md_converter.convert_files, pdf_files) ##SMY iterables #output_dir_string)
     except Exception as exc:
+        # Raise the exception to stop the Gradio app: exception to halt execution
         logger.exception("Error during pooling file conversion", exc_info=True) # Log the full traceback
         traceback.print_exc() # Print the exception traceback
+        #return f"An error occurred during pool.map: {str(exc)}", f"Error: {exc}", f"Error: {exc}" ## return the exception message
+        yield gr.update(interactive=True), f"An error occurred during pool.map: {str(exc)}", {"Error": f"Error: {exc}"}, "__init__.py" ## return the exception message
 
     #'''
+    try:
+        logger.log(level=20, msg="ProcessPoolExecutor pool result:", extra={"results": str(results)})
+        logs = []
+        logs_files_images = []
+        #logs.extend(results) ## performant pythonic
+        #logs = list[results] ##
+        logs = [result for result in results] ## pythonic list comprehension
+        ## logs : [file , images , filepath, image_path]
 
+        #logs_files_images = logs_files.extend(logs_images) #zip(logs_files, logs_images) ##SMY: in progress
+        logs_count = 0
+        #for log in logs:
+        for i, log in enumerate(logs):
+            logs_files_images.append(log.get("filepath") if is_dict(log) or is_list_of_dicts(logs) else "Error or no file_path") # isinstance(log, (dict, str))
+            logs_files_images.extend(list(image for image in log.get("image_path", "Error or no image_path")))
+            i_image = log.get("images", 0)
+            # Update the Gradio UI to improve user-friendly eXperience
+            yield gr.update(interactive=False), f"Processing files: {logs_files_images[logs_count]}", {"process": "Processing files"}, "__init__.py"
+            logs_count = i + i_image
+
+        #logs_files_images.append(logs_filepath) ## to del
+        #logs_files_images.extend(logs_images) ## to del
+    except Exception as exc:
+        tb = traceback.format_exc() # capture the traceback referenced in the messages below
+        logger.exception(f"Error during processing results logs → {exc}\n{tb}", exc_info=True) # Log the full traceback
+        traceback.print_exc() # Print the exception traceback
+        #return f"An error occurred during processing results logs: {str(exc)}\n{tb}", f"Error: {exc}", f"Error: {exc}" ## return the exception message
+        yield gr.update(interactive=True), f"An error occurred during processing results logs: {str(exc)}\n{tb}", {"Error": f"Error: {exc}"}, "__init__.py" ## return the exception message
 
     #'''
     except Exception as exc:
         tb = traceback.format_exc()
         logger.exception(f"✗ Error during ProcessPoolExecutor → {exc}\n{tb}" , exc_info=True) # Log the full traceback
         #traceback.print_exc() # Print the exception traceback
+        #return gr.update(interactive=True), f"✗ An error occurred during ProcessPoolExecutor → {exc}\n{tb}", f"Error: {exc}", f"Error: {exc}" # return the exception message
+        yield gr.update(interactive=True), f"✗ An error occurred during ProcessPoolExecutor → {exc}\n{tb}", {"Error": f"Error: {exc}"}, "__init__.py" # return the exception message
 
     '''
     logger.log(level=20, msg="ProcessPoolExecutor pool result:", extra={"results": str(results)})
     logs = [result for result in results] ## pythonic list comprehension
     '''
 
+    # Zip processed md files and images. Insert at first index
+    try: ##from file_handler.file_utils
+        zipped_processed_files = zip_processed_files(root_dir=f"data/{output_dir_string}", file_paths=logs_files_images, tz_hours=tz_hours, date_format='%d%b%Y')
+        logs_files_images.insert(0, zipped_processed_files)
+        #logs_files_images.insert(1, "====================")
+        yield gr.update(interactive=False), f"Processing zip and files: {logs_files_images}", {"process": "Processing files"}, "__init__.py"
+
+    except Exception as exc:
+        tb = traceback.format_exc()
+        logger.exception(f"✗ Error during zipping processed files → {exc}\n{tb}" , exc_info=True) # Log the full traceback
+        #traceback.print_exc() # Print the exception traceback
+        #return gr.update(interactive=True), f"✗ An error occurred during zipping files → {exc}\n{tb}", f"Error: {exc}", f"Error: {exc}" # return the exception message
+        yield gr.update(interactive=True), f"✗ An error occurred during zipping files → {exc}\n{tb}", {"Error": f"Error: {exc}"}, "__init__.py" # return the exception message
+
+
+    # Return processed files log
     try:
+        ## # Convert logs list of dicts to formatted json string
+        logs_return_formatted_json_string = file_handler.file_utils.process_dicts_data(logs) #"\n".join(log for log in logs) ##SMY outputs to gr.JSON component with no need for json.dumps(data, indent=)
         #logs_files_images_return = "\n".join(path for path in logs_files_images) ##TypeError: sequence item 0: expected str instance, WindowsPath found
 
         ##convert the List of Path objects to List of string for gr.Files output
 
         ## # Convert any Path objects to strings, but leave strings as-is
         logs_files_images_return = list(str(path) if isinstance(path, Path) else path for path in logs_files_images)
+        logger.log(level=20, msg="File conversion complete. Sending outcome to Gradio:", extra={"logs_files_image_return": str(logs_files_images_return)}) ## debug: FileNotFoundError: [WinError 2] The system cannot find the file specified: 'Error or no image_path'
+
+        #outputs=[process_button, log_output, files_individual_JSON, files_individual_downloads],
         #return "\n".join(logs), "\n".join(logs_files_images) #"\n".join(logs_files)
+        #return logs_return_formatted_json_string, logs_return_formatted_json_string, logs_files_images_return
+        #return gr.update(interactive=True), gr.update(value=logs_return_formatted_json_string), gr.update(value=logs_return_formatted_json_string, visible=True), gr.update(value=logs_files_images_return, visible=True)
+        yield gr.update(interactive=True), gr.update(), gr.update(visible=True), gr.update(visible=True)
+        yield gr.update(interactive=True), logs_return_formatted_json_string, logs_return_formatted_json_string, logs_files_images_return
+
     except Exception as exc:
         tb = traceback.format_exc()
         logger.exception(f"✗ Error during returning result logs → {exc}\n{tb}" , exc_info=True) # Log the full traceback
         #traceback.print_exc() # Print the exception traceback
+        return gr.update(interactive=True), f"✗ An error occurred during returning result logs → {exc}\n{tb}", {"Error": f"Error: {exc}"}, "__init__.py" # return the exception message
 
 
     #return "\n".join(log for log in logs), "\n".join(str(path) for path in logs_files_images)
 
     }
     """
 
+    ##SMY: flagged; to move to file_handler.file_utils
     def is_file_with_extension(path_obj: Path) -> bool:
         """
         Checks if a pathlib.Path object is a file and has a non-empty extension.
 
         path_obj = path_obj if isinstance(path_obj, Path) else Path(path_obj) if isinstance(path_obj, str) else None
         return path_obj.is_file() and bool(path_obj.suffix)
 
+    ##SMY: flagged; to move to file_handler.file_utils
     def accumulate_files(uploaded_files, current_state):
         """
         Accumulates newly uploaded files with the existing state.
 
         gr.Markdown(f"#### **Backend Configuration**")
         system_message = gr.Textbox(
             label="System Message",
+            lines=2,
         )
         with gr.Row():
             provider_dd = gr.Dropdown(
                 choices=["huggingface", "openai"],
                 label="Provider",
                 value="huggingface",
+                #allow_custom_value=True,
             )
             backend_choice = gr.Dropdown(
                 choices=["model-id", "provider", "endpoint"],
+                label="HF Backend Choice",
             ) ## SMY: ensure HFClient maps correctly
             model_tb = gr.Textbox(
                 label="Model ID",
 
             )
             endpoint_tb = gr.Textbox(
                 label="Endpoint",
+                placeholder="Optional custom endpoint",
             )
         with gr.Row():
             max_token_sl = gr.Slider(
 
                 minimum=1,
                 maximum=131172, #65536, #32768, #16384, #8192,
                 value=1024, #512,
+                step=1,
             )
             temperature_sl = gr.Slider(
                 label="Temperature",
                 minimum=0.0,
                 maximum=1.0,
                 value=0.0,
+                step=0.1, #0.01
             )
             top_p_sl = gr.Slider(
                 label="Top-p",
                 minimum=0.0,
                 maximum=1.0,
                 value=0.1,
+                step=0.1, #0.01
             )
+            with gr.Column():
+                stream_cb = gr.Checkbox(
+                    label="LLM Streaming",
+                    value=False,
+                )
+                #tz_hours_tb = gr.Textbox(value=None, label="TZ Hours", placeholder="Timezone in numbers", max_lines=1,)
+                tz_hours_num = gr.Number(label="TZ Hours", placeholder="Timezone in numbers", min_width=5,)
         with gr.Row():
             api_token_tb = gr.Textbox(
                 label="API Token [OPTIONAL]",
 
     # Initialise gr.State
     state_max_workers = gr.State(4) #max_workers_sl,
     state_max_retries = gr.State(2) #max_retries_sl,
+    state_tz_hours = gr.State(value=None)
 
     def update_state_stored_value(new_component_input):
         """ Updates stored state: use for max_workers and max_retries """
 
     # Update gr.State values on slider components change. NB: initial value of `gr.State` must be able to be deepcopied
     max_workers_sl.change(update_state_stored_value, inputs=max_workers_sl, outputs=state_max_workers)
     max_retries_sl.change(update_state_stored_value, inputs=max_retries_sl, outputs=state_max_retries)
+    tz_hours_num.change(update_state_stored_value, inputs=tz_hours_num, outputs=state_tz_hours)
 
     with gr.Accordion("🤗 HuggingFace Client Logout", open=True): #, open=False):
         # Logout controls
+        '''def do_logout():
             try:
                 #ok = docextractor.client.logout()
                 ok = docconverter.client.logout()
                 # Reset token textbox on successful logout
+                #msg = "✅ Logged out of HuggingFace and cleared tokens. Remember to log out of HuggingFace completely." if ok else "⚠️ Logout failed."
+                msg = "✅ Session cleared. Remember to close browser." if ok else "⚠️ Logout failed."
+                return gr.update(value=""), gr.update(visible=True, value=msg), gr.update(value="Sign in to HuggingFace 🤗"), gr.update(value="Clear session")
             except AttributeError:
+                msg = "⚠️ Logout."
+                return gr.update(value=""), gr.update(visible=True, value=msg), gr.update(value="Sign in to HuggingFace 🤗"), gr.update(value="Clear session", interactive=False)
+        '''
+        def do_logout_hf():
+            try:
+                ok = docconverter.client.logout()
+                # Reset token textbox on successful logout
+                msg = "✅ Session cleared. Remember to close browser." if ok else "⚠️ Logout & session cleared"
+                #return gr.update(value=""), gr.update(visible=True, value=msg), gr.update(value="Sign in to HuggingFace 🤗"), gr.update(value="Clear session", interactive=False)
+                #return msg
+                yield msg
+            except AttributeError:
+                msg = "⚠️ Logout. No HF session"
+                #return msg
+                yield msg
+
         def custom_do_logout():
+            #do_logout()
+            #return gr.update(value="Sign in to HuggingFace 🤗")
+            msg = do_logout_hf()
+            #return gr.update(value="Sign in to HuggingFace 🤗"), gr.update(value=""), gr.update(visible=True, value=msg)
+            yield gr.update(value="Sign in to HuggingFace 🤗"), gr.update(value=""), gr.update(visible=True, value=msg)
 
         logout_status = gr.Markdown(visible=False)
         with gr.Row():
+            hf_login_logout_btn = gr.LoginButton(value="Sign in to HuggingFace 🤗", logout_value="Clear Session & Logout of HF: ({})", variant="huggingface")
+            #logout_btn = gr.Button("Logout from session and Hugging Face (inference) Client", variant="stop", )
 
+            #hf_login_logout_btn.click(fn=custom_do_logout, inputs=None, outputs=hf_login_logout_btn)
+            hf_login_logout_btn.click(fn=custom_do_logout, inputs=None, outputs=[hf_login_logout_btn, api_token_tb, logout_status])
+            #logout_btn.click(fn=do_logout, inputs=None, outputs=[api_token_tb, logout_status, hf_login_logout_btn, logout_btn])
 
 
     # The gr.State component to hold the accumulated list of files
 
     '''
 
     # A Files component to display individual processed files as download links
+    with gr.Accordion("⏬ View and Download processed files", open=True): #, open=False
+        processed_file_state = gr.State([])
+
+        ##SMY: future
+        zip_btn = gr.DownloadButton("Download Zip file of all processed files", visible=False) #.Button()
+
+        # Placeholder to download zip file of processed files
+        download_zip_file = gr.File(label="Download processed Files (ZIP)", interactive=False, visible=False) #, height="1"
+
         with gr.Row():
+            files_individual_JSON = gr.JSON(label="Serialised JSON list", max_height=250, visible=False)
+            files_individual_downloads = gr.Files(label="Individual Processed Files", visible=False)
 
     ## Displays processed file paths
+    with gr.Accordion("View processing log", open=True): #open=False):
         log_output = gr.Textbox(
             label="Conversion Logs",
             lines=5,
             #max_lines=25,
+            #interactive=False
         )
 
     # file inputs
 
         output_dir_tb,
         use_llm_cb,
         page_range_tb,
+        tz_hours_num,
     ]
 
     ## debug
 
         #pdf_files.upload(
         fn=convert_batch,
         inputs=inputs_arg,
+        outputs=[process_button, log_output, files_individual_JSON, files_individual_downloads],
     )
     except Exception as exc:
         tb = traceback.format_exc()
         logger.exception(f"✗ Error during process_button.click → {exc}\n{tb}", exc_info=True)
+        msg = "✗ An error occurred during process_button.click" # →
+        #return f"✗ An error occurred during process_button.click → {exc}\n{tb}"
+        return gr.update(interactive=True), f"{msg} → {exc}\n{tb}", f"{msg} → {exc}", f"{msg} → {exc}"
 
     ##gr.File .upload() event, fire only after a file has been uploaded
     # Event handler for the pdf file upload button
 
     btn_pdf_convert.click(
         #pdf_files.upload(
         fn=convert_batch,
+        outputs=[btn_pdf_convert, log_output, files_individual_JSON, files_individual_downloads],
         inputs=inputs_arg,
+    )
     # )
 
     # reuse the same business logic for HTML tab
 
     btn_html_convert.click(
         fn=convert_batch,
         inputs=inputs_arg,
+        outputs=[btn_html_convert, log_output, files_individual_JSON, files_individual_downloads]
     )
 
     def get_file_count(file_list):
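
The convert_batch rewiring turns the click handler into a generator: each yield pushes an intermediate update to all four outputs (button state, log text, JSON view, files list), which is what produces the progress messages while the pool runs. The shape Gradio expects, as a minimal sketch (component names and do_work are illustrative, not from the repo):

    import gradio as gr

    def long_task(files):
        # one tuple per yield, matching outputs=[btn, log, json_out, files_out]
        yield gr.update(interactive=False), "Processing...", {"status": "running"}, []
        result = do_work(files)   # hypothetical work function
        yield gr.update(interactive=True), "Done", {"status": "done"}, result

    # btn.click(fn=long_task, inputs=[uploads], outputs=[btn, log, json_out, files_out])

One consequence worth flagging: a plain return of a tuple inside a generator ends the iteration without emitting that tuple, so the error branches above that return rather than yield will likely not surface their messages in the UI.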
utils/get_config.py CHANGED
@@ -9,7 +9,7 @@ import traceback
 import sys
 from pathlib import Path
 #base_grandparent = Path(__file__).resolve().parent.parent
-grandparent_dir = Path('.').resolve() #.parent.parent
 sys.path.insert(0, f"{grandparent_dir}") #\\file_handler")
 ##end debug
 #'''
 
 import sys
 from pathlib import Path
 #base_grandparent = Path(__file__).resolve().parent.parent
+grandparent_dir = Path('.').resolve() #.parent.parent ##unreliable
 sys.path.insert(0, f"{grandparent_dir}") #\\file_handler")
 ##end debug
 #'''
utils/logger.py CHANGED
@@ -45,7 +45,7 @@ class JsonFormatter(logging.Formatter):
         return json.dumps(payload, ensure_ascii=False)
 
 #def setup_logging(level: int = logging.INFO) -> None: ## Causing non-stop logging on HF spaces
-def setup_logging(level: int = None) -> None:
     """Configure root logger with JSON output to both stdout and file.
 
     Args:
@@ -66,7 +66,7 @@ def setup_logging(level: int = None) -> None:
     #file_handler = logging.FileHandler("logs/app_logging_scrap.log", mode="a", encoding="utf-8")
     #file_handler = logging.FileHandler("logs/app_logging.log", mode="a", encoding="utf-8")
     from file_handler.file_utils import check_create_logfile
-    file_handler = logging.FileHandler(check_create_logfile("app_logging.log"), mode="a", encoding="utf-8")
     ## Getting filepermission error
 
     file_handler.setFormatter(JsonFormatter())
 
         return json.dumps(payload, ensure_ascii=False)
 
 #def setup_logging(level: int = logging.INFO) -> None: ## Causing non-stop logging on HF spaces
+def setup_logging(level: int = None, tz_hours=None, date_format:str="%d%b%Y") -> None: #'%Y-%m-%d'
     """Configure root logger with JSON output to both stdout and file.
 
     Args:
 
     #file_handler = logging.FileHandler("logs/app_logging_scrap.log", mode="a", encoding="utf-8")
     #file_handler = logging.FileHandler("logs/app_logging.log", mode="a", encoding="utf-8")
     from file_handler.file_utils import check_create_logfile
+    file_handler = logging.FileHandler(check_create_logfile(filename="app_logging.log", tz_hours=tz_hours, date_format=date_format), mode="a", encoding="utf-8")
     ## Getting filepermission error
 
     file_handler.setFormatter(JsonFormatter())
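
Net effect of the new parameters: the log file name now embeds the date rendered by get_time_now_str, so a fresh file is opened per day for the chosen format. Illustratively, assuming the defaults above:

    # setup_logging(tz_hours=1, date_format="%d%b%Y")
    # -> logs/app_logging_27Sep2025.log   (%b abbreviates the month to "Sep")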
utils/utils.py CHANGED
@@ -1,3 +1,4 @@
 def is_dict(variable):
     """Checks if a variable is a dict."""
     if isinstance(variable, dict):
@@ -12,4 +13,26 @@ def is_list_of_dicts(variable):
         # Return True only if the list is empty or all elements are dicts.
         return all(isinstance(item, dict) for item in variable)
 
-    return False
+
 def is_dict(variable):
     """Checks if a variable is a dict."""
     if isinstance(variable, dict):
 
         # Return True only if the list is empty or all elements are dicts.
         return all(isinstance(item, dict) for item in variable)
 
+    return False
+
+
+def get_time_now_str(tz_hours=None, date_format:str='%Y-%m-%d'): #date_format:str='%d%b%Y'):
+    """Returns the current time in a specific format + local time: ("%Y-%m-%d %H:%M:%S.%f %Z")."""
+    from datetime import datetime, timezone, timedelta
+
+    # Get the current time or UTC time
+    if tz_hours is not None:
+        current_utc_time = datetime.now(tz=timezone.utc) + timedelta(hours=tz_hours)
+        current_time = current_utc_time
+    else:
+        current_time = datetime.now()
+
+    # Format the time as a string
+    #formatted_time = current_utc_time.strftime(date_format) #("%Y-%m-%d %H:%M:%S.%f %Z")
+    formatted_time = current_time.strftime(date_format) #("%Y-%m-%d %H:%M:%S.%f %Z")
+
+    #print(f"Current time: {formatted_time}") ##debug
+    return formatted_time
+
+#get_time_now_str() ##debug
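Usage of the new helper, for reference (values illustrative):

    >>> get_time_now_str()                                           # local time, default format
    '2025-09-27'
    >>> get_time_now_str(tz_hours=1, date_format='%d%b%Y_%H-%M-%S')  # UTC+1, zip-style stamp
    '27Sep2025_14-03-09'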