|
|
import logging |
|
|
import os |
|
|
import re |
|
|
import socket |
|
|
import tempfile |
|
|
import urllib.parse |
|
|
from datetime import datetime |
|
|
from pathlib import Path |
|
|
from typing import List |
|
|
|
|
|
import bleach |
|
|
from dotenv import load_dotenv |
|
|
from tldextract import TLDExtract |
|
|
|
|
|
from tools.secure_path_utils import ( |
|
|
secure_file_read, |
|
|
secure_path_join, |
|
|
validate_path_safety, |
|
|
) |
|
|
|
|
|
# Date stamp (YYYYMMDD) used below to build per-day log subfolder names.
today_rev = datetime.now().strftime("%Y%m%d")
# Machine hostname, used below to build per-host log subfolder names.
HOST_NAME = socket.gethostname()
|
|
|
|
|
|
|
|
def _get_env_list(env_var_name: str) -> List[str]:
    """Parse a comma-separated, optionally bracket-wrapped string into a list.

    Despite the parameter name, this receives the *value* of an environment
    variable (e.g. ``"['A','B']"`` or ``"A, B"``), not the variable's name.

    Args:
        env_var_name: Raw string value to parse.

    Returns:
        List of stripped, unquoted items; empty list for an empty value.
    """
    value = env_var_name.strip()
    # Only strip a surrounding [...] wrapper when it is actually present.
    # The previous code chopped the first and last characters
    # unconditionally, which silently corrupted plain "A,B"-style values.
    if value.startswith("[") and value.endswith("]"):
        value = value[1:-1]
    value = value.strip().replace('"', "").replace("'", "")
    if not value:
        return []

    return [s.strip() for s in value.split(",") if s.strip()]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def convert_string_to_boolean(value: str) -> bool:
    """Convert a string (or an existing bool) to a boolean.

    Accepts "true"/"false" in any casing and the digits "1"/"0", ignoring
    surrounding whitespace (values read from .env files often carry it).
    Booleans pass through unchanged.

    Args:
        value: The value to convert.

    Returns:
        The parsed boolean.

    Raises:
        ValueError: If the value is not a recognised boolean representation.
    """
    if isinstance(value, bool):
        return value
    # Normalise so "True", "TRUE", " true " etc. are all accepted.
    normalized = value.strip().lower() if isinstance(value, str) else value
    if normalized in ("true", "1"):
        return True
    if normalized in ("false", "0"):
        return False
    raise ValueError(f"Invalid boolean value: {value}")
|
|
|
|
|
|
|
|
def ensure_folder_within_app_directory(
    folder_path: str, app_base_dir: str = None
) -> str:
    """
    Ensure that a folder path is within the app directory for security.

    This function validates that user-defined folder paths are contained within
    the app directory to prevent path traversal attacks and ensure data isolation.

    Args:
        folder_path: The folder path to validate and normalize
        app_base_dir: The base directory of the app (defaults to current working directory)

    Returns:
        A normalized folder path that is guaranteed to be within the app directory

    Raises:
        ValueError: If the path cannot be safely contained within the app directory
    """
    # Empty/whitespace-only paths are passed through untouched.
    if not folder_path or not folder_path.strip():
        return folder_path

    if app_base_dir is None:
        app_base_dir = os.getcwd()

    app_base_dir = Path(app_base_dir).resolve()
    folder_path = folder_path.strip()

    # Remember whether the caller supplied a trailing separator so it can be
    # restored on the normalized result (resolve() drops it).
    has_trailing_sep = folder_path.endswith(("/", "\\"))

    # "TEMP" is a sentinel handled by the caller (see the module-level
    # OUTPUT_FOLDER/INPUT_FOLDER handling) — never rewrite it.
    if folder_path == "TEMP":
        return folder_path

    if os.path.isabs(folder_path):
        folder_path_resolved = Path(folder_path).resolve()

        try:
            # Raises ValueError when folder_path is not under app_base_dir.
            folder_path_resolved.relative_to(app_base_dir)

            result = str(folder_path_resolved)
            if has_trailing_sep and not result.endswith(os.sep):
                result = result + os.sep
            return result
        except ValueError:
            # Absolute path outside the app directory: allow well-known
            # system locations (warn only), reject everything else.
            normalized_path = os.path.normpath(folder_path).lower()
            system_path_prefixes = [
                "/usr",
                "/opt",
                "/var",
                "/etc",
                "/tmp",
            ]
            if any(
                normalized_path.startswith(prefix) for prefix in system_path_prefixes
            ):
                # Deliberately permissive for e.g. /usr/share/tessdata.
                print(
                    f"Warning: Using system path outside app directory: {folder_path}"
                )
                return folder_path
            else:
                raise ValueError(
                    f"Folder path '{folder_path}' is outside the app directory '{app_base_dir}'. "
                    f"For security, all user-defined folder paths must be within the app directory."
                )

    # Relative path: join it safely under the app base directory.
    try:
        safe_path = secure_path_join(app_base_dir, folder_path)
        result = str(safe_path)
        if has_trailing_sep and not result.endswith(os.sep):
            result = result + os.sep
        return result
    except (PermissionError, ValueError) as e:
        # The full relative path was rejected (e.g. contained traversal
        # segments) — fall back to just the final path component.
        folder_name = os.path.basename(folder_path.rstrip("/\\"))
        if folder_name:
            safe_path = secure_path_join(app_base_dir, folder_name)
            result = str(safe_path)
            if has_trailing_sep and not result.endswith(os.sep):
                result = result + os.sep
            print(
                f"Warning: Sanitized folder path '{folder_path}' to '{result}' for security"
            )
            return result
        else:
            raise ValueError(
                f"Cannot safely normalize folder path: {folder_path}"
            ) from e
|
|
|
|
|
|
|
|
def get_or_create_env_var(var_name: str, default_value: str, print_val: bool = False):
    """Return the value of *var_name*, seeding it with *default_value* if unset.

    When the variable is missing it is written back into ``os.environ`` so
    that later lookups (and any child processes) see the same value.

    Args:
        var_name: Name of the environment variable.
        default_value: Value to store and return when the variable is unset.
        print_val: When True, echo the resolved value to stdout.

    Returns:
        The effective value of the environment variable.
    """
    # setdefault both stores the default (when missing) and returns the
    # effective value in a single step.
    resolved = os.environ.setdefault(var_name, default_value)

    if print_val is True:
        print(f"The value of {var_name} is {resolved}")

    return resolved
|
|
|
|
|
|
|
|
def add_folder_to_path(folder_path: str):
    """
    Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist. Function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)

    Args:
        folder_path: Folder to prepend to the PATH environment variable.
    """
    if os.path.exists(folder_path) and os.path.isdir(folder_path):
        absolute_path = os.path.abspath(folder_path)

        # PATH may be unset in minimal environments; treat that as empty
        # instead of raising KeyError like os.environ["PATH"] would.
        current_path = os.environ.get("PATH", "")
        # Compare against whole entries so a substring of an existing entry
        # does not count as "already present".
        if absolute_path not in current_path.split(os.pathsep):
            os.environ["PATH"] = absolute_path + os.pathsep + current_path
    else:
        print(f"Folder not found at {folder_path} - not added to PATH")
|
|
|
|
|
|
|
|
def validate_safe_url(url_candidate: str, allowed_domains: list = None) -> str:
    """
    Validate and return a safe URL with enhanced security checks.

    Args:
        url_candidate: URL to validate.
        allowed_domains: Domains (and their subdomains) that are permitted;
            defaults to the project's known-good hosts.

    Returns:
        The original URL when it passes all checks, otherwise the project's
        default documentation URL as a safe fallback.
    """
    if allowed_domains is None:
        allowed_domains = [
            "seanpedrick-case.github.io",
            "github.io",
            "github.com",
            "sharepoint.com",
        ]

    try:
        parsed = urllib.parse.urlparse(url_candidate)

        # Must have both a scheme and a network location.
        if not parsed.scheme or not parsed.netloc:
            raise ValueError("Invalid URL structure")

        # Only encrypted transport is accepted.
        if parsed.scheme not in ["https"]:
            raise ValueError("Only HTTPS URLs are allowed for security")

        domain = parsed.netloc.lower()
        # Match either the allowed domain itself or a true subdomain of it.
        # A bare endswith() check would also accept look-alike hosts such as
        # "evilgithub.io", which merely *end with* an allowed domain.
        if not any(
            domain == allowed or domain.endswith("." + allowed)
            for allowed in allowed_domains
        ):
            raise ValueError(f"Domain not in allowed list: {domain}")

        # Reject obviously malformed or scheme-smuggling hostnames.
        if any(
            suspicious in domain for suspicious in ["..", "//", "javascript:", "data:"]
        ):
            raise ValueError("Suspicious URL patterns detected")

        # Reject traversal-like path segments.
        if ".." in parsed.path or "//" in parsed.path:
            raise ValueError("Path traversal attempts detected")

        return url_candidate

    except Exception as e:
        # Deliberate fail-safe: any validation problem falls back to the
        # known-good documentation URL rather than propagating an error.
        print(f"URL validation failed: {e}")
        return "https://seanpedrick-case.github.io/doc_redaction"
|
|
|
|
|
|
|
|
def sanitize_markdown_text(text: str) -> str:
    """
    Sanitize markdown text by removing dangerous HTML/scripts while preserving
    safe markdown syntax.

    Args:
        text: Raw markdown/HTML text; non-string or empty input yields "".

    Returns:
        The cleaned text with leading/trailing whitespace stripped.
    """
    if not text or not isinstance(text, str):
        return ""

    # Whitelist of tags bleach will keep; everything else is removed
    # (strip=True drops the tags instead of escaping them).
    allowed_tags = [
        "a",
        "b",
        "strong",
        "em",
        "i",
        "u",
        "code",
        "pre",
        "blockquote",
        "ul",
        "ol",
        "li",
        "p",
        "br",
        "hr",
    ]
    allowed_attributes = {"a": ["href", "title", "rel"]}

    text = bleach.clean(
        text, tags=allowed_tags, attributes=allowed_attributes, strip=True
    )

    # Defense-in-depth after the bleach pass: drop any surviving
    # iframe/object/embed elements together with their contents.
    text = re.sub(
        r"<(iframe|object|embed)[^>]*>.*?</\1>",
        "",
        text,
        flags=re.IGNORECASE | re.DOTALL,
    )

    # Strip inline event handlers such as onclick="..." / onload='...'.
    text = re.sub(r'\s*on\w+\s*=\s*["\'][^"\']*["\']', "", text, flags=re.IGNORECASE)

    # Neutralise markdown links whose target is a javascript: or data: URL,
    # keeping the link text but dropping the URL part.
    text = re.sub(
        r"\[([^\]]+)\]\(javascript:[^\)]+\)", r"[\1]", text, flags=re.IGNORECASE
    )
    text = re.sub(r"\[([^\]]+)\]\(data:[^\)]+\)", r"[\1]", text, flags=re.IGNORECASE)

    # Remove inline style and the most common event-handler attributes.
    text = re.sub(
        r'\s*(style|onerror|onload|onclick)\s*=\s*["\'][^"\']*["\']',
        "",
        text,
        flags=re.IGNORECASE,
    )

    return text.strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Base folder for configuration files, constrained to the app directory.
CONFIG_FOLDER = get_or_create_env_var("CONFIG_FOLDER", "config/")
CONFIG_FOLDER = ensure_folder_within_app_directory(CONFIG_FOLDER)

# Optional .env file with app-level settings; when present its contents are
# loaded into the process environment before the settings below are read.
APP_CONFIG_PATH = get_or_create_env_var(
    "APP_CONFIG_PATH", CONFIG_FOLDER + "app_config.env"
)

if APP_CONFIG_PATH:
    if os.path.exists(APP_CONFIG_PATH):
        print(f"Loading app variables from config file {APP_CONFIG_PATH}")
        load_dotenv(APP_CONFIG_PATH)
    else:
        print("App config file not found at location:", APP_CONFIG_PATH)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Optional separate .env file holding AWS credentials/settings.
AWS_CONFIG_PATH = get_or_create_env_var(
    "AWS_CONFIG_PATH", ""
)

if AWS_CONFIG_PATH:
    if os.path.exists(AWS_CONFIG_PATH):
        print(f"Loading AWS variables from config file {AWS_CONFIG_PATH}")
        load_dotenv(AWS_CONFIG_PATH)
    else:
        print("AWS config file not found at location:", AWS_CONFIG_PATH)

# Master switch for AWS-backed functionality.
RUN_AWS_FUNCTIONS = convert_string_to_boolean(
    get_or_create_env_var("RUN_AWS_FUNCTIONS", "False")
)

AWS_REGION = get_or_create_env_var("AWS_REGION", "")

# Cognito client/user-pool identifiers (empty = not configured).
AWS_CLIENT_ID = get_or_create_env_var("AWS_CLIENT_ID", "")

AWS_CLIENT_SECRET = get_or_create_env_var("AWS_CLIENT_SECRET", "")

AWS_USER_POOL_ID = get_or_create_env_var("AWS_USER_POOL_ID", "")

# Static credentials; prefer SSO/instance roles where possible.
AWS_ACCESS_KEY = get_or_create_env_var("AWS_ACCESS_KEY", "")

AWS_SECRET_KEY = get_or_create_env_var("AWS_SECRET_KEY", "")

# Default S3 bucket used by the app.
DOCUMENT_REDACTION_BUCKET = get_or_create_env_var("DOCUMENT_REDACTION_BUCKET", "")

# When True, SSO/role credentials win over the env access keys above.
PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS = convert_string_to_boolean(
    get_or_create_env_var("PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS", "True")
)

# Optional custom HTTP header name/value (e.g. required by a proxy or WAF).
CUSTOM_HEADER = get_or_create_env_var("CUSTOM_HEADER", "")

CUSTOM_HEADER_VALUE = get_or_create_env_var("CUSTOM_HEADER_VALUE", "")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Resolution used when rasterising PDF pages to images.
IMAGES_DPI = float(get_or_create_env_var("IMAGES_DPI", "300.0"))
# Whether truncated image files should be tolerated rather than erroring.
LOAD_TRUNCATED_IMAGES = convert_string_to_boolean(
    get_or_create_env_var("LOAD_TRUNCATED_IMAGES", "True")
)
# Optional pixel-count cap for loaded images; empty string means unset here
# (interpretation is up to the consuming code).
MAX_IMAGE_PIXELS = get_or_create_env_var(
    "MAX_IMAGE_PIXELS", ""
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# When True, outputs are written into per-session subfolders.
SESSION_OUTPUT_FOLDER = convert_string_to_boolean(
    get_or_create_env_var("SESSION_OUTPUT_FOLDER", "False")
)

# Gradio input/output folders; "TEMP" is a sentinel handled further below.
OUTPUT_FOLDER = get_or_create_env_var("GRADIO_OUTPUT_FOLDER", "output/")
INPUT_FOLDER = get_or_create_env_var("GRADIO_INPUT_FOLDER", "input/")

# Whether generated outputs should also be uploaded to S3.
SAVE_OUTPUTS_TO_S3 = convert_string_to_boolean(
    get_or_create_env_var("SAVE_OUTPUTS_TO_S3", "False")
)

# S3 key prefix and bucket for uploaded outputs (bucket defaults to the
# main document-redaction bucket).
S3_OUTPUTS_FOLDER = get_or_create_env_var("S3_OUTPUTS_FOLDER", "")

S3_OUTPUTS_BUCKET = get_or_create_env_var(
    "S3_OUTPUTS_BUCKET", DOCUMENT_REDACTION_BUCKET
)
|
|
|
|
|
|
|
|
# "TEMP" requests a throwaway directory for inputs and/or outputs.
if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
    # NOTE: the previous `with tempfile.TemporaryDirectory()` deleted the
    # directory as soon as the with-block exited, leaving OUTPUT_FOLDER /
    # INPUT_FOLDER pointing at a path that no longer existed. mkdtemp()
    # creates a directory that persists for the lifetime of the process.
    temp_dir = tempfile.mkdtemp()
    print(f"Temporary directory created at: {temp_dir}")

    if OUTPUT_FOLDER == "TEMP":
        OUTPUT_FOLDER = temp_dir + "/"
    if INPUT_FOLDER == "TEMP":
        INPUT_FOLDER = temp_dir + "/"
else:
    # Non-temporary folders must resolve to somewhere inside the app
    # directory for security.
    OUTPUT_FOLDER = ensure_folder_within_app_directory(OUTPUT_FOLDER)
    INPUT_FOLDER = ensure_folder_within_app_directory(INPUT_FOLDER)
|
|
|
|
|
# Optional override for Gradio's temp-file directory; validated when set.
GRADIO_TEMP_DIR = get_or_create_env_var(
    "GRADIO_TEMP_DIR", ""
)
if GRADIO_TEMP_DIR:
    GRADIO_TEMP_DIR = ensure_folder_within_app_directory(GRADIO_TEMP_DIR)
# Optional override for Matplotlib's config/cache directory.
MPLCONFIGDIR = get_or_create_env_var("MPLCONFIGDIR", "")
if MPLCONFIGDIR:
    MPLCONFIGDIR = ensure_folder_within_app_directory(MPLCONFIGDIR)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Whether feedback/access/usage logs are written to local CSV files.
SAVE_LOGS_TO_CSV = convert_string_to_boolean(
    get_or_create_env_var("SAVE_LOGS_TO_CSV", "True")
)

# When True, logs are split into per-day/per-host subfolders (see below).
USE_LOG_SUBFOLDERS = convert_string_to_boolean(
    get_or_create_env_var("USE_LOG_SUBFOLDERS", "True")
)

# Base folders for the three log categories.
FEEDBACK_LOGS_FOLDER = get_or_create_env_var("FEEDBACK_LOGS_FOLDER", "feedback/")
ACCESS_LOGS_FOLDER = get_or_create_env_var("ACCESS_LOGS_FOLDER", "logs/")
USAGE_LOGS_FOLDER = get_or_create_env_var("USAGE_LOGS_FOLDER", "usage/")

# All log folders must live inside the app directory.
FEEDBACK_LOGS_FOLDER = ensure_folder_within_app_directory(FEEDBACK_LOGS_FOLDER)
ACCESS_LOGS_FOLDER = ensure_folder_within_app_directory(ACCESS_LOGS_FOLDER)
USAGE_LOGS_FOLDER = ensure_folder_within_app_directory(USAGE_LOGS_FOLDER)
|
|
|
|
|
# When enabled, append a per-day / per-host subfolder to each log path.
if USE_LOG_SUBFOLDERS:
    day_log_subfolder = today_rev + "/"
    host_name_subfolder = HOST_NAME + "/"
    full_log_subfolder = day_log_subfolder + host_name_subfolder

    FEEDBACK_LOGS_FOLDER = FEEDBACK_LOGS_FOLDER + full_log_subfolder
    ACCESS_LOGS_FOLDER = ACCESS_LOGS_FOLDER + full_log_subfolder
    USAGE_LOGS_FOLDER = USAGE_LOGS_FOLDER + full_log_subfolder
else:
    # Previously left undefined when subfolders were disabled, which made
    # the S3 prefix defaults below raise NameError. An empty suffix keeps
    # the S3 prefixes flat in that case.
    full_log_subfolder = ""

# Re-validate after the subfolder suffix has been appended.
FEEDBACK_LOGS_FOLDER = ensure_folder_within_app_directory(FEEDBACK_LOGS_FOLDER)
ACCESS_LOGS_FOLDER = ensure_folder_within_app_directory(ACCESS_LOGS_FOLDER)
USAGE_LOGS_FOLDER = ensure_folder_within_app_directory(USAGE_LOGS_FOLDER)

# S3 key prefixes used when logs are uploaded.
S3_FEEDBACK_LOGS_FOLDER = get_or_create_env_var(
    "S3_FEEDBACK_LOGS_FOLDER", "feedback/" + full_log_subfolder
)
S3_ACCESS_LOGS_FOLDER = get_or_create_env_var(
    "S3_ACCESS_LOGS_FOLDER", "logs/" + full_log_subfolder
)
S3_USAGE_LOGS_FOLDER = get_or_create_env_var(
    "S3_USAGE_LOGS_FOLDER", "usage/" + full_log_subfolder
)
|
|
|
|
|
|
|
|
# When False, file names are omitted from logs for privacy.
DISPLAY_FILE_NAMES_IN_LOGS = convert_string_to_boolean(
    get_or_create_env_var("DISPLAY_FILE_NAMES_IN_LOGS", "False")
)

# Optional custom column headers for the CSV log files (empty = defaults
# chosen by the logging code).
CSV_ACCESS_LOG_HEADERS = get_or_create_env_var(
    "CSV_ACCESS_LOG_HEADERS", ""
)
CSV_FEEDBACK_LOG_HEADERS = get_or_create_env_var(
    "CSV_FEEDBACK_LOG_HEADERS", ""
)
CSV_USAGE_LOG_HEADERS = get_or_create_env_var(
    "CSV_USAGE_LOG_HEADERS",
    '["session_hash_textbox", "doc_full_file_name_textbox", "data_full_file_name_textbox", "actual_time_taken_number", "total_page_count", "textract_query_number", "pii_detection_method", "comprehend_query_number", "cost_code", "textract_handwriting_signature", "host_name_textbox", "text_extraction_method", "is_this_a_textract_api_call", "task"]',
)

# Optional mirroring of logs into DynamoDB tables.
SAVE_LOGS_TO_DYNAMODB = convert_string_to_boolean(
    get_or_create_env_var("SAVE_LOGS_TO_DYNAMODB", "False")
)

ACCESS_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
    "ACCESS_LOG_DYNAMODB_TABLE_NAME", "redaction_access_log"
)
DYNAMODB_ACCESS_LOG_HEADERS = get_or_create_env_var("DYNAMODB_ACCESS_LOG_HEADERS", "")

FEEDBACK_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
    "FEEDBACK_LOG_DYNAMODB_TABLE_NAME", "redaction_feedback"
)
DYNAMODB_FEEDBACK_LOG_HEADERS = get_or_create_env_var(
    "DYNAMODB_FEEDBACK_LOG_HEADERS", ""
)

USAGE_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
    "USAGE_LOG_DYNAMODB_TABLE_NAME", "redaction_usage"
)
DYNAMODB_USAGE_LOG_HEADERS = get_or_create_env_var("DYNAMODB_USAGE_LOG_HEADERS", "")
|
|
|
|
|
|
|
|
# Enables Python's logging module at INFO level for the whole process.
LOGGING = convert_string_to_boolean(get_or_create_env_var("LOGGING", "False"))

if LOGGING:
    logging.basicConfig(
        level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
    )

# CSV file names for the log writers; usage/feedback default to the same
# name as the general log file.
LOG_FILE_NAME = get_or_create_env_var("LOG_FILE_NAME", "log.csv")
USAGE_LOG_FILE_NAME = get_or_create_env_var("USAGE_LOG_FILE_NAME", LOG_FILE_NAME)
FEEDBACK_LOG_FILE_NAME = get_or_create_env_var("FEEDBACK_LOG_FILE_NAME", LOG_FILE_NAME)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Browser-tab icon for the web UI.
FAVICON_PATH = get_or_create_env_var("FAVICON_PATH", "favicon.png")

# Optional FastAPI / MCP server front-ends.
RUN_FASTAPI = convert_string_to_boolean(get_or_create_env_var("RUN_FASTAPI", "False"))

RUN_MCP_SERVER = convert_string_to_boolean(
    get_or_create_env_var("RUN_MCP_SERVER", "False")
)

# Gradio request-queue depth.
MAX_QUEUE_SIZE = int(get_or_create_env_var("MAX_QUEUE_SIZE", "5"))

# Upload size limit as a Gradio-style size string (e.g. "250mb").
MAX_FILE_SIZE = get_or_create_env_var("MAX_FILE_SIZE", "250mb").lower()

# Bind address/port for the Gradio server (loopback by default).
GRADIO_SERVER_NAME = get_or_create_env_var(
    "GRADIO_SERVER_NAME", "127.0.0.1"
)

GRADIO_SERVER_PORT = int(get_or_create_env_var("GRADIO_SERVER_PORT", "7860"))

# CORS / host allow-lists (empty = framework defaults).
ALLOWED_ORIGINS = get_or_create_env_var(
    "ALLOWED_ORIGINS", ""
)

ALLOWED_HOSTS = get_or_create_env_var("ALLOWED_HOSTS", "")

# URL path prefixes when the app runs behind a reverse proxy.
ROOT_PATH = get_or_create_env_var("ROOT_PATH", "")
FASTAPI_ROOT_PATH = get_or_create_env_var("FASTAPI_ROOT_PATH", "/")

# Max simultaneous executions per Gradio event handler.
DEFAULT_CONCURRENCY_LIMIT = int(get_or_create_env_var("DEFAULT_CONCURRENCY_LIMIT", "3"))

# Processing limits; the large defaults effectively mean "no limit".
PAGE_BREAK_VALUE = int(get_or_create_env_var("PAGE_BREAK_VALUE", "99999"))

MAX_TIME_VALUE = int(get_or_create_env_var("MAX_TIME_VALUE", "999999"))
MAX_SIMULTANEOUS_FILES = int(get_or_create_env_var("MAX_SIMULTANEOUS_FILES", "10"))
MAX_DOC_PAGES = int(get_or_create_env_var("MAX_DOC_PAGES", "3000"))
MAX_TABLE_ROWS = int(get_or_create_env_var("MAX_TABLE_ROWS", "250000"))
MAX_TABLE_COLUMNS = int(get_or_create_env_var("MAX_TABLE_COLUMNS", "100"))
MAX_OPEN_TEXT_CHARACTERS = int(
    get_or_create_env_var("MAX_OPEN_TEXT_CHARACTERS", "50000")
)
|
|
|
|
|
|
|
|
# Whether existing redaction annotations in a PDF should be loaded.
LOAD_REDACTION_ANNOTATIONS_FROM_PDF = convert_string_to_boolean(
    get_or_create_env_var("LOAD_REDACTION_ANNOTATIONS_FROM_PDF", "True")
)

# Optional local Tesseract binary folder (e.g. for pyinstaller bundles);
# validated and prepended to PATH when set.
TESSERACT_FOLDER = get_or_create_env_var(
    "TESSERACT_FOLDER", ""
)
if TESSERACT_FOLDER:
    TESSERACT_FOLDER = ensure_folder_within_app_directory(TESSERACT_FOLDER)
    add_folder_to_path(TESSERACT_FOLDER)

# Tesseract language-data folder; the default is a standard system path.
TESSERACT_DATA_FOLDER = get_or_create_env_var(
    "TESSERACT_DATA_FOLDER", "/usr/share/tessdata"
)

# Only relative overrides are forced inside the app directory; absolute
# system paths (like the default) are left alone.
if TESSERACT_DATA_FOLDER and not os.path.isabs(TESSERACT_DATA_FOLDER):
    TESSERACT_DATA_FOLDER = ensure_folder_within_app_directory(TESSERACT_DATA_FOLDER)

# Optional local Poppler binary folder, handled like TESSERACT_FOLDER.
POPPLER_FOLDER = get_or_create_env_var(
    "POPPLER_FOLDER", ""
)
if POPPLER_FOLDER:
    POPPLER_FOLDER = ensure_folder_within_app_directory(POPPLER_FOLDER)
    add_folder_to_path(POPPLER_FOLDER)
|
|
|
|
|
|
|
|
# Whether the extraction/PII options accordion starts expanded in the UI.
EXTRACTION_AND_PII_OPTIONS_OPEN_BY_DEFAULT = convert_string_to_boolean(
    get_or_create_env_var("EXTRACTION_AND_PII_OPTIONS_OPEN_BY_DEFAULT", "True")
)

# Display labels for the text-extraction choices shown in the UI.
SELECTABLE_TEXT_EXTRACT_OPTION = get_or_create_env_var(
    "SELECTABLE_TEXT_EXTRACT_OPTION", "Local model - selectable text"
)
TESSERACT_TEXT_EXTRACT_OPTION = get_or_create_env_var(
    "TESSERACT_TEXT_EXTRACT_OPTION", "Local OCR model - PDFs without selectable text"
)
TEXTRACT_TEXT_EXTRACT_OPTION = get_or_create_env_var(
    "TEXTRACT_TEXT_EXTRACT_OPTION", "AWS Textract service - all PDF types"
)

# Display labels for the PII-detection choices shown in the UI.
NO_REDACTION_PII_OPTION = get_or_create_env_var(
    "NO_REDACTION_PII_OPTION", "Only extract text (no redaction)"
)
LOCAL_PII_OPTION = get_or_create_env_var("LOCAL_PII_OPTION", "Local")
AWS_PII_OPTION = get_or_create_env_var("AWS_PII_OPTION", "AWS Comprehend")
|
|
|
|
|
# Toggles controlling which extraction back-ends appear in the UI.
SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS = convert_string_to_boolean(
    get_or_create_env_var("SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS", "True")
)
SHOW_AWS_TEXT_EXTRACTION_OPTIONS = convert_string_to_boolean(
    get_or_create_env_var("SHOW_AWS_TEXT_EXTRACTION_OPTIONS", "True")
)

# At least one group must be visible; fall back to local options.
if not SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS and not SHOW_AWS_TEXT_EXTRACTION_OPTIONS:
    SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS = True

local_model_options = list()
aws_model_options = list()
# NOTE(review): text_extraction_models appears unused below — the combined
# list is built directly from the two option lists.
text_extraction_models = list()

if SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS:
    local_model_options.append(SELECTABLE_TEXT_EXTRACT_OPTION)
    local_model_options.append(TESSERACT_TEXT_EXTRACT_OPTION)

if SHOW_AWS_TEXT_EXTRACTION_OPTIONS:
    aws_model_options.append(TEXTRACT_TEXT_EXTRACT_OPTION)

# Final list of text-extraction choices offered to the user.
TEXT_EXTRACTION_MODELS = local_model_options + aws_model_options
# Whether tabular data gets an initial cleaning pass before anonymisation.
DO_INITIAL_TABULAR_DATA_CLEAN = convert_string_to_boolean(
    get_or_create_env_var("DO_INITIAL_TABULAR_DATA_CLEAN", "True")
)
|
|
|
|
|
# Toggles controlling which PII-detection back-ends appear in the UI.
SHOW_LOCAL_PII_DETECTION_OPTIONS = convert_string_to_boolean(
    get_or_create_env_var("SHOW_LOCAL_PII_DETECTION_OPTIONS", "True")
)
SHOW_AWS_PII_DETECTION_OPTIONS = convert_string_to_boolean(
    get_or_create_env_var("SHOW_AWS_PII_DETECTION_OPTIONS", "True")
)

# At least one group must be visible; fall back to local options.
if not SHOW_LOCAL_PII_DETECTION_OPTIONS and not SHOW_AWS_PII_DETECTION_OPTIONS:
    SHOW_LOCAL_PII_DETECTION_OPTIONS = True

# "No redaction" is always offered regardless of the toggles.
local_model_options = [NO_REDACTION_PII_OPTION]
aws_model_options = list()
# NOTE(review): pii_detection_models appears unused below — the combined
# list is built directly from the two option lists.
pii_detection_models = list()

if SHOW_LOCAL_PII_DETECTION_OPTIONS:
    local_model_options.append(LOCAL_PII_OPTION)

if SHOW_AWS_PII_DETECTION_OPTIONS:
    aws_model_options.append(AWS_PII_OPTION)

# Final list of PII-detection choices offered to the user.
PII_DETECTION_MODELS = local_model_options + aws_model_options
|
|
|
|
|
# Pick a sensible default extraction model depending on what is visible.
if SHOW_AWS_TEXT_EXTRACTION_OPTIONS:
    DEFAULT_TEXT_EXTRACTION_MODEL = get_or_create_env_var(
        "DEFAULT_TEXT_EXTRACTION_MODEL", TEXTRACT_TEXT_EXTRACT_OPTION
    )
else:
    DEFAULT_TEXT_EXTRACTION_MODEL = get_or_create_env_var(
        "DEFAULT_TEXT_EXTRACTION_MODEL", SELECTABLE_TEXT_EXTRACT_OPTION
    )

# Same for the default PII-detection model.
if SHOW_AWS_PII_DETECTION_OPTIONS:
    DEFAULT_PII_DETECTION_MODEL = get_or_create_env_var(
        "DEFAULT_PII_DETECTION_MODEL", AWS_PII_OPTION
    )
else:
    DEFAULT_PII_DETECTION_MODEL = get_or_create_env_var(
        "DEFAULT_PII_DETECTION_MODEL", LOCAL_PII_OPTION
    )

# Tabular redaction always redacts, so the "no redaction" option is
# removed from its model list (copy to avoid mutating the shared list).
TABULAR_PII_DETECTION_MODELS = PII_DETECTION_MODELS.copy()
if NO_REDACTION_PII_OPTION in TABULAR_PII_DETECTION_MODELS:
    TABULAR_PII_DETECTION_MODELS.remove(NO_REDACTION_PII_OPTION)

# Default column/sheet selections, stored as stringified lists.
DEFAULT_TEXT_COLUMNS = get_or_create_env_var("DEFAULT_TEXT_COLUMNS", "[]")
DEFAULT_EXCEL_SHEETS = get_or_create_env_var("DEFAULT_EXCEL_SHEETS", "[]")

DEFAULT_TABULAR_ANONYMISATION_STRATEGY = get_or_create_env_var(
    "DEFAULT_TABULAR_ANONYMISATION_STRATEGY", "redact completely"
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Whether vision-language-model (VLM) OCR options appear in the UI.
SHOW_VLM_MODEL_OPTIONS = convert_string_to_boolean(
    get_or_create_env_var("SHOW_VLM_MODEL_OPTIONS", "False")
)

# The VLM used when VLM-based OCR is selected.
SELECTED_MODEL = get_or_create_env_var(
    "SELECTED_MODEL", "Qwen3-VL-4B-Instruct"
)

# NOTE(review): VLM_MODEL_OPTIONS is only defined when the flag is True —
# confirm no later code reads it unconditionally.
if SHOW_VLM_MODEL_OPTIONS:
    VLM_MODEL_OPTIONS = [
        SELECTED_MODEL,
    ]
|
|
|
|
|
# Max GPU time (seconds) per call when running on Hugging Face Spaces.
MAX_SPACES_GPU_RUN_TIME = int(
    get_or_create_env_var("MAX_SPACES_GPU_RUN_TIME", "60")
)

# Token-generation budgets for VLM calls.
MAX_NEW_TOKENS = int(
    get_or_create_env_var("MAX_NEW_TOKENS", "4096")
)

DEFAULT_MAX_NEW_TOKENS = int(
    get_or_create_env_var("DEFAULT_MAX_NEW_TOKENS", "4096")
)

# Much smaller budget for per-word hybrid OCR corrections.
HYBRID_OCR_MAX_NEW_TOKENS = int(
    get_or_create_env_var("HYBRID_OCR_MAX_NEW_TOKENS", "30")
)

MAX_INPUT_TOKEN_LENGTH = int(
    get_or_create_env_var("MAX_INPUT_TOKEN_LENGTH", "8192")
)

# Pixel-count bounds used when resizing images for the VLM.
VLM_MAX_IMAGE_SIZE = int(
    get_or_create_env_var("VLM_MAX_IMAGE_SIZE", "819200")
)

VLM_MIN_IMAGE_SIZE = int(
    get_or_create_env_var("VLM_MIN_IMAGE_SIZE", "614400")
)

VLM_MAX_DPI = float(
    get_or_create_env_var("VLM_MAX_DPI", "300.0")
)

# Model-loading options.
USE_FLASH_ATTENTION = convert_string_to_boolean(
    get_or_create_env_var("USE_FLASH_ATTENTION", "False")
)

QUANTISE_VLM_MODELS = convert_string_to_boolean(
    get_or_create_env_var("QUANTISE_VLM_MODELS", "False")
)

# Whether raw VLM outputs are surfaced in the UI.
REPORT_VLM_OUTPUTS_TO_GUI = convert_string_to_boolean(
    get_or_create_env_var("REPORT_VLM_OUTPUTS_TO_GUI", "False")
)

# Whether cached OCR results are recomputed rather than reused.
OVERWRITE_EXISTING_OCR_RESULTS = convert_string_to_boolean(
    get_or_create_env_var("OVERWRITE_EXISTING_OCR_RESULTS", "False")
)
|
|
|
|
|
|
|
|
|
|
|
def _parse_optional_env(var_name: str, default: str, caster):
    """Read an env var via get_or_create_env_var and cast a non-blank value.

    Returns None when the value is empty or whitespace-only, mirroring the
    previous per-variable if/else blocks this helper replaces.

    Args:
        var_name: Environment variable name.
        default: Default string value passed through to get_or_create_env_var.
        caster: Callable applied to a non-blank value (e.g. int, float).

    Returns:
        The cast value, or None when the variable is blank.
    """
    raw = get_or_create_env_var(var_name, default)
    if raw and raw.strip():
        return caster(raw)
    return None


# Optional VLM generation parameters: None means "use the library default".
VLM_SEED = _parse_optional_env("VLM_SEED", "", int)

VLM_DEFAULT_TEMPERATURE = _parse_optional_env("VLM_DEFAULT_TEMPERATURE", "", float)

VLM_DEFAULT_TOP_P = _parse_optional_env("VLM_DEFAULT_TOP_P", "", float)

VLM_DEFAULT_MIN_P = _parse_optional_env("VLM_DEFAULT_MIN_P", "", float)

VLM_DEFAULT_TOP_K = _parse_optional_env("VLM_DEFAULT_TOP_K", "", int)

VLM_DEFAULT_REPETITION_PENALTY = _parse_optional_env(
    "VLM_DEFAULT_REPETITION_PENALTY", "", float
)

VLM_DEFAULT_DO_SAMPLE = _parse_optional_env(
    "VLM_DEFAULT_DO_SAMPLE", "", convert_string_to_boolean
)

VLM_DEFAULT_PRESENCE_PENALTY = _parse_optional_env(
    "VLM_DEFAULT_PRESENCE_PENALTY", "", float
)
|
|
|
|
|
|
|
|
# Default local OCR engine.
CHOSEN_LOCAL_OCR_MODEL = get_or_create_env_var(
    "CHOSEN_LOCAL_OCR_MODEL", "tesseract"
)

# Toggles for exposing alternative OCR engines in the UI.
SHOW_LOCAL_OCR_MODEL_OPTIONS = convert_string_to_boolean(
    get_or_create_env_var("SHOW_LOCAL_OCR_MODEL_OPTIONS", "False")
)

SHOW_PADDLE_MODEL_OPTIONS = convert_string_to_boolean(
    get_or_create_env_var("SHOW_PADDLE_MODEL_OPTIONS", "False")
)

SHOW_INFERENCE_SERVER_OPTIONS = convert_string_to_boolean(
    get_or_create_env_var("SHOW_INFERENCE_SERVER_OPTIONS", "False")
)

SHOW_HYBRID_MODELS = convert_string_to_boolean(
    get_or_create_env_var("SHOW_HYBRID_MODELS", "False")
)

# Base list of OCR engines; extended below according to the toggles.
LOCAL_OCR_MODEL_OPTIONS = ["tesseract"]

# UI help text fragments; concatenated below as options are enabled.
CHOSEN_LOCAL_MODEL_INTRO_TEXT = get_or_create_env_var(
    "CHOSEN_LOCAL_MODEL_INTRO_TEXT",
    """Choose a local OCR model. "tesseract" is the default and will work for documents with clear typed text. """,
)

PADDLE_OCR_INTRO_TEXT = get_or_create_env_var(
    "PADDLE_OCR_INTRO_TEXT",
    """"paddle" is more accurate for text extraction where the text is not clear or well-formatted, but word-level extract is not natively supported, and so word bounding boxes will be inaccurate. """,
)

PADDLE_OCR_HYBRID_INTRO_TEXT = get_or_create_env_var(
    "PADDLE_OCR_HYBRID_INTRO_TEXT",
    """"hybrid-paddle" will do the first pass with Tesseract, and the second with PaddleOCR. """,
)

VLM_OCR_INTRO_TEXT = get_or_create_env_var(
    "VLM_OCR_INTRO_TEXT",
    """"vlm" will call the chosen vision model (VLM) to return a structured json output that is then parsed into word-level bounding boxes. """,
)

VLM_OCR_HYBRID_INTRO_TEXT = get_or_create_env_var(
    "VLM_OCR_HYBRID_INTRO_TEXT",
    """"hybrid-vlm" is a combination of Tesseract for OCR, and a second pass with the chosen vision model (VLM). """,
)

INFERENCE_SERVER_OCR_INTRO_TEXT = get_or_create_env_var(
    "INFERENCE_SERVER_OCR_INTRO_TEXT",
    """"inference-server" will call an external inference-server API to perform OCR using a vision model hosted remotely. """,
)

HYBRID_PADDLE_VLM_INTRO_TEXT = get_or_create_env_var(
    "HYBRID_PADDLE_VLM_INTRO_TEXT",
    """"hybrid-paddle-vlm" is a combination of PaddleOCR with the chosen VLM.""",
)

HYBRID_PADDLE_INFERENCE_SERVER_INTRO_TEXT = get_or_create_env_var(
    "HYBRID_PADDLE_INFERENCE_SERVER_INTRO_TEXT",
    """"hybrid-paddle-inference-server" is a combination of PaddleOCR with an external inference-server API.""",
)
|
|
|
|
|
paddle_options = ["paddle"]

# Extend the OCR engine list and its help text per the SHOW_* toggles above.
if SHOW_PADDLE_MODEL_OPTIONS:
    LOCAL_OCR_MODEL_OPTIONS.extend(paddle_options)
    CHOSEN_LOCAL_MODEL_INTRO_TEXT += PADDLE_OCR_INTRO_TEXT

vlm_options = ["vlm"]

if SHOW_VLM_MODEL_OPTIONS:
    LOCAL_OCR_MODEL_OPTIONS.extend(vlm_options)
    CHOSEN_LOCAL_MODEL_INTRO_TEXT += VLM_OCR_INTRO_TEXT

# Hybrid engines require both constituent engines plus the hybrid toggle.
if SHOW_PADDLE_MODEL_OPTIONS and SHOW_VLM_MODEL_OPTIONS and SHOW_HYBRID_MODELS:
    LOCAL_OCR_MODEL_OPTIONS.append("hybrid-paddle-vlm")
    CHOSEN_LOCAL_MODEL_INTRO_TEXT += HYBRID_PADDLE_VLM_INTRO_TEXT

if SHOW_PADDLE_MODEL_OPTIONS and SHOW_INFERENCE_SERVER_OPTIONS and SHOW_HYBRID_MODELS:
    LOCAL_OCR_MODEL_OPTIONS.append("hybrid-paddle-inference-server")
    CHOSEN_LOCAL_MODEL_INTRO_TEXT += HYBRID_PADDLE_INFERENCE_SERVER_INTRO_TEXT

inference_server_options = ["inference-server"]
if SHOW_INFERENCE_SERVER_OPTIONS:
    LOCAL_OCR_MODEL_OPTIONS.extend(inference_server_options)
    CHOSEN_LOCAL_MODEL_INTRO_TEXT += INFERENCE_SERVER_OCR_INTRO_TEXT

# Connection details for the external OCR inference server.
INFERENCE_SERVER_API_URL = get_or_create_env_var(
    "INFERENCE_SERVER_API_URL", "http://localhost:8080"
)

INFERENCE_SERVER_MODEL_NAME = get_or_create_env_var(
    "INFERENCE_SERVER_MODEL_NAME", ""
)

# Request timeout in seconds.
INFERENCE_SERVER_TIMEOUT = int(
    get_or_create_env_var("INFERENCE_SERVER_TIMEOUT", "300")
)

# Local cache folder for downloaded models, kept inside the app directory.
MODEL_CACHE_PATH = get_or_create_env_var("MODEL_CACHE_PATH", "./model_cache")
MODEL_CACHE_PATH = ensure_folder_within_app_directory(MODEL_CACHE_PATH)
|
|
|
|
|
|
|
# --- Local OCR tuning ---
# Threshold/padding consumed by the hybrid OCR pipeline (usage not visible
# in this module).
HYBRID_OCR_CONFIDENCE_THRESHOLD = int(get_or_create_env_var("HYBRID_OCR_CONFIDENCE_THRESHOLD", "95"))
HYBRID_OCR_PADDING = int(get_or_create_env_var("HYBRID_OCR_PADDING", "1"))

# Tesseract behaviour.
TESSERACT_WORD_LEVEL_OCR = convert_string_to_boolean(get_or_create_env_var("TESSERACT_WORD_LEVEL_OCR", "True"))
TESSERACT_SEGMENTATION_LEVEL = int(get_or_create_env_var("TESSERACT_SEGMENTATION_LEVEL", "11"))
CONVERT_LINE_TO_WORD_LEVEL = convert_string_to_boolean(get_or_create_env_var("CONVERT_LINE_TO_WORD_LEVEL", "False"))

# PaddleOCR behaviour.
LOAD_PADDLE_AT_STARTUP = convert_string_to_boolean(get_or_create_env_var("LOAD_PADDLE_AT_STARTUP", "False"))
PADDLE_USE_TEXTLINE_ORIENTATION = convert_string_to_boolean(get_or_create_env_var("PADDLE_USE_TEXTLINE_ORIENTATION", "False"))
PADDLE_DET_DB_UNCLIP_RATIO = float(get_or_create_env_var("PADDLE_DET_DB_UNCLIP_RATIO", "1.2"))

# Diagnostic image outputs (all off by default).
SAVE_EXAMPLE_HYBRID_IMAGES = convert_string_to_boolean(get_or_create_env_var("SAVE_EXAMPLE_HYBRID_IMAGES", "False"))
SAVE_PAGE_OCR_VISUALISATIONS = convert_string_to_boolean(get_or_create_env_var("SAVE_PAGE_OCR_VISUALISATIONS", "False"))
INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES = convert_string_to_boolean(get_or_create_env_var("INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES", "False"))
SAVE_WORD_SEGMENTER_OUTPUT_IMAGES = convert_string_to_boolean(get_or_create_env_var("SAVE_WORD_SEGMENTER_OUTPUT_IMAGES", "False"))
|
|
|
|
|
|
|
|
# Optional local model/font paths. An empty string means "use the default
# lookup"; non-empty values are clamped to the app directory.
PADDLE_MODEL_PATH = get_or_create_env_var("PADDLE_MODEL_PATH", "")
if PADDLE_MODEL_PATH:
    PADDLE_MODEL_PATH = ensure_folder_within_app_directory(PADDLE_MODEL_PATH)

PADDLE_FONT_PATH = get_or_create_env_var("PADDLE_FONT_PATH", "")
if PADDLE_FONT_PATH:
    PADDLE_FONT_PATH = ensure_folder_within_app_directory(PADDLE_FONT_PATH)

SPACY_MODEL_PATH = get_or_create_env_var("SPACY_MODEL_PATH", "")
if SPACY_MODEL_PATH:
    SPACY_MODEL_PATH = ensure_folder_within_app_directory(SPACY_MODEL_PATH)

# NOTE(review): left as the raw "True"/"False" string, unlike the sibling
# boolean flags — downstream code re-stringifies it via str() before
# converting, so confirm no caller compares against the string before
# changing this to a real bool.
PREPROCESS_LOCAL_OCR_IMAGES = get_or_create_env_var("PREPROCESS_LOCAL_OCR_IMAGES", "True")

SAVE_PREPROCESS_IMAGES = convert_string_to_boolean(get_or_create_env_var("SAVE_PREPROCESS_IMAGES", "False"))
SAVE_VLM_INPUT_IMAGES = convert_string_to_boolean(get_or_create_env_var("SAVE_VLM_INPUT_IMAGES", "False"))
|
|
|
|
|
|
|
|
# PII entity selections, stored as "['A','B',...]"-style strings and parsed
# into Python lists by _get_env_list further down this module.

# AWS Comprehend: default selection and the full menu of entity types.
CHOSEN_COMPREHEND_ENTITIES = get_or_create_env_var(
    "CHOSEN_COMPREHEND_ENTITIES",
    "['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']",
)
FULL_COMPREHEND_ENTITY_LIST = get_or_create_env_var(
    "FULL_COMPREHEND_ENTITY_LIST",
    "['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER', 'CUSTOM_FUZZY']",
)

# Local redaction: default selection and the full menu of entity types.
CHOSEN_REDACT_ENTITIES = get_or_create_env_var(
    "CHOSEN_REDACT_ENTITIES",
    "['TITLES', 'PERSON', 'PHONE_NUMBER', 'EMAIL_ADDRESS', 'STREETNAME', 'UKPOSTCODE', 'CUSTOM']",
)
FULL_ENTITY_LIST = get_or_create_env_var(
    "FULL_ENTITY_LIST",
    "['TITLES', 'PERSON', 'PHONE_NUMBER', 'EMAIL_ADDRESS', 'STREETNAME', 'UKPOSTCODE', 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM', 'CUSTOM_FUZZY']",
)

# Entity types implemented by this app rather than the underlying engines.
CUSTOM_ENTITIES = get_or_create_env_var(
    "CUSTOM_ENTITIES",
    "['TITLES', 'UKPOSTCODE', 'STREETNAME', 'CUSTOM']",
)
|
|
|
|
|
|
|
|
# Textract handwriting/signature extraction options: the pre-ticked default
# (parsed later with the other list settings) and the full option list.
DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX = get_or_create_env_var(
    "DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX", "['Extract handwriting']"
)

HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS = get_or_create_env_var(
    "HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS",
    "['Extract handwriting', 'Extract signatures']",
)

# Parse unconditionally: _get_env_list("") returns [], so an empty env var
# yields an empty list. The previous `if` guard left a bare "" string in
# place, which made the .append() calls below raise AttributeError.
HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS = _get_env_list(
    HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS
)

# These three flags are compared as strings ("True"), matching how they are
# set throughout this module.
INCLUDE_FORM_EXTRACTION_TEXTRACT_OPTION = get_or_create_env_var(
    "INCLUDE_FORM_EXTRACTION_TEXTRACT_OPTION", "False"
)
INCLUDE_LAYOUT_EXTRACTION_TEXTRACT_OPTION = get_or_create_env_var(
    "INCLUDE_LAYOUT_EXTRACTION_TEXTRACT_OPTION", "False"
)
INCLUDE_TABLE_EXTRACTION_TEXTRACT_OPTION = get_or_create_env_var(
    "INCLUDE_TABLE_EXTRACTION_TEXTRACT_OPTION", "False"
)

# Offer the extra Textract analyses only when explicitly enabled.
if INCLUDE_FORM_EXTRACTION_TEXTRACT_OPTION == "True":
    HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS.append("Extract forms")

if INCLUDE_LAYOUT_EXTRACTION_TEXTRACT_OPTION == "True":
    HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS.append("Extract layout")

if INCLUDE_TABLE_EXTRACTION_TEXTRACT_OPTION == "True":
    HANDWRITE_SIGNATURE_TEXTBOX_FULL_OPTIONS.append("Extract tables")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Word handling and default search settings.
SPLIT_PUNCTUATION_FROM_WORDS = convert_string_to_boolean(get_or_create_env_var("SPLIT_PUNCTUATION_FROM_WORDS", "False"))
DEFAULT_SEARCH_QUERY = get_or_create_env_var("DEFAULT_SEARCH_QUERY", "")
DEFAULT_FUZZY_SPELLING_MISTAKES_NUM = int(get_or_create_env_var("DEFAULT_FUZZY_SPELLING_MISTAKES_NUM", "1"))

# Default page bounds (0 by default — presumably "no explicit bound";
# confirm against the redaction UI).
DEFAULT_PAGE_MIN = int(get_or_create_env_var("DEFAULT_PAGE_MIN", "0"))
DEFAULT_PAGE_MAX = int(get_or_create_env_var("DEFAULT_PAGE_MAX", "0"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Language-selection UI toggle and the default language (full name + ISO code).
SHOW_LANGUAGE_SELECTION = convert_string_to_boolean(get_or_create_env_var("SHOW_LANGUAGE_SELECTION", "False"))
DEFAULT_LANGUAGE_FULL_NAME = get_or_create_env_var("DEFAULT_LANGUAGE_FULL_NAME", "english")
DEFAULT_LANGUAGE = get_or_create_env_var("DEFAULT_LANGUAGE", "en")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Language codes accepted by each AWS service. Lowercase names kept for
# backwards compatibility — existing code references these exact identifiers.
textract_language_choices = get_or_create_env_var("textract_language_choices", "['en', 'es', 'fr', 'de', 'it', 'pt']")
aws_comprehend_language_choices = get_or_create_env_var("aws_comprehend_language_choices", "['en', 'es']")

# Human-readable language names and their ISO codes; the two lists are
# zipped into LANGUAGE_MAP near the end of this module, so keep them aligned.
MAPPED_LANGUAGE_CHOICES = get_or_create_env_var(
    "MAPPED_LANGUAGE_CHOICES",
    "['english', 'french', 'german', 'spanish', 'italian', 'dutch', 'portuguese', 'chinese', 'japanese', 'korean', 'lithuanian', 'macedonian', 'norwegian_bokmaal', 'polish', 'romanian', 'russian', 'slovenian', 'swedish', 'catalan', 'ukrainian']",
)
LANGUAGE_CHOICES = get_or_create_env_var(
    "LANGUAGE_CHOICES",
    "['en', 'fr', 'de', 'es', 'it', 'nl', 'pt', 'zh', 'ja', 'ko', 'lt', 'mk', 'nb', 'pl', 'ro', 'ru', 'sl', 'sv', 'ca', 'uk']",
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Duplicate-page/row detection defaults.
DEFAULT_DUPLICATE_DETECTION_THRESHOLD = float(get_or_create_env_var("DEFAULT_DUPLICATE_DETECTION_THRESHOLD", "0.95"))
DEFAULT_MIN_CONSECUTIVE_PAGES = int(get_or_create_env_var("DEFAULT_MIN_CONSECUTIVE_PAGES", "1"))
USE_GREEDY_DUPLICATE_DETECTION = convert_string_to_boolean(get_or_create_env_var("USE_GREEDY_DUPLICATE_DETECTION", "True"))
DEFAULT_COMBINE_PAGES = convert_string_to_boolean(get_or_create_env_var("DEFAULT_COMBINE_PAGES", "True"))
DEFAULT_MIN_WORD_COUNT = int(get_or_create_env_var("DEFAULT_MIN_WORD_COUNT", "10"))
REMOVE_DUPLICATE_ROWS = convert_string_to_boolean(get_or_create_env_var("REMOVE_DUPLICATE_ROWS", "False"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Whether output files use the box colours assigned in the GUI.
USE_GUI_BOX_COLOURS_FOR_OUTPUTS = convert_string_to_boolean(
    get_or_create_env_var("USE_GUI_BOX_COLOURS_FOR_OUTPUTS", "False")
)

# Redaction box colour: either the word "grey" or an "(R, G, B)" string.
CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "(0, 0, 0)")

if CUSTOM_BOX_COLOUR == "grey":
    CUSTOM_BOX_COLOUR = (128, 128, 128)
else:
    try:
        components = tuple(
            int(c.strip()) for c in CUSTOM_BOX_COLOUR.strip("()").split(",")
        )
        # Require a well-formed RGB triple. The previous parser accepted any
        # component count and any value, which produced malformed colour
        # tuples downstream; reject those here and fall back to black.
        if len(components) != 3 or not all(0 <= c <= 255 for c in components):
            raise ValueError(
                f"expected three components in range 0-255, got {components}"
            )
        CUSTOM_BOX_COLOUR = components
    except Exception as e:
        print(f"Error initialising CUSTOM_BOX_COLOUR: {e}, returning default black")
        CUSTOM_BOX_COLOUR = (
            0,
            0,
            0,
        )
|
|
|
|
|
|
|
|
|
|
|
# Integer flags controlling how redactions are applied to images, vector
# graphics, and text (semantics defined by the consumer of these values).
APPLY_REDACTIONS_IMAGES = int(get_or_create_env_var("APPLY_REDACTIONS_IMAGES", "0"))
APPLY_REDACTIONS_GRAPHICS = int(get_or_create_env_var("APPLY_REDACTIONS_GRAPHICS", "0"))
APPLY_REDACTIONS_TEXT = int(get_or_create_env_var("APPLY_REDACTIONS_TEXT", "0"))

# Which PDFs are produced at the end of a redaction run.
RETURN_PDF_FOR_REVIEW = convert_string_to_boolean(get_or_create_env_var("RETURN_PDF_FOR_REVIEW", "True"))
RETURN_REDACTED_PDF = convert_string_to_boolean(get_or_create_env_var("RETURN_REDACTED_PDF", "True"))
COMPRESS_REDACTED_PDF = convert_string_to_boolean(get_or_create_env_var("COMPRESS_REDACTED_PDF", "False"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Link to the published user guide. Passed through validate_safe_url so an
# unsafe value injected via the environment cannot reach the UI.
USER_GUIDE_URL = validate_safe_url(
    get_or_create_env_var("USER_GUIDE_URL", "https://seanpedrick-case.github.io/doc_redaction")
)
|
|
|
|
|
# Default landing-page markdown. This is an f-string, so {USER_GUIDE_URL} is
# interpolated immediately with the validated guide URL defined above.
DEFAULT_INTRO_TEXT = f"""# Document redaction





Redact personally identifiable information (PII) from documents (pdf, png, jpg), Word files (docx), or tabular data (xlsx/csv/parquet). Please see the [User Guide]({USER_GUIDE_URL}) for a full walkthrough of all the features in the app.





To extract text from documents, the 'Local' options are PikePDF for PDFs with selectable text, and OCR with Tesseract. Use AWS Textract to extract more complex elements e.g. handwriting, signatures, or unclear text. For PII identification, 'Local' (based on spaCy) gives good results if you are looking for common names or terms, or a custom list of terms to redact (see Redaction settings). AWS Comprehend gives better results at a small cost.





Additional options on the 'Redaction settings' include, the type of information to redact (e.g. people, places), custom terms to include/ exclude from redaction, fuzzy matching, language settings, and whole page redaction. After redaction is complete, you can view and modify suggested redactions on the 'Review redactions' tab to quickly create a final redacted document.





NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs."""





# Environment override: either literal markdown or (handled below) a path to
# a .txt file containing the intro text.
INTRO_TEXT = get_or_create_env_var("INTRO_TEXT", DEFAULT_INTRO_TEXT)
|
|
|
|
|
|
|
|
# If INTRO_TEXT looks like a text-file path, load the intro from that file.
# Note the variable is reused: on entry it holds the path; after a successful
# secure_file_read it holds the file contents.
if INTRO_TEXT.endswith(".txt"):





    # Reject path-traversal attempts before touching the filesystem.
    if validate_path_safety(INTRO_TEXT, base_path="."):


        try:





            INTRO_TEXT = secure_file_read(".", INTRO_TEXT, encoding="utf-8")





            # Substitute the {USER_GUIDE_URL} placeholder in the loaded text.
            INTRO_TEXT = INTRO_TEXT.format(USER_GUIDE_URL=USER_GUIDE_URL)


        except FileNotFoundError:


            # secure_file_read raised before reassignment, so INTRO_TEXT
            # still holds the offending path here.
            print(f"Warning: Intro text file not found: {INTRO_TEXT}")


            INTRO_TEXT = DEFAULT_INTRO_TEXT


        except Exception as e:


            # Also catches .format() failures (e.g. stray braces in the file).
            print(f"Error reading intro text file: {e}")





            INTRO_TEXT = DEFAULT_INTRO_TEXT


    else:


        print(f"Warning: Unsafe file path detected for INTRO_TEXT: {INTRO_TEXT}")


        INTRO_TEXT = DEFAULT_INTRO_TEXT








# Strip wrapping quotes and sanitize the markdown before it reaches the UI.
INTRO_TEXT = sanitize_markdown_text(INTRO_TEXT.strip('"').strip("'"))








# Sanitization may strip everything; fall back to the (sanitized) default.
if not INTRO_TEXT or not INTRO_TEXT.strip():


    print("Warning: Intro text is empty after sanitization, using default intro text")


    INTRO_TEXT = sanitize_markdown_text(DEFAULT_INTRO_TEXT)
|
|
|
|
|
# On-disk cache for tldextract's TLD list, clamped to the app directory.
TLDEXTRACT_CACHE = ensure_folder_within_app_directory(
    get_or_create_env_var("TLDEXTRACT_CACHE", "tmp/tld/")
)

try:
    extract = TLDExtract(cache_dir=TLDEXTRACT_CACHE)
except Exception as e:
    print(f"Error initialising TLDExtract: {e}")
    # Fall back to a cache-less extractor rather than failing module import.
    extract = TLDExtract(cache_dir=None)
|
|
|
|
|
|
|
|
# Authentication and general UI toggles.
COGNITO_AUTH = convert_string_to_boolean(get_or_create_env_var("COGNITO_AUTH", "False"))
SHOW_FEEDBACK_BUTTONS = convert_string_to_boolean(get_or_create_env_var("SHOW_FEEDBACK_BUTTONS", "False"))
SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER = convert_string_to_boolean(get_or_create_env_var("SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER", "False"))

# Example-document panels.
SHOW_EXAMPLES = convert_string_to_boolean(get_or_create_env_var("SHOW_EXAMPLES", "True"))
SHOW_AWS_EXAMPLES = convert_string_to_boolean(get_or_create_env_var("SHOW_AWS_EXAMPLES", "False"))
SHOW_DIFFICULT_OCR_EXAMPLES = convert_string_to_boolean(get_or_create_env_var("SHOW_DIFFICULT_OCR_EXAMPLES", "False"))

FILE_INPUT_HEIGHT = int(get_or_create_env_var("FILE_INPUT_HEIGHT", "200"))

# Direct mode: run one headless task from the environment instead of the GUI.
RUN_DIRECT_MODE = convert_string_to_boolean(get_or_create_env_var("RUN_DIRECT_MODE", "False"))
|
|
|
|
|
|
|
|
# --- Direct (headless) mode: task selection and I/O ---
DIRECT_MODE_DEFAULT_USER = get_or_create_env_var("DIRECT_MODE_DEFAULT_USER", "")
DIRECT_MODE_TASK = get_or_create_env_var("DIRECT_MODE_TASK", "redact")
DIRECT_MODE_INPUT_FILE = get_or_create_env_var("DIRECT_MODE_INPUT_FILE", "")

# Output directory defaults to the app's OUTPUT_FOLDER and is clamped to the
# app directory.
DIRECT_MODE_OUTPUT_DIR = ensure_folder_within_app_directory(
    get_or_create_env_var("DIRECT_MODE_OUTPUT_DIR", OUTPUT_FOLDER)
)
DIRECT_MODE_DUPLICATE_TYPE = get_or_create_env_var("DIRECT_MODE_DUPLICATE_TYPE", "pages")

# Direct-mode overrides, seeded from the interactive defaults defined above.
DIRECT_MODE_LANGUAGE = get_or_create_env_var("DIRECT_MODE_LANGUAGE", DEFAULT_LANGUAGE)
DIRECT_MODE_PII_DETECTOR = get_or_create_env_var("DIRECT_MODE_PII_DETECTOR", LOCAL_PII_OPTION)
DIRECT_MODE_OCR_METHOD = get_or_create_env_var("DIRECT_MODE_OCR_METHOD", "Local OCR")
DIRECT_MODE_PAGE_MIN = int(get_or_create_env_var("DIRECT_MODE_PAGE_MIN", str(DEFAULT_PAGE_MIN)))
DIRECT_MODE_PAGE_MAX = int(get_or_create_env_var("DIRECT_MODE_PAGE_MAX", str(DEFAULT_PAGE_MAX)))
DIRECT_MODE_IMAGES_DPI = float(get_or_create_env_var("DIRECT_MODE_IMAGES_DPI", str(IMAGES_DPI)))
DIRECT_MODE_CHOSEN_LOCAL_OCR_MODEL = get_or_create_env_var("DIRECT_MODE_CHOSEN_LOCAL_OCR_MODEL", CHOSEN_LOCAL_OCR_MODEL)
|
|
# Direct-mode boolean/numeric overrides; each defaults to the corresponding
# interactive setting (stringified so get_or_create_env_var sees a string).
DIRECT_MODE_PREPROCESS_LOCAL_OCR_IMAGES = convert_string_to_boolean(
    get_or_create_env_var("DIRECT_MODE_PREPROCESS_LOCAL_OCR_IMAGES", str(PREPROCESS_LOCAL_OCR_IMAGES))
)
DIRECT_MODE_COMPRESS_REDACTED_PDF = convert_string_to_boolean(
    get_or_create_env_var("DIRECT_MODE_COMPRESS_REDACTED_PDF", str(COMPRESS_REDACTED_PDF))
)
DIRECT_MODE_RETURN_PDF_END_OF_REDACTION = convert_string_to_boolean(
    get_or_create_env_var("DIRECT_MODE_RETURN_PDF_END_OF_REDACTION", str(RETURN_REDACTED_PDF))
)

# Textract analysis toggles for direct mode (all off by default).
DIRECT_MODE_EXTRACT_FORMS = convert_string_to_boolean(get_or_create_env_var("DIRECT_MODE_EXTRACT_FORMS", "False"))
DIRECT_MODE_EXTRACT_TABLES = convert_string_to_boolean(get_or_create_env_var("DIRECT_MODE_EXTRACT_TABLES", "False"))
DIRECT_MODE_EXTRACT_LAYOUT = convert_string_to_boolean(get_or_create_env_var("DIRECT_MODE_EXTRACT_LAYOUT", "False"))
DIRECT_MODE_EXTRACT_SIGNATURES = convert_string_to_boolean(get_or_create_env_var("DIRECT_MODE_EXTRACT_SIGNATURES", "False"))

# PII matching and anonymisation settings.
DIRECT_MODE_MATCH_FUZZY_WHOLE_PHRASE_BOOL = convert_string_to_boolean(
    get_or_create_env_var("DIRECT_MODE_MATCH_FUZZY_WHOLE_PHRASE_BOOL", "True")
)
DIRECT_MODE_ANON_STRATEGY = get_or_create_env_var("DIRECT_MODE_ANON_STRATEGY", DEFAULT_TABULAR_ANONYMISATION_STRATEGY)
DIRECT_MODE_FUZZY_MISTAKES = int(
    get_or_create_env_var("DIRECT_MODE_FUZZY_MISTAKES", str(DEFAULT_FUZZY_SPELLING_MISTAKES_NUM))
)

# Duplicate-detection settings for direct mode.
DIRECT_MODE_SIMILARITY_THRESHOLD = float(
    get_or_create_env_var("DIRECT_MODE_SIMILARITY_THRESHOLD", str(DEFAULT_DUPLICATE_DETECTION_THRESHOLD))
)
DIRECT_MODE_MIN_WORD_COUNT = int(
    get_or_create_env_var("DIRECT_MODE_MIN_WORD_COUNT", str(DEFAULT_MIN_WORD_COUNT))
)
DIRECT_MODE_MIN_CONSECUTIVE_PAGES = int(
    get_or_create_env_var("DIRECT_MODE_MIN_CONSECUTIVE_PAGES", str(DEFAULT_MIN_CONSECUTIVE_PAGES))
)
DIRECT_MODE_GREEDY_MATCH = convert_string_to_boolean(
    get_or_create_env_var("DIRECT_MODE_GREEDY_MATCH", str(USE_GREEDY_DUPLICATE_DETECTION))
)
DIRECT_MODE_COMBINE_PAGES = convert_string_to_boolean(
    get_or_create_env_var("DIRECT_MODE_COMBINE_PAGES", str(DEFAULT_COMBINE_PAGES))
)
DIRECT_MODE_REMOVE_DUPLICATE_ROWS = convert_string_to_boolean(
    get_or_create_env_var("DIRECT_MODE_REMOVE_DUPLICATE_ROWS", str(REMOVE_DUPLICATE_ROWS))
)

# Resuming/inspecting an existing Textract job in direct mode.
DIRECT_MODE_TEXTRACT_ACTION = get_or_create_env_var("DIRECT_MODE_TEXTRACT_ACTION", "")
DIRECT_MODE_JOB_ID = get_or_create_env_var("DIRECT_MODE_JOB_ID", "")
|
|
|
|
|
|
|
|
# Settings used when the app runs inside AWS Lambda.
LAMBDA_POLL_INTERVAL = int(get_or_create_env_var("LAMBDA_POLL_INTERVAL", "30"))
LAMBDA_MAX_POLL_ATTEMPTS = int(get_or_create_env_var("LAMBDA_MAX_POLL_ATTEMPTS", "120"))
LAMBDA_PREPARE_IMAGES = convert_string_to_boolean(get_or_create_env_var("LAMBDA_PREPARE_IMAGES", "True"))
LAMBDA_EXTRACT_SIGNATURES = convert_string_to_boolean(get_or_create_env_var("LAMBDA_EXTRACT_SIGNATURES", "False"))
LAMBDA_DEFAULT_USERNAME = get_or_create_env_var("LAMBDA_DEFAULT_USERNAME", "lambda_user")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Allow-list configuration (terms exempt from redaction).
GET_DEFAULT_ALLOW_LIST = convert_string_to_boolean(get_or_create_env_var("GET_DEFAULT_ALLOW_LIST", "False"))
ALLOW_LIST_PATH = get_or_create_env_var("ALLOW_LIST_PATH", "")
S3_ALLOW_LIST_PATH = get_or_create_env_var("S3_ALLOW_LIST_PATH", "")

# Use the configured path when provided, otherwise the bundled default.
OUTPUT_ALLOW_LIST_PATH = ALLOW_LIST_PATH or "config/default_allow_list.csv"
|
|
|
|
|
|
|
|
|
|
|
# Deny-list configuration (terms always redacted).
GET_DEFAULT_DENY_LIST = convert_string_to_boolean(get_or_create_env_var("GET_DEFAULT_DENY_LIST", "False"))
S3_DENY_LIST_PATH = get_or_create_env_var("S3_DENY_LIST_PATH", "")
DENY_LIST_PATH = get_or_create_env_var("DENY_LIST_PATH", "")

# Use the configured path when provided, otherwise the bundled default.
OUTPUT_DENY_LIST_PATH = DENY_LIST_PATH or "config/default_deny_list.csv"
|
|
|
|
|
|
|
|
|
|
|
# Whole-page redaction list configuration. The flag is wrapped in
# convert_string_to_boolean for consistency with GET_DEFAULT_ALLOW_LIST and
# GET_DEFAULT_DENY_LIST above; previously it was left as the raw string,
# whose default "False" is truthy.
GET_DEFAULT_WHOLE_PAGE_REDACTION_LIST = convert_string_to_boolean(
    get_or_create_env_var("GET_DEFAULT_WHOLE_PAGE_REDACTION_LIST", "False")
)

S3_WHOLE_PAGE_REDACTION_LIST_PATH = get_or_create_env_var(
    "S3_WHOLE_PAGE_REDACTION_LIST_PATH", ""
)

WHOLE_PAGE_REDACTION_LIST_PATH = get_or_create_env_var(
    "WHOLE_PAGE_REDACTION_LIST_PATH", ""
)

# Use the configured path when provided, otherwise the bundled default.
if WHOLE_PAGE_REDACTION_LIST_PATH:
    OUTPUT_WHOLE_PAGE_REDACTION_LIST_PATH = WHOLE_PAGE_REDACTION_LIST_PATH
else:
    OUTPUT_WHOLE_PAGE_REDACTION_LIST_PATH = (
        "config/default_whole_page_redaction_list.csv"
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- Cost-code tracking ---
SHOW_COSTS = convert_string_to_boolean(get_or_create_env_var("SHOW_COSTS", "False"))
GET_COST_CODES = convert_string_to_boolean(get_or_create_env_var("GET_COST_CODES", "False"))
DEFAULT_COST_CODE = get_or_create_env_var("DEFAULT_COST_CODE", "")
COST_CODES_PATH = get_or_create_env_var("COST_CODES_PATH", "")
S3_COST_CODES_PATH = get_or_create_env_var("S3_COST_CODES_PATH", "")

# Use the configured path when provided, otherwise the bundled default.
OUTPUT_COST_CODES_PATH = COST_CODES_PATH or "config/cost_codes.csv"

ENFORCE_COST_CODES = convert_string_to_boolean(get_or_create_env_var("ENFORCE_COST_CODES", "False"))

# Enforcing cost codes implies the cost-code selector must be available.
if ENFORCE_COST_CODES:
    GET_COST_CODES = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- Whole-document Textract analysis ---
SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS = convert_string_to_boolean(
    get_or_create_env_var("SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS", "False")
)

# S3 bucket and subfolders used for asynchronous whole-document jobs.
TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET = get_or_create_env_var("TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET", "")
TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER = get_or_create_env_var("TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER", "input")
TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER = get_or_create_env_var("TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER", "output")

# Where previously-run Textract jobs are looked up.
LOAD_PREVIOUS_TEXTRACT_JOBS_S3 = convert_string_to_boolean(get_or_create_env_var("LOAD_PREVIOUS_TEXTRACT_JOBS_S3", "False"))
TEXTRACT_JOBS_S3_LOC = get_or_create_env_var("TEXTRACT_JOBS_S3_LOC", "output")
TEXTRACT_JOBS_S3_INPUT_LOC = get_or_create_env_var("TEXTRACT_JOBS_S3_INPUT_LOC", "input")
TEXTRACT_JOBS_LOCAL_LOC = get_or_create_env_var("TEXTRACT_JOBS_LOCAL_LOC", "output")

DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = int(get_or_create_env_var("DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS", "7"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Parse the comma-separated log-header strings (set earlier in this module)
# into lists for the CSV and DynamoDB log writers.
CSV_ACCESS_LOG_HEADERS = _get_env_list(CSV_ACCESS_LOG_HEADERS)


CSV_FEEDBACK_LOG_HEADERS = _get_env_list(CSV_FEEDBACK_LOG_HEADERS)


CSV_USAGE_LOG_HEADERS = _get_env_list(CSV_USAGE_LOG_HEADERS)





DYNAMODB_ACCESS_LOG_HEADERS = _get_env_list(DYNAMODB_ACCESS_LOG_HEADERS)


DYNAMODB_FEEDBACK_LOG_HEADERS = _get_env_list(DYNAMODB_FEEDBACK_LOG_HEADERS)


DYNAMODB_USAGE_LOG_HEADERS = _get_env_list(DYNAMODB_USAGE_LOG_HEADERS)
|
|
# Parse the entity-list env strings (set earlier in this module) into Python
# lists. Parsing is unconditional: _get_env_list("") returns [], so an empty
# env var becomes an empty list. The previous `if` guards left a bare ""
# string in place, which made the .extend() calls below raise AttributeError.
CHOSEN_COMPREHEND_ENTITIES = _get_env_list(CHOSEN_COMPREHEND_ENTITIES)
FULL_COMPREHEND_ENTITY_LIST = _get_env_list(FULL_COMPREHEND_ENTITY_LIST)
CHOSEN_REDACT_ENTITIES = _get_env_list(CHOSEN_REDACT_ENTITIES)
FULL_ENTITY_LIST = _get_env_list(FULL_ENTITY_LIST)

# VLM-specific entity types are only offered when a VLM or inference-server
# backend can be selected.
if SHOW_VLM_MODEL_OPTIONS or SHOW_INFERENCE_SERVER_OPTIONS:
    FULL_ENTITY_LIST.extend(["CUSTOM_VLM_PERSON", "CUSTOM_VLM_SIGNATURE"])
    FULL_COMPREHEND_ENTITY_LIST.extend(["CUSTOM_VLM_PERSON", "CUSTOM_VLM_SIGNATURE"])
|
|
|
|
|
# Parse the remaining comma-separated settings into lists; each value is set
# earlier in this module and left untouched when empty.
if DEFAULT_TEXT_COLUMNS:
    DEFAULT_TEXT_COLUMNS = _get_env_list(DEFAULT_TEXT_COLUMNS)

if DEFAULT_EXCEL_SHEETS:
    DEFAULT_EXCEL_SHEETS = _get_env_list(DEFAULT_EXCEL_SHEETS)

if CUSTOM_ENTITIES:
    CUSTOM_ENTITIES = _get_env_list(CUSTOM_ENTITIES)

if DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX:
    DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX = _get_env_list(DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX)
|
|
|
|
|
# Parse allowed CORS origins/hosts and language-choice settings into lists;
# empty values are left untouched.
if ALLOWED_ORIGINS:
    ALLOWED_ORIGINS = _get_env_list(ALLOWED_ORIGINS)

if ALLOWED_HOSTS:
    ALLOWED_HOSTS = _get_env_list(ALLOWED_HOSTS)

if textract_language_choices:
    textract_language_choices = _get_env_list(textract_language_choices)

if aws_comprehend_language_choices:
    aws_comprehend_language_choices = _get_env_list(aws_comprehend_language_choices)

if MAPPED_LANGUAGE_CHOICES:
    MAPPED_LANGUAGE_CHOICES = _get_env_list(MAPPED_LANGUAGE_CHOICES)

if LANGUAGE_CHOICES:
    LANGUAGE_CHOICES = _get_env_list(LANGUAGE_CHOICES)
|
|
|
|
|
LANGUAGE_MAP = dict(zip(MAPPED_LANGUAGE_CHOICES, LANGUAGE_CHOICES)) |
|
|
|