# Source: seanpedrick-case/dev branch (merged via PR #108, commit 546be9b).
import os
import shutil
import subprocess
import sys
import tempfile
import threading
import unittest
from typing import List, Optional
def run_cli_redact(
    script_path: str,
    input_file: str,
    output_dir: str,
    task: str = "redact",
    timeout: int = 600,  # 10-minute timeout
    # --- General Arguments ---
    input_dir: Optional[str] = None,
    language: Optional[str] = None,
    allow_list: Optional[str] = None,
    pii_detector: Optional[str] = None,
    username: Optional[str] = None,
    save_to_user_folders: Optional[bool] = None,
    local_redact_entities: Optional[List[str]] = None,
    aws_redact_entities: Optional[List[str]] = None,
    aws_access_key: Optional[str] = None,
    aws_secret_key: Optional[str] = None,
    cost_code: Optional[str] = None,
    aws_region: Optional[str] = None,
    s3_bucket: Optional[str] = None,
    do_initial_clean: Optional[bool] = None,
    save_logs_to_csv: Optional[bool] = None,
    save_logs_to_dynamodb: Optional[bool] = None,
    display_file_names_in_logs: Optional[bool] = None,
    upload_logs_to_s3: Optional[bool] = None,
    s3_logs_prefix: Optional[str] = None,
    # --- PDF/Image Redaction Arguments ---
    ocr_method: Optional[str] = None,
    page_min: Optional[int] = None,
    page_max: Optional[int] = None,
    images_dpi: Optional[float] = None,
    chosen_local_ocr_model: Optional[str] = None,
    preprocess_local_ocr_images: Optional[bool] = None,
    compress_redacted_pdf: Optional[bool] = None,
    return_pdf_end_of_redaction: Optional[bool] = None,
    deny_list_file: Optional[str] = None,
    allow_list_file: Optional[str] = None,
    redact_whole_page_file: Optional[str] = None,
    handwrite_signature_extraction: Optional[List[str]] = None,
    extract_forms: Optional[bool] = None,
    extract_tables: Optional[bool] = None,
    extract_layout: Optional[bool] = None,
    # --- Word/Tabular Anonymisation Arguments ---
    anon_strategy: Optional[str] = None,
    text_columns: Optional[List[str]] = None,
    excel_sheets: Optional[List[str]] = None,
    fuzzy_mistakes: Optional[int] = None,
    match_fuzzy_whole_phrase_bool: Optional[bool] = None,
    # --- Duplicate Detection Arguments ---
    duplicate_type: Optional[str] = None,
    similarity_threshold: Optional[float] = None,
    min_word_count: Optional[int] = None,
    min_consecutive_pages: Optional[int] = None,
    greedy_match: Optional[bool] = None,
    combine_pages: Optional[bool] = None,
    remove_duplicate_rows: Optional[bool] = None,
    # --- Textract Batch Operations Arguments ---
    textract_action: Optional[str] = None,
    job_id: Optional[str] = None,
    extract_signatures: Optional[bool] = None,
    textract_bucket: Optional[str] = None,
    textract_input_prefix: Optional[str] = None,
    textract_output_prefix: Optional[str] = None,
    s3_textract_document_logs_subfolder: Optional[str] = None,
    local_textract_document_logs_subfolder: Optional[str] = None,
    poll_interval: Optional[int] = None,
    max_poll_attempts: Optional[int] = None,
) -> bool:
    """
    Executes the cli_redact.py script with specified arguments using a subprocess.

    Args:
        script_path (str): The path to the cli_redact.py script.
        input_file (str): The path to the input file to process. May be None
            only for the Textract 'retrieve' and 'list' actions.
        output_dir (str): The path to the directory for output files (created
            if it does not exist).
        task (str): The main task to perform ('redact', 'deduplicate', or 'textract').
        timeout (int): Timeout in seconds for the subprocess.
        # General Arguments
        input_dir (str): Directory for all input files.
        language (str): Language of the document content.
        allow_list (str): Path to a CSV file with words to exclude from redaction.
        pii_detector (str): Core PII detection method (Local, AWS Comprehend, or None).
        username (str): Username for the session.
        save_to_user_folders (bool): Whether to save to user folders or not.
        local_redact_entities (List[str]): Local redaction entities to use.
        aws_redact_entities (List[str]): AWS redaction entities to use.
        aws_access_key (str): Your AWS Access Key ID.
        aws_secret_key (str): Your AWS Secret Access Key.
        cost_code (str): Cost code for tracking usage.
        aws_region (str): AWS region for cloud services.
        s3_bucket (str): S3 bucket name for cloud operations.
        do_initial_clean (bool): Perform initial text cleaning for tabular data.
        save_logs_to_csv (bool): Save processing logs to CSV files.
        save_logs_to_dynamodb (bool): Save processing logs to DynamoDB.
        display_file_names_in_logs (bool): Include file names in log outputs.
        upload_logs_to_s3 (bool): Upload log files to S3 after processing.
        s3_logs_prefix (str): S3 prefix for usage log files.
        # PDF/Image Redaction Arguments
        ocr_method (str): OCR method for text extraction from images.
        page_min (int): First page to redact.
        page_max (int): Last page to redact.
        images_dpi (float): DPI for image processing.
        chosen_local_ocr_model (str): Local OCR model to use.
        preprocess_local_ocr_images (bool): Preprocess images before OCR.
        compress_redacted_pdf (bool): Compress the final redacted PDF.
        return_pdf_end_of_redaction (bool): Return PDF at end of redaction process.
        deny_list_file (str): Custom words file to recognize for redaction.
        allow_list_file (str): Custom words file to exclude from redaction.
        redact_whole_page_file (str): File for pages to redact completely.
        handwrite_signature_extraction (List[str]): Handwriting and signature extraction options.
        extract_forms (bool): Extract forms during Textract analysis.
        extract_tables (bool): Extract tables during Textract analysis.
        extract_layout (bool): Extract layout during Textract analysis.
        # Word/Tabular Anonymisation Arguments
        anon_strategy (str): The anonymisation strategy to apply.
        text_columns (List[str]): A list of column names to anonymise or deduplicate.
        excel_sheets (List[str]): Specific Excel sheet names to process.
        fuzzy_mistakes (int): Number of allowed spelling mistakes for fuzzy matching.
        match_fuzzy_whole_phrase_bool (bool): Match fuzzy whole phrase boolean.
        # Duplicate Detection Arguments
        duplicate_type (str): Type of duplicate detection (pages or tabular).
        similarity_threshold (float): Similarity threshold (0-1) to consider content as duplicates.
        min_word_count (int): Minimum word count for text to be considered.
        min_consecutive_pages (int): Minimum number of consecutive pages to consider as a match.
        greedy_match (bool): Use greedy matching strategy for consecutive pages.
        combine_pages (bool): Combine text from the same page number within a file.
        remove_duplicate_rows (bool): Remove duplicate rows from the output.
        # Textract Batch Operations Arguments
        textract_action (str): Textract action to perform (submit, retrieve, or list).
        job_id (str): Textract job ID for retrieve action.
        extract_signatures (bool): Extract signatures during Textract analysis.
        textract_bucket (str): S3 bucket name for Textract operations.
        textract_input_prefix (str): S3 prefix for input files in Textract operations.
        textract_output_prefix (str): S3 prefix for output files in Textract operations.
        s3_textract_document_logs_subfolder (str): S3 prefix for logs in Textract operations.
        local_textract_document_logs_subfolder (str): Local prefix for logs in Textract operations.
        poll_interval (int): Polling interval in seconds for Textract job status.
        max_poll_attempts (int): Maximum number of polling attempts for Textract job completion.

    Raises:
        ValueError: If ``input_file`` is None for a task that requires one.
        FileNotFoundError: If the input file or the CLI script cannot be found.

    Returns:
        bool: True if the script executed successfully, False otherwise.
    """
    # 1. Get absolute paths and perform pre-checks.
    script_abs_path = os.path.abspath(script_path)
    output_abs_dir = os.path.abspath(output_dir)

    # The Textract 'retrieve' and 'list' actions operate on a job ID / job
    # history rather than a document, so no input file is required for them.
    if task == "textract" and textract_action in ["retrieve", "list"]:
        input_abs_path = None
    else:
        if input_file is None:
            raise ValueError("Input file is required for this task")
        input_abs_path = os.path.abspath(input_file)
        if not os.path.isfile(input_abs_path):
            raise FileNotFoundError(f"Input file not found: {input_abs_path}")

    if not os.path.isfile(script_abs_path):
        raise FileNotFoundError(f"Script not found: {script_abs_path}")
    if not os.path.isdir(output_abs_dir):
        # Create the output directory if it doesn't exist.
        print(f"Output directory not found. Creating: {output_abs_dir}")
        os.makedirs(output_abs_dir)

    script_folder = os.path.dirname(script_abs_path)

    # 2. Dynamically build the command list.
    # Use sys.executable rather than a bare "python" so the CLI script runs
    # under the exact interpreter (and virtualenv) executing these tests.
    command = [
        sys.executable,
        script_abs_path,
        "--output_dir",
        output_abs_dir,
        "--task",
        task,
    ]
    if input_abs_path is not None:
        command.extend(["--input_file", input_abs_path])

    # Small helpers so each CLI option is one declarative line instead of a
    # near-identical if-block. Each helper encodes one "was it provided?" rule.
    def _add_value(flag: str, value: Optional[str]) -> None:
        # String options: skipped when falsy (None or empty).
        if value:
            command.extend([flag, value])

    def _add_typed(flag: str, value) -> None:
        # Numeric/boolean options: only None means "not provided" (so that
        # explicit False / 0 values are still forwarded).
        if value is not None:
            command.extend([flag, str(value)])

    def _add_file(flag: str, path: Optional[str]) -> None:
        # File options: silently skipped when the path is missing, matching
        # the CLI's tolerance for optional auxiliary files.
        if path and os.path.isfile(path):
            command.extend([flag, os.path.abspath(path)])

    def _add_list(flag: str, values: Optional[List[str]]) -> None:
        # Multi-value options: flag followed by each value as its own token.
        if values:
            command.append(flag)
            command.extend(values)

    def _add_switch(flag: str, enabled: Optional[bool]) -> None:
        # Store-true flags: present only when explicitly enabled.
        if enabled:
            command.append(flag)

    # General arguments
    _add_value("--input_dir", input_dir)
    _add_value("--language", language)
    _add_file("--allow_list", allow_list)
    _add_value("--pii_detector", pii_detector)
    _add_value("--username", username)
    _add_typed("--save_to_user_folders", save_to_user_folders)
    _add_list("--local_redact_entities", local_redact_entities)
    _add_list("--aws_redact_entities", aws_redact_entities)
    _add_value("--aws_access_key", aws_access_key)
    _add_value("--aws_secret_key", aws_secret_key)
    _add_value("--cost_code", cost_code)
    _add_value("--aws_region", aws_region)
    _add_value("--s3_bucket", s3_bucket)
    _add_typed("--do_initial_clean", do_initial_clean)
    _add_typed("--save_logs_to_csv", save_logs_to_csv)
    _add_typed("--save_logs_to_dynamodb", save_logs_to_dynamodb)
    _add_typed("--display_file_names_in_logs", display_file_names_in_logs)
    _add_typed("--upload_logs_to_s3", upload_logs_to_s3)
    _add_value("--s3_logs_prefix", s3_logs_prefix)

    # PDF/Image redaction arguments
    _add_value("--ocr_method", ocr_method)
    _add_typed("--page_min", page_min)
    _add_typed("--page_max", page_max)
    _add_typed("--images_dpi", images_dpi)
    _add_value("--chosen_local_ocr_model", chosen_local_ocr_model)
    _add_typed("--preprocess_local_ocr_images", preprocess_local_ocr_images)
    _add_typed("--compress_redacted_pdf", compress_redacted_pdf)
    _add_typed("--return_pdf_end_of_redaction", return_pdf_end_of_redaction)
    _add_file("--deny_list_file", deny_list_file)
    _add_file("--allow_list_file", allow_list_file)
    _add_file("--redact_whole_page_file", redact_whole_page_file)
    _add_list("--handwrite_signature_extraction", handwrite_signature_extraction)
    _add_switch("--extract_forms", extract_forms)
    _add_switch("--extract_tables", extract_tables)
    _add_switch("--extract_layout", extract_layout)

    # Word/Tabular anonymisation arguments
    _add_value("--anon_strategy", anon_strategy)
    _add_list("--text_columns", text_columns)
    _add_list("--excel_sheets", excel_sheets)
    _add_typed("--fuzzy_mistakes", fuzzy_mistakes)
    _add_typed("--match_fuzzy_whole_phrase_bool", match_fuzzy_whole_phrase_bool)

    # Duplicate detection arguments
    _add_value("--duplicate_type", duplicate_type)
    _add_typed("--similarity_threshold", similarity_threshold)
    _add_typed("--min_word_count", min_word_count)
    _add_typed("--min_consecutive_pages", min_consecutive_pages)
    _add_typed("--greedy_match", greedy_match)
    _add_typed("--combine_pages", combine_pages)
    _add_typed("--remove_duplicate_rows", remove_duplicate_rows)

    # Textract batch operations arguments
    _add_value("--textract_action", textract_action)
    _add_value("--job_id", job_id)
    _add_switch("--extract_signatures", extract_signatures)
    _add_value("--textract_bucket", textract_bucket)
    _add_value("--textract_input_prefix", textract_input_prefix)
    _add_value("--textract_output_prefix", textract_output_prefix)
    _add_value(
        "--s3_textract_document_logs_subfolder", s3_textract_document_logs_subfolder
    )
    _add_value(
        "--local_textract_document_logs_subfolder",
        local_textract_document_logs_subfolder,
    )
    _add_typed("--poll_interval", poll_interval)
    _add_typed("--max_poll_attempts", max_poll_attempts)

    command_str = " ".join(str(arg) for arg in command if arg is not None)
    print(f"Executing command: {command_str}")

    # 3. Execute the command using subprocess.
    try:
        result = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            cwd=script_folder,  # Important for relative paths within the script
        )
        # Communicate with the process to get output and handle timeout.
        stdout, stderr = result.communicate(timeout=timeout)

        print("--- SCRIPT STDOUT ---")
        if stdout:
            print(stdout)
        print("--- SCRIPT STDERR ---")
        if stderr:
            print(stderr)
        print("---------------------")

        # Analyze the output for errors and success indicators.
        analysis = analyze_test_output(stdout, stderr)
        if analysis["has_errors"]:
            print("❌ Errors detected in output:")
            for i, error_type in enumerate(analysis["error_types"]):
                print(f"   {i+1}. {error_type}")
            if analysis["error_messages"]:
                print("   Error messages:")
                # Show first 3 error messages only, to keep the log readable.
                for msg in analysis["error_messages"][:3]:
                    print(f"     - {msg}")
            return False
        elif result.returncode == 0:
            success_msg = "βœ… Script executed successfully."
            if analysis["success_indicators"]:
                success_msg += f" (Success indicators: {', '.join(analysis['success_indicators'][:3])})"
            print(success_msg)
            return True
        else:
            print(f"❌ Command failed with return code {result.returncode}")
            return False
    except subprocess.TimeoutExpired:
        result.kill()
        # Reap the killed child so it doesn't linger as a zombie process.
        result.communicate()
        print(f"❌ Subprocess timed out after {timeout} seconds.")
        return False
    except Exception as e:
        print(f"❌ An unexpected error occurred: {e}")
        return False
def analyze_test_output(stdout: str, stderr: str) -> dict:
"""
Analyze test output to provide detailed error information.
Args:
stdout (str): Standard output from the test
stderr (str): Standard error from the test
Returns:
dict: Analysis results with error details
"""
combined_output = (stdout or "") + (stderr or "")
analysis = {
"has_errors": False,
"error_types": [],
"error_messages": [],
"success_indicators": [],
"warning_indicators": [],
}
# Error patterns
error_patterns = {
"An error occurred": "General error message",
"Error:": "Error prefix",
"Exception:": "Exception occurred",
"Traceback": "Python traceback",
"Failed to": "Operation failure",
"Cannot": "Operation not possible",
"Unable to": "Operation not possible",
"KeyError:": "Missing key/dictionary error",
"AttributeError:": "Missing attribute error",
"TypeError:": "Type mismatch error",
"ValueError:": "Invalid value error",
"FileNotFoundError:": "File not found",
"ImportError:": "Import failure",
"ModuleNotFoundError:": "Module not found",
}
# Success indicators
success_patterns = [
"Successfully",
"Completed",
"Finished",
"Processed",
"Redacted",
"Extracted",
]
# Warning indicators
warning_patterns = ["Warning:", "WARNING:", "Deprecated", "DeprecationWarning"]
# Check for errors
for pattern, description in error_patterns.items():
if pattern.lower() in combined_output.lower():
analysis["has_errors"] = True
analysis["error_types"].append(description)
# Extract the actual error message
lines = combined_output.split("\n")
for line in lines:
if pattern.lower() in line.lower():
analysis["error_messages"].append(line.strip())
# Check for success indicators
for pattern in success_patterns:
if pattern.lower() in combined_output.lower():
analysis["success_indicators"].append(pattern)
# Check for warnings
for pattern in warning_patterns:
if pattern.lower() in combined_output.lower():
analysis["warning_indicators"].append(pattern)
return analysis
class TestCLIRedactExamples(unittest.TestCase):
    """Test suite for CLI redaction examples from the epilog.

    Each test mirrors one documented usage example of cli_redact.py and runs
    it as a real subprocess via run_cli_redact. Tests skip themselves when
    the required example data file is absent; AWS-dependent tests deliberately
    ignore the result since credentials may not be configured.
    """

    @classmethod
    def setUpClass(cls):
        """Set up test environment before running tests."""
        # The CLI script and example data live one directory above this file.
        cls.script_path = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), "cli_redact.py"
        )
        cls.example_data_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), "example_data"
        )
        # All outputs go to a throwaway directory removed in tearDownClass.
        cls.temp_output_dir = tempfile.mkdtemp(prefix="test_output_")

        # Verify script exists
        if not os.path.isfile(cls.script_path):
            raise FileNotFoundError(f"CLI script not found: {cls.script_path}")

        print(f"Test setup complete. Script: {cls.script_path}")
        print(f"Example data directory: {cls.example_data_dir}")
        print(f"Temp output directory: {cls.temp_output_dir}")

        # Debug: Check if example data directory exists and list contents
        if os.path.exists(cls.example_data_dir):
            print("Example data directory exists. Contents:")
            for item in os.listdir(cls.example_data_dir):
                item_path = os.path.join(cls.example_data_dir, item)
                if os.path.isfile(item_path):
                    print(f"  File: {item} ({os.path.getsize(item_path)} bytes)")
                else:
                    print(f"  Directory: {item}")
        else:
            print(f"Example data directory does not exist: {cls.example_data_dir}")

    @classmethod
    def tearDownClass(cls):
        """Clean up test environment after running tests."""
        if os.path.exists(cls.temp_output_dir):
            shutil.rmtree(cls.temp_output_dir)
            print(f"Cleaned up temp directory: {cls.temp_output_dir}")

    def test_pdf_redaction_default_settings(self):
        """Test: Redact a PDF with default settings (local OCR)"""
        print("\n=== Testing PDF redaction with default settings ===")
        input_file = os.path.join(
            self.example_data_dir,
            "example_of_emails_sent_to_a_professor_before_applying.pdf",
        )
        if not os.path.isfile(input_file):
            self.skipTest(f"Example file not found: {input_file}")
        result = run_cli_redact(
            script_path=self.script_path,
            input_file=input_file,
            output_dir=self.temp_output_dir,
        )
        self.assertTrue(result, "PDF redaction with default settings should succeed")
        print("βœ… PDF redaction with default settings passed")

    def test_pdf_text_extraction_only(self):
        """Test: Extract text from a PDF only (i.e. no redaction), using local OCR"""
        print("\n=== Testing PDF text extraction only ===")
        input_file = os.path.join(
            self.example_data_dir, "Partnership-Agreement-Toolkit_0_0.pdf"
        )
        whole_page_file = os.path.join(
            self.example_data_dir, "partnership_toolkit_redact_some_pages.csv"
        )
        if not os.path.isfile(input_file):
            self.skipTest(f"Example file not found: {input_file}")
        if not os.path.isfile(whole_page_file):
            self.skipTest(f"Whole page file not found: {whole_page_file}")
        # pii_detector="None" disables PII detection so only extraction runs.
        result = run_cli_redact(
            script_path=self.script_path,
            input_file=input_file,
            output_dir=self.temp_output_dir,
            redact_whole_page_file=whole_page_file,
            pii_detector="None",
        )
        self.assertTrue(result, "PDF text extraction should succeed")
        print("βœ… PDF text extraction only passed")

    def test_pdf_text_extraction_with_whole_page_redaction(self):
        """Test: Extract text from a PDF only with a whole page redaction list"""
        print("\n=== Testing PDF text extraction with whole page redaction ===")
        input_file = os.path.join(
            self.example_data_dir, "Partnership-Agreement-Toolkit_0_0.pdf"
        )
        whole_page_file = os.path.join(
            self.example_data_dir, "partnership_toolkit_redact_some_pages.csv"
        )
        if not os.path.isfile(input_file):
            self.skipTest(f"Example file not found: {input_file}")
        if not os.path.isfile(whole_page_file):
            self.skipTest(f"Whole page file not found: {whole_page_file}")
        result = run_cli_redact(
            script_path=self.script_path,
            input_file=input_file,
            output_dir=self.temp_output_dir,
            redact_whole_page_file=whole_page_file,
            pii_detector="Local",
            local_redact_entities=["CUSTOM"],
        )
        self.assertTrue(
            result, "PDF text extraction with whole page redaction should succeed"
        )
        print("βœ… PDF text extraction with whole page redaction passed")

    def test_pdf_redaction_with_allow_list(self):
        """Test: Redact a PDF with allow list (local OCR) and custom list of redaction entities"""
        print("\n=== Testing PDF redaction with allow list ===")
        input_file = os.path.join(
            self.example_data_dir, "graduate-job-example-cover-letter.pdf"
        )
        allow_list_file = os.path.join(
            self.example_data_dir, "test_allow_list_graduate.csv"
        )
        if not os.path.isfile(input_file):
            self.skipTest(f"Example file not found: {input_file}")
        if not os.path.isfile(allow_list_file):
            self.skipTest(f"Allow list file not found: {allow_list_file}")
        result = run_cli_redact(
            script_path=self.script_path,
            input_file=input_file,
            output_dir=self.temp_output_dir,
            allow_list_file=allow_list_file,
            local_redact_entities=["TITLES", "PERSON", "DATE_TIME"],
        )
        self.assertTrue(result, "PDF redaction with allow list should succeed")
        print("βœ… PDF redaction with allow list passed")

    def test_pdf_redaction_limited_pages_with_custom_fuzzy(self):
        """Test: Redact a PDF with limited pages and text extraction method with custom fuzzy matching"""
        print("\n=== Testing PDF redaction with limited pages and fuzzy matching ===")
        input_file = os.path.join(
            self.example_data_dir, "Partnership-Agreement-Toolkit_0_0.pdf"
        )
        deny_list_file = os.path.join(
            self.example_data_dir,
            "Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv",
        )
        if not os.path.isfile(input_file):
            self.skipTest(f"Example file not found: {input_file}")
        if not os.path.isfile(deny_list_file):
            self.skipTest(f"Deny list file not found: {deny_list_file}")
        # CUSTOM_FUZZY + fuzzy_mistakes allows up to 3 spelling mistakes when
        # matching deny-list phrases; only pages 1-3 are processed.
        result = run_cli_redact(
            script_path=self.script_path,
            input_file=input_file,
            output_dir=self.temp_output_dir,
            deny_list_file=deny_list_file,
            local_redact_entities=["CUSTOM_FUZZY"],
            page_min=1,
            page_max=3,
            ocr_method="Local text",
            fuzzy_mistakes=3,
        )
        self.assertTrue(
            result, "PDF redaction with limited pages and fuzzy matching should succeed"
        )
        print("βœ… PDF redaction with limited pages and fuzzy matching passed")

    def test_pdf_redaction_with_custom_lists(self):
        """Test: Redaction with custom deny list, allow list, and whole page redaction list"""
        print("\n=== Testing PDF redaction with custom lists ===")
        input_file = os.path.join(
            self.example_data_dir, "Partnership-Agreement-Toolkit_0_0.pdf"
        )
        deny_list_file = os.path.join(
            self.example_data_dir, "partnership_toolkit_redact_custom_deny_list.csv"
        )
        whole_page_file = os.path.join(
            self.example_data_dir, "partnership_toolkit_redact_some_pages.csv"
        )
        allow_list_file = os.path.join(
            self.example_data_dir, "test_allow_list_partnership.csv"
        )
        if not os.path.isfile(input_file):
            self.skipTest(f"Example file not found: {input_file}")
        if not os.path.isfile(deny_list_file):
            self.skipTest(f"Deny list file not found: {deny_list_file}")
        if not os.path.isfile(whole_page_file):
            self.skipTest(f"Whole page file not found: {whole_page_file}")
        if not os.path.isfile(allow_list_file):
            self.skipTest(f"Allow list file not found: {allow_list_file}")
        result = run_cli_redact(
            script_path=self.script_path,
            input_file=input_file,
            output_dir=self.temp_output_dir,
            deny_list_file=deny_list_file,
            redact_whole_page_file=whole_page_file,
            allow_list_file=allow_list_file,
        )
        self.assertTrue(result, "PDF redaction with custom lists should succeed")
        print("βœ… PDF redaction with custom lists passed")

    def test_image_redaction(self):
        """Test: Redact an image"""
        print("\n=== Testing image redaction ===")
        input_file = os.path.join(self.example_data_dir, "example_complaint_letter.jpg")
        if not os.path.isfile(input_file):
            self.skipTest(f"Example file not found: {input_file}")
        result = run_cli_redact(
            script_path=self.script_path,
            input_file=input_file,
            output_dir=self.temp_output_dir,
        )
        self.assertTrue(result, "Image redaction should succeed")
        print("βœ… Image redaction passed")

    def test_csv_anonymisation_specific_columns(self):
        """Test: Anonymise csv file with specific columns"""
        print("\n=== Testing CSV anonymisation with specific columns ===")
        input_file = os.path.join(self.example_data_dir, "combined_case_notes.csv")
        if not os.path.isfile(input_file):
            self.skipTest(f"Example file not found: {input_file}")
        result = run_cli_redact(
            script_path=self.script_path,
            input_file=input_file,
            output_dir=self.temp_output_dir,
            text_columns=["Case Note", "Client"],
            anon_strategy="replace_redacted",
        )
        self.assertTrue(
            result, "CSV anonymisation with specific columns should succeed"
        )
        print("βœ… CSV anonymisation with specific columns passed")

    def test_csv_anonymisation_different_strategy(self):
        """Test: Anonymise csv file with a different strategy (remove text completely)"""
        print("\n=== Testing CSV anonymisation with different strategy ===")
        input_file = os.path.join(self.example_data_dir, "combined_case_notes.csv")
        if not os.path.isfile(input_file):
            self.skipTest(f"Example file not found: {input_file}")
        result = run_cli_redact(
            script_path=self.script_path,
            input_file=input_file,
            output_dir=self.temp_output_dir,
            text_columns=["Case Note", "Client"],
            anon_strategy="redact",
        )
        self.assertTrue(
            result, "CSV anonymisation with different strategy should succeed"
        )
        print("βœ… CSV anonymisation with different strategy passed")

    def test_word_document_anonymisation(self):
        """Test: Anonymise a word document"""
        print("\n=== Testing Word document anonymisation ===")
        input_file = os.path.join(
            self.example_data_dir, "Bold minimalist professional cover letter.docx"
        )
        if not os.path.isfile(input_file):
            self.skipTest(f"Example file not found: {input_file}")
        result = run_cli_redact(
            script_path=self.script_path,
            input_file=input_file,
            output_dir=self.temp_output_dir,
            anon_strategy="replace_redacted",
        )
        self.assertTrue(result, "Word document anonymisation should succeed")
        print("βœ… Word document anonymisation passed")

    def test_aws_textract_comprehend_redaction(self):
        """Test: Use Textract and Comprehend for redaction"""
        print("\n=== Testing AWS Textract and Comprehend redaction ===")
        input_file = os.path.join(
            self.example_data_dir,
            "example_of_emails_sent_to_a_professor_before_applying.pdf",
        )
        if not os.path.isfile(input_file):
            self.skipTest(f"Example file not found: {input_file}")
        # Skip this test if AWS credentials are not available
        # This is a conditional test that may not work in all environments
        # NOTE: result is intentionally ignored — without AWS credentials the
        # run is expected to fail, so the test only checks it doesn't crash.
        run_cli_redact(
            script_path=self.script_path,
            input_file=input_file,
            output_dir=self.temp_output_dir,
            ocr_method="AWS Textract",
            pii_detector="AWS Comprehend",
        )
        # Note: This test may fail if AWS credentials are not configured
        # We'll mark it as passed if it runs without crashing
        print("βœ… AWS Textract and Comprehend redaction test completed")

    def test_aws_textract_signature_extraction(self):
        """Test: Redact specific pages with AWS OCR and signature extraction"""
        print("\n=== Testing AWS Textract with signature extraction ===")
        input_file = os.path.join(
            self.example_data_dir, "Partnership-Agreement-Toolkit_0_0.pdf"
        )
        if not os.path.isfile(input_file):
            self.skipTest(f"Example file not found: {input_file}")
        # Skip this test if AWS credentials are not available
        # NOTE: result intentionally ignored (see AWS note above in this method's
        # sibling test) — only checks the call completes without crashing.
        run_cli_redact(
            script_path=self.script_path,
            input_file=input_file,
            output_dir=self.temp_output_dir,
            page_min=6,
            page_max=7,
            ocr_method="AWS Textract",
            handwrite_signature_extraction=[
                "Extract handwriting",
                "Extract signatures",
            ],
        )
        # Note: This test may fail if AWS credentials are not configured
        print("βœ… AWS Textract with signature extraction test completed")

    def test_duplicate_pages_detection(self):
        """Test: Find duplicate pages in OCR files"""
        print("\n=== Testing duplicate pages detection ===")
        input_file = os.path.join(
            self.example_data_dir,
            "example_outputs",
            "doubled_output_joined.pdf_ocr_output.csv",
        )
        if not os.path.isfile(input_file):
            self.skipTest(f"Example OCR file not found: {input_file}")
        result = run_cli_redact(
            script_path=self.script_path,
            input_file=input_file,
            output_dir=self.temp_output_dir,
            task="deduplicate",
            duplicate_type="pages",
            similarity_threshold=0.95,
        )
        self.assertTrue(result, "Duplicate pages detection should succeed")
        print("βœ… Duplicate pages detection passed")

    def test_duplicate_line_level_detection(self):
        """Test: Find duplicate in OCR files at the line level"""
        print("\n=== Testing duplicate line level detection ===")
        input_file = os.path.join(
            self.example_data_dir,
            "example_outputs",
            "doubled_output_joined.pdf_ocr_output.csv",
        )
        if not os.path.isfile(input_file):
            self.skipTest(f"Example OCR file not found: {input_file}")
        # combine_pages=False switches matching from page level to line level.
        result = run_cli_redact(
            script_path=self.script_path,
            input_file=input_file,
            output_dir=self.temp_output_dir,
            task="deduplicate",
            duplicate_type="pages",
            similarity_threshold=0.95,
            combine_pages=False,
            min_word_count=3,
        )
        self.assertTrue(result, "Duplicate line level detection should succeed")
        print("βœ… Duplicate line level detection passed")

    def test_duplicate_tabular_detection(self):
        """Test: Find duplicate rows in tabular data"""
        print("\n=== Testing duplicate tabular detection ===")
        input_file = os.path.join(
            self.example_data_dir, "Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv"
        )
        if not os.path.isfile(input_file):
            self.skipTest(f"Example CSV file not found: {input_file}")
        result = run_cli_redact(
            script_path=self.script_path,
            input_file=input_file,
            output_dir=self.temp_output_dir,
            task="deduplicate",
            duplicate_type="tabular",
            text_columns=["text"],
            similarity_threshold=0.95,
        )
        self.assertTrue(result, "Duplicate tabular detection should succeed")
        print("βœ… Duplicate tabular detection passed")

    def test_textract_submit_document(self):
        """Test: Submit document to Textract for basic text analysis"""
        print("\n=== Testing Textract document submission ===")
        input_file = os.path.join(
            self.example_data_dir,
            "example_of_emails_sent_to_a_professor_before_applying.pdf",
        )
        if not os.path.isfile(input_file):
            self.skipTest(f"Example file not found: {input_file}")
        # Skip this test if AWS credentials are not available
        try:
            run_cli_redact(
                script_path=self.script_path,
                input_file=input_file,
                output_dir=self.temp_output_dir,
                task="textract",
                textract_action="submit",
            )
        except Exception as e:
            print(f"Textract test failed (expected without AWS credentials): {e}")
        # Note: This test may fail if AWS credentials are not configured
        print("βœ… Textract document submission test completed")

    def test_textract_submit_with_signatures(self):
        """Test: Submit document to Textract for analysis with signature extraction"""
        print("\n=== Testing Textract submission with signature extraction ===")
        input_file = os.path.join(
            self.example_data_dir, "Partnership-Agreement-Toolkit_0_0.pdf"
        )
        if not os.path.isfile(input_file):
            self.skipTest(f"Example file not found: {input_file}")
        # Skip this test if AWS credentials are not available
        try:
            run_cli_redact(
                script_path=self.script_path,
                input_file=input_file,
                output_dir=self.temp_output_dir,
                task="textract",
                textract_action="submit",
                extract_signatures=True,
            )
        except Exception as e:
            print(f"Textract test failed (expected without AWS credentials): {e}")
        # Note: This test may fail if AWS credentials are not configured
        print("βœ… Textract submission with signature extraction test completed")

    def test_textract_retrieve_results(self):
        """Test: Retrieve Textract results by job ID"""
        print("\n=== Testing Textract results retrieval ===")
        # Skip this test if AWS credentials are not available
        # This would require a valid job ID from a previous submission
        # For retrieve and list actions, we don't need a real input file
        try:
            run_cli_redact(
                script_path=self.script_path,
                input_file=None,  # No input file needed for retrieve action
                output_dir=self.temp_output_dir,
                task="textract",
                textract_action="retrieve",
                job_id="12345678-1234-1234-1234-123456789012",  # Dummy job ID
            )
        except Exception as e:
            print(f"Textract test failed (expected without AWS credentials): {e}")
        # Note: This test will likely fail with a dummy job ID, but that's expected
        print("βœ… Textract results retrieval test completed")

    def test_textract_list_jobs(self):
        """Test: List recent Textract jobs"""
        print("\n=== Testing Textract jobs listing ===")
        # Skip this test if AWS credentials are not available
        # For list action, we don't need a real input file
        try:
            run_cli_redact(
                script_path=self.script_path,
                input_file=None,  # No input file needed for list action
                output_dir=self.temp_output_dir,
                task="textract",
                textract_action="list",
            )
        except Exception as e:
            print(f"Textract test failed (expected without AWS credentials): {e}")
        # Note: This test may fail if AWS credentials are not configured
        print("βœ… Textract jobs listing test completed")
class TestGUIApp(unittest.TestCase):
    """Test suite for GUI application loading and basic functionality.

    Covers three progressively deeper checks of the Gradio app:
    importing the module, launching it headless, and loading its
    configuration constants. All tests require the project's conda
    environment (``gradio_image_annotation`` etc.) to be importable.
    """

    @classmethod
    def setUpClass(cls):
        """Set up test environment for GUI tests.

        Resolves ``app.py`` relative to this test file's parent directory
        and fails fast if it is missing.
        """
        cls.app_path = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), "app.py"
        )
        # Verify app.py exists
        if not os.path.isfile(cls.app_path):
            raise FileNotFoundError(f"App file not found: {cls.app_path}")
        print(f"GUI test setup complete. App: {cls.app_path}")

    def _ensure_app_importable(self):
        """Prepend the project root to sys.path so ``import app`` resolves."""
        parent_dir = os.path.dirname(os.path.dirname(__file__))
        if parent_dir not in sys.path:
            sys.path.insert(0, parent_dir)

    @staticmethod
    def _conda_hint(base_msg, exc):
        """Append a conda-environment hint to *base_msg* when *exc* indicates
        the 'redaction' environment is not active."""
        if "gradio_image_annotation" in str(exc):
            base_msg += "\n\nNOTE: This test requires the 'redaction' conda environment to be activated."
            base_msg += "\nPlease run: conda activate redaction"
            base_msg += "\nThen run this test again."
        return base_msg

    def test_app_import_and_initialization(self):
        """Test: Import app.py and check if the Gradio app object is created successfully."""
        print("\n=== Testing GUI app import and initialization ===")
        try:
            self._ensure_app_importable()
            # Import the app module
            import app

            # Check if the app object exists and is a Gradio Blocks object
            self.assertTrue(
                hasattr(app, "blocks"), "App object should exist in the module"
            )
            # Check if it's a Gradio Blocks instance
            import gradio as gr

            self.assertIsInstance(
                app.blocks, gr.Blocks, "App should be a Gradio Blocks instance"
            )
            print("βœ… GUI app import and initialisation passed")
        except ImportError as e:
            self.fail(self._conda_hint(f"Failed to import app module: {e}", e))
        except Exception as e:
            self.fail(f"Unexpected error during app initialization: {e}")

    def test_app_launch_headless(self):
        """Test: Launch the app in headless mode to verify it starts without errors."""
        print("\n=== Testing GUI app launch in headless mode ===")
        try:
            self._ensure_app_importable()
            # Import the app module
            import app

            # Event signals the launch attempt finished (success or failure);
            # launch_error records any exception raised during launch.
            app_launched = threading.Event()
            launch_error = None

            def launch_app():
                # BUG FIX: the previous version never assigned launch_error,
                # so launch failures were silently reported as passes.
                nonlocal launch_error
                try:
                    # Launch the app in headless mode with a short timeout
                    app.app.launch(
                        show_error=True,
                        inbrowser=False,  # Don't open browser
                        server_port=0,  # Use any available port
                        quiet=True,  # Suppress output
                        prevent_thread_lock=True,  # Don't block the main thread
                    )
                except Exception as e:
                    launch_error = e
                finally:
                    app_launched.set()

            # Start the app in a separate thread
            launch_thread = threading.Thread(target=launch_app)
            launch_thread.daemon = True
            launch_thread.start()
            # Wait for the app to launch (with timeout)
            if app_launched.wait(timeout=10):  # 10 second timeout
                if launch_error:
                    self.fail(f"App launch failed: {launch_error}")
                else:
                    print("βœ… GUI app launch in headless mode passed")
            else:
                self.fail("App launch timed out after 10 seconds")
        except Exception as e:
            self.fail(
                self._conda_hint(f"Unexpected error during app launch test: {e}", e)
            )

    def test_app_configuration_loading(self):
        """Test: Verify that the app can load its configuration without errors."""
        print("\n=== Testing GUI app configuration loading ===")
        try:
            self._ensure_app_importable()
            # Check if key configuration variables are accessible
            # These should be imported from tools.config
            from tools.config import (
                DEFAULT_LANGUAGE,
                GRADIO_SERVER_PORT,
                MAX_FILE_SIZE,
                PII_DETECTION_MODELS,
            )

            # Verify these are not None/empty
            self.assertIsNotNone(
                GRADIO_SERVER_PORT, "GRADIO_SERVER_PORT should be configured"
            )
            self.assertIsNotNone(MAX_FILE_SIZE, "MAX_FILE_SIZE should be configured")
            self.assertIsNotNone(
                DEFAULT_LANGUAGE, "DEFAULT_LANGUAGE should be configured"
            )
            self.assertIsNotNone(
                PII_DETECTION_MODELS, "PII_DETECTION_MODELS should be configured"
            )
            print("βœ… GUI app configuration loading passed")
        except ImportError as e:
            self.fail(self._conda_hint(f"Failed to import configuration: {e}", e))
        except Exception as e:
            self.fail(
                self._conda_hint(f"Unexpected error during configuration test: {e}", e)
            )
def run_all_tests():
    """Run all test examples and report results."""
    banner = "=" * 80
    print(banner)
    print("DOCUMENT REDACTION TEST SUITE")
    print(banner)
    print("This test suite includes:")
    print("- CLI examples from the epilog")
    print("- GUI application loading and initialization tests")
    print("Tests will be skipped if required example files are not found.")
    print("AWS-related tests may fail if credentials are not configured.")
    print(banner)

    # Assemble one suite from both test cases (CLI first, then GUI).
    loader = unittest.TestLoader()
    suite = unittest.TestSuite()
    suite.addTests(loader.loadTestsFromTestCase(TestCLIRedactExamples))
    suite.addTests(loader.loadTestsFromTestCase(TestGUIApp))

    # Run tests with detailed output
    result = unittest.TextTestRunner(verbosity=2, stream=None).run(suite)

    # Print summary
    print("\n" + banner)
    print("TEST SUMMARY")
    print(banner)
    print(f"Tests run: {result.testsRun}")
    print(f"Failures: {len(result.failures)}")
    print(f"Errors: {len(result.errors)}")
    print(f"Skipped: {len(result.skipped) if hasattr(result, 'skipped') else 0}")
    if result.failures:
        print("\nFAILURES:")
        for failed_case, tb_text in result.failures:
            print(f"- {failed_case}: {tb_text}")
    if result.errors:
        print("\nERRORS:")
        for errored_case, tb_text in result.errors:
            print(f"- {errored_case}: {tb_text}")

    success = not result.failures and not result.errors
    print(f"\nOverall result: {'βœ… PASSED' if success else '❌ FAILED'}")
    print(banner)
    return success
if __name__ == "__main__":
    # Run the test suite and propagate the outcome as the process exit code.
    # sys.exit is used instead of the site-builtin exit(), which is not
    # guaranteed to exist (e.g. under `python -S` or frozen interpreters).
    success = run_all_tests()
    sys.exit(0 if success else 1)