seanpedrickcase committed on
Commit
546be9b
·
0 Parent(s):

Sync: Merge pull request #108 from seanpedrick-case/dev

Browse files

Can now save all output files to a specified S3 bucket and folder. Ad…

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .coveragerc +56 -0
  2. .dockerignore +38 -0
  3. .gitattributes +8 -0
  4. .github/scripts/setup_test_data.py +311 -0
  5. .github/workflow_README.md +183 -0
  6. .github/workflows/archive_workflows/multi-os-test.yml +109 -0
  7. .github/workflows/ci.yml +260 -0
  8. .github/workflows/simple-test.yml +67 -0
  9. .github/workflows/sync_to_hf.yml +53 -0
  10. .github/workflows/sync_to_hf_zero_gpu.yml +53 -0
  11. .gitignore +41 -0
  12. DocRedactApp.spec +66 -0
  13. Dockerfile +186 -0
  14. README.md +1261 -0
  15. _quarto.yml +28 -0
  16. app.py +0 -0
  17. cdk/__init__.py +0 -0
  18. cdk/app.py +83 -0
  19. cdk/cdk_config.py +362 -0
  20. cdk/cdk_functions.py +1482 -0
  21. cdk/cdk_stack.py +1869 -0
  22. cdk/check_resources.py +375 -0
  23. cdk/post_cdk_build_quickstart.py +40 -0
  24. cdk/requirements.txt +5 -0
  25. cli_redact.py +1431 -0
  26. entrypoint.sh +33 -0
  27. example_config.env +49 -0
  28. example_data/Bold minimalist professional cover letter.docx +3 -0
  29. example_data/Difficult handwritten note.jpg +3 -0
  30. example_data/Example-cv-university-graduaty-hr-role-with-photo-2.pdf +3 -0
  31. example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv +0 -0
  32. example_data/Partnership-Agreement-Toolkit_0_0.pdf +3 -0
  33. example_data/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv +2 -0
  34. example_data/combined_case_notes.csv +19 -0
  35. example_data/combined_case_notes.xlsx +3 -0
  36. example_data/doubled_output_joined.pdf +3 -0
  37. example_data/example_complaint_letter.jpg +3 -0
  38. example_data/example_of_emails_sent_to_a_professor_before_applying.pdf +3 -0
  39. example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv +277 -0
  40. example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_review_file.csv +77 -0
  41. example_data/example_outputs/Partnership-Agreement-Toolkit_0_0_ocr_results_with_words_textract.csv +0 -0
  42. example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv +923 -0
  43. example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_ocr_output_textract.csv +40 -0
  44. example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_ocr_results_with_words_textract.csv +432 -0
  45. example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_review_file.csv +15 -0
  46. example_data/graduate-job-example-cover-letter.pdf +3 -0
  47. example_data/partnership_toolkit_redact_custom_deny_list.csv +2 -0
  48. example_data/partnership_toolkit_redact_some_pages.csv +2 -0
  49. example_data/test_allow_list_graduate.csv +1 -0
  50. example_data/test_allow_list_partnership.csv +1 -0
.coveragerc ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [run]
2
+ source = .
3
+ omit =
4
+ */tests/*
5
+ */test/*
6
+ */__pycache__/*
7
+ */venv/*
8
+ */env/*
9
+ */build/*
10
+ */dist/*
11
+ */cdk/*
12
+ */docs/*
13
+ */example_data/*
14
+ */examples/*
15
+ */feedback/*
16
+ */logs/*
17
+ */old_code/*
18
+ */output/*
19
+ */tmp/*
20
+ */usage/*
21
+ */tld/*
22
+ */tesseract/*
23
+ */poppler/*
24
+ config*.py
25
+ setup.py
26
+ lambda_entrypoint.py
27
+ entrypoint.sh
28
+ cli_redact.py
29
+ load_dynamo_logs.py
30
+ load_s3_logs.py
31
+ *.spec
32
+ Dockerfile
33
+ *.qmd
34
+ *.md
35
+ *.txt
36
+ *.yml
37
+ *.yaml
38
+ *.json
39
+ *.csv
40
+ *.env
41
+ *.bat
42
+ *.ps1
43
+ *.sh
44
+
45
+ [report]
46
+ exclude_lines =
47
+ pragma: no cover
48
+ def __repr__
49
+ if self.debug:
50
+ if settings.DEBUG
51
+ raise AssertionError
52
+ raise NotImplementedError
53
+ if 0:
54
+ if __name__ == .__main__.:
55
+ class .*\bProtocol\):
56
+ @(abc\.)?abstractmethod
.dockerignore ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.url
2
+ *.ipynb
3
+ *.pyc
4
+ examples/*
5
+ processing/*
6
+ tools/__pycache__/*
7
+ old_code/*
8
+ tesseract/*
9
+ poppler/*
10
+ build/*
11
+ dist/*
12
+ docs/*
13
+ build_deps/*
14
+ user_guide/*
15
+ cdk/config/*
16
+ tld/*
17
+ cdk/config/*
18
+ cdk/cdk.out/*
19
+ cdk/archive/*
20
+ cdk.json
21
+ cdk.context.json
22
+ .quarto/*
23
+ logs/
24
+ output/
25
+ input/
26
+ feedback/
27
+ config/
28
+ usage/
29
+ test/config/*
30
+ test/feedback/*
31
+ test/input/*
32
+ test/logs/*
33
+ test/output/*
34
+ test/tmp/*
35
+ test/usage/*
36
+ .ruff_cache/*
37
+ model_cache/*
38
+ sanitized_file/*
.gitattributes ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ *.pdf filter=lfs diff=lfs merge=lfs -text
2
+ *.jpg filter=lfs diff=lfs merge=lfs -text
3
+ *.xls filter=lfs diff=lfs merge=lfs -text
4
+ *.xlsx filter=lfs diff=lfs merge=lfs -text
5
+ *.docx filter=lfs diff=lfs merge=lfs -text
6
+ *.doc filter=lfs diff=lfs merge=lfs -text
7
+ *.png filter=lfs diff=lfs merge=lfs -text
8
+ *.ico filter=lfs diff=lfs merge=lfs -text
.github/scripts/setup_test_data.py ADDED
@@ -0,0 +1,311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Setup script for GitHub Actions test data.
4
+ Creates dummy test files when example data is not available.
5
+ """
6
+
7
+ import os
8
+ import sys
9
+
10
+ import pandas as pd
11
+
12
+
13
def create_directories():
    """Create necessary directories."""
    # Both the data folder and its outputs subfolder must exist before
    # any of the dummy files are written.
    for folder in ("example_data", "example_data/example_outputs"):
        os.makedirs(folder, exist_ok=True)
        print(f"Created directory: {folder}")
20
+
21
+
22
def _import_reportlab():
    """Return ``(canvas module, letter pagesize)`` from reportlab.

    Attempts a one-off ``pip install reportlab`` when the package is missing.
    Returns ``(None, None)`` if it still cannot be imported (e.g. offline CI),
    so the caller can fall back to plain-text placeholder files.
    """
    try:
        from reportlab.lib.pagesizes import letter
        from reportlab.pdfgen import canvas

        return canvas, letter
    except ImportError:
        pass
    try:
        import subprocess

        # Use the running interpreter's pip, not whatever "pip" is on PATH.
        subprocess.check_call([sys.executable, "-m", "pip", "install", "reportlab"])
        from reportlab.lib.pagesizes import letter
        from reportlab.pdfgen import canvas

        return canvas, letter
    except Exception as exc:  # install failed or package still unimportable
        print(f"ReportLab unavailable: {exc}")
        return None, None


def _write_placeholder_pdfs():
    """Write plain-text stand-ins under the PDF names when reportlab is absent."""
    print("ReportLab not available, skipping PDF creation")
    placeholders = {
        "example_data/example_of_emails_sent_to_a_professor_before_applying.pdf": "This is a dummy PDF file for testing",
        "example_data/Partnership-Agreement-Toolkit_0_0.pdf": "This is a dummy Partnership Agreement PDF file for testing",
        "example_data/graduate-job-example-cover-letter.pdf": "This is a dummy cover letter PDF file for testing",
    }
    for path, text in placeholders.items():
        with open(path, "w") as f:
            f.write(text)
    print("Created dummy text files instead of PDFs")


def create_dummy_pdf():
    """Create dummy PDFs for testing.

    Bug fix vs. the original: the old ``except ImportError`` wrapped the
    drawing code, where it could never fire (the reportlab imports were
    already resolved above it), so a failed auto-install crashed with
    ``CalledProcessError`` instead of writing the text fallbacks. The
    fallback now runs whenever reportlab genuinely cannot be imported.
    """
    canvas, letter = _import_reportlab()
    if canvas is None:
        _write_placeholder_pdfs()
        return

    # path -> list of pages; each page is a list of lines drawn at x=100,
    # starting at y=750 and stepping down 50pt per line (original layout).
    documents = {
        "example_data/example_of_emails_sent_to_a_professor_before_applying.pdf": [
            [
                "This is a test document for redaction testing.",
                "Email: test@example.com",
                "Phone: 123-456-7890",
                "Name: John Doe",
                "Address: 123 Test Street, Test City, TC 12345",
            ],
            [
                "Second page content",
                "More test data: jane.doe@example.com",
                "Another phone: 987-654-3210",
            ],
        ],
        "example_data/Partnership-Agreement-Toolkit_0_0.pdf": [
            [
                "Partnership Agreement Toolkit",
                "This is a test partnership agreement document.",
                "Contact: partnership@example.com",
                "Phone: (555) 123-4567",
                "Address: 123 Partnership Street, City, State 12345",
            ],
            [
                "Page 2 - Partnership Details",
                "More partnership information here.",
                "Contact: info@partnership.org",
            ],
            [
                "Page 3 - Terms and Conditions",
                "Terms and conditions content.",
                "Legal contact: legal@partnership.org",
            ],
        ],
        "example_data/graduate-job-example-cover-letter.pdf": [
            [
                "Cover Letter Example",
                "Dear Hiring Manager,",
                "I am writing to apply for the position.",
                "Contact: applicant@example.com",
                "Phone: (555) 987-6543",
                "Address: 456 Job Street, Employment City, EC 54321",
                "Sincerely,",
                "John Applicant",
            ],
        ],
    }

    print(f"Directory exists: {os.path.exists('example_data')}")
    for path, pages in documents.items():
        print(f"Creating PDF: {path}")
        c = canvas.Canvas(path, pagesize=letter)
        for page_no, lines in enumerate(pages):
            if page_no:
                c.showPage()  # finish the previous page before the next one
            for i, line in enumerate(lines):
                c.drawString(100, 750 - 50 * i, line)
        c.save()
        print(f"Created dummy PDF: {path}")
123
+
124
+
125
def create_dummy_csv():
    """Create dummy CSV files for testing."""
    # Case-notes CSV consumed by the tabular redaction tests.
    case_notes = pd.DataFrame(
        {
            "Case Note": [
                "Client visited for consultation regarding housing issues",
                "Follow-up appointment scheduled for next week",
                "Documentation submitted for review",
            ],
            "Client": ["John Smith", "Jane Doe", "Bob Johnson"],
            "Date": ["2024-01-15", "2024-01-16", "2024-01-17"],
        }
    )
    case_notes.to_csv("example_data/combined_case_notes.csv", index=False)
    print("Created dummy CSV: example_data/combined_case_notes.csv")

    # Stand-in for the Lambeth strategy-document OCR export.
    lambeth = pd.DataFrame(
        {
            "text": [
                "Lambeth 2030 vision document content",
                "Our Future Our Lambeth strategic plan",
                "Community engagement and development",
            ],
            "page": [1, 2, 3],
        }
    )
    lambeth.to_csv(
        "example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv", index=False
    )
    print("Created dummy CSV: example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv")
155
+
156
+
157
def create_dummy_word_doc():
    """Create dummy Word document."""
    # Only the import can raise ImportError here, so guard just that.
    try:
        from docx import Document
    except ImportError:
        print("python-docx not available, skipping Word document creation")
        return

    body_lines = (
        "This is a test document for redaction testing.",
        "Contact Information:",
        "Email: test@example.com",
        "Phone: 123-456-7890",
        "Name: John Doe",
        "Address: 123 Test Street, Test City, TC 12345",
    )
    document = Document()
    document.add_heading("Test Document for Redaction", 0)
    for text in body_lines:
        document.add_paragraph(text)

    document.save("example_data/Bold minimalist professional cover letter.docx")
    print("Created dummy Word document")
176
+
177
+
178
def create_allow_deny_lists():
    """Create dummy allow/deny lists."""
    # Allow lists: identical content written to both target files.
    allow_df = pd.DataFrame({"word": ["test", "example", "document"]})
    for target in (
        "example_data/test_allow_list_graduate.csv",
        "example_data/test_allow_list_partnership.csv",
    ):
        allow_df.to_csv(target, index=False)
    print("Created allow lists")

    # Deny lists: identical content written to both target files.
    deny_df = pd.DataFrame({"word": ["sensitive", "confidential", "private"]})
    for target in (
        "example_data/partnership_toolkit_redact_custom_deny_list.csv",
        "example_data/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv",
    ):
        deny_df.to_csv(target, index=False)
    print("Created deny lists")

    # Whole-page redaction list: pages to redact in full.
    pd.DataFrame({"page": [1, 2]}).to_csv(
        "example_data/partnership_toolkit_redact_some_pages.csv", index=False
    )
    print("Created whole page redaction list")
207
+
208
+
209
def create_ocr_output():
    """Create dummy OCR output CSV."""
    # One row per page: (page, text, left, top, width, height, line).
    rows = [
        (1, "This is page 1 content with some text", 0.1, 0.95, 0.05, 0.01, 1),
        (2, "This is page 2 content with different text", 0.3, 0.92, 0.02, 0.02, 2),
        (3, "This is page 3 content with more text", 0.5, 0.88, 0.02, 0.02, 3),
    ]
    frame = pd.DataFrame(
        rows, columns=["page", "text", "left", "top", "width", "height", "line"]
    )
    frame.to_csv(
        "example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv",
        index=False,
    )
    print("Created dummy OCR output CSV")
230
+
231
+
232
def create_dummy_image():
    """Create dummy image for testing."""
    # Only the import can raise ImportError; skip gracefully without PIL.
    try:
        from PIL import Image, ImageDraw, ImageFont
    except ImportError:
        print("PIL not available, skipping image creation")
        return

    img = Image.new("RGB", (800, 600), color="white")
    drawer = ImageDraw.Draw(img)

    # Font fallback chain: Linux DejaVu -> macOS Arial -> PIL built-in.
    try:
        font = ImageFont.truetype(
            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 20
        )
    except Exception as e:
        print(f"Error loading DejaVuSans font: {e}")
        try:
            font = ImageFont.truetype("/System/Library/Fonts/Arial.ttf", 20)
        except Exception as e:
            print(f"Error loading Arial font: {e}")
            font = ImageFont.load_default()

    # Lines are stacked 50px apart starting at y=50.
    lines = (
        "Test Document for Redaction",
        "Email: test@example.com",
        "Phone: 123-456-7890",
        "Name: John Doe",
        "Address: 123 Test Street",
    )
    for offset, text in enumerate(lines):
        drawer.text((50, 50 + 50 * offset), text, fill="black", font=font)

    img.save("example_data/example_complaint_letter.jpg")
    print("Created dummy image")
265
+
266
+
267
def main():
    """Main setup function."""
    print("Setting up test data for GitHub Actions...")
    print(f"Current working directory: {os.getcwd()}")
    print(f"Python version: {sys.version}")

    # Run every setup step in order; directories must be created first.
    setup_steps = (
        create_directories,
        create_dummy_pdf,
        create_dummy_csv,
        create_dummy_word_doc,
        create_allow_deny_lists,
        create_ocr_output,
        create_dummy_image,
    )
    for step in setup_steps:
        step()

    print("\nTest data setup complete!")
    print("Created files:")
    for root, _, files in os.walk("example_data"):
        for name in files:
            file_path = os.path.join(root, name)
            print(f" {file_path}")
            # Report sizes so CI logs show each file is non-empty.
            if os.path.exists(file_path):
                print(f" Size: {os.path.getsize(file_path)} bytes")
            else:
                print(" WARNING: File does not exist!")

    # Files the downstream test suite cannot run without.
    critical_files = [
        "example_data/Partnership-Agreement-Toolkit_0_0.pdf",
        "example_data/graduate-job-example-cover-letter.pdf",
        "example_data/example_of_emails_sent_to_a_professor_before_applying.pdf",
    ]
    print("\nVerifying critical test files:")
    for file_path in critical_files:
        if os.path.exists(file_path):
            print(f"✅ {file_path} exists ({os.path.getsize(file_path)} bytes)")
        else:
            print(f"❌ {file_path} MISSING!")


if __name__ == "__main__":
    main()
.github/workflow_README.md ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GitHub Actions CI/CD Setup
2
+
3
+ This directory contains GitHub Actions workflows for automated testing of the CLI redaction application.
4
+
5
+ ## Workflows Overview
6
+
7
+ ### 1. **Simple Test Run** (`.github/workflows/simple-test.yml`)
8
+ - **Purpose**: Basic test execution
9
+ - **Triggers**: Push to main/dev, Pull requests
10
+ - **OS**: Ubuntu Latest
11
+ - **Python**: 3.11
12
+ - **Features**:
13
+ - Installs system dependencies
14
+ - Sets up test data
15
+ - Runs CLI tests
16
+ - Runs pytest
17
+
18
+ ### 2. **Comprehensive CI/CD** (`.github/workflows/ci.yml`)
19
+ - **Purpose**: Full CI/CD pipeline
20
+ - **Features**:
21
+ - Linting (Ruff, Black)
22
+ - Unit tests (Python 3.10, 3.11, 3.12)
23
+ - Integration tests
24
+ - Security scanning (Safety, Bandit)
25
+ - Coverage reporting
26
+ - Package building (on main branch)
27
+
28
+ ### 3. **Multi-OS Testing** (`.github/workflows/archive_workflows/multi-os-test.yml`)
29
+ - **Purpose**: Cross-platform testing
30
+ - **OS**: Ubuntu, macOS (Windows not included currently but may be reintroduced)
31
+ - **Python**: 3.10, 3.11, 3.12
32
+ - **Features**: Tests compatibility across different operating systems
33
+
34
+ ### 4. **Basic Test Suite** (`.github/workflows/test.yml`)
35
+ - **Purpose**: Original test workflow
36
+ - **Features**:
37
+ - Multiple Python versions
38
+ - System dependency installation
39
+ - Test data creation
40
+ - Coverage reporting
41
+
42
+ ## Setup Scripts
43
+
44
+ ### Test Data Setup (`.github/scripts/setup_test_data.py`)
45
+ Creates dummy test files when example data is not available:
46
+ - PDF documents
47
+ - CSV files
48
+ - Word documents
49
+ - Images
50
+ - Allow/deny lists
51
+ - OCR output files
52
+
53
+ ## Usage
54
+
55
+ ### Running Tests Locally
56
+
57
+ ```bash
58
+ # Install dependencies
59
+ pip install -r requirements.txt
60
+ pip install pytest pytest-cov
61
+
62
+ # Setup test data
63
+ python .github/scripts/setup_test_data.py
64
+
65
+ # Run tests
66
+ cd test
67
+ python test.py
68
+ ```
69
+
70
+ ### GitHub Actions Triggers
71
+
72
+ 1. **Push to main/dev**: Runs all tests
73
+ 2. **Pull Request**: Runs tests and linting
74
+ 3. **Daily Schedule**: Runs tests at 2 AM UTC
75
+ 4. **Manual Trigger**: Can be triggered manually from GitHub
76
+
77
+ ## Configuration
78
+
79
+ ### Environment Variables
80
+ - `PYTHON_VERSION`: Default Python version (3.11)
81
+ - `PYTHONPATH`: Set automatically for test discovery
82
+
83
+ ### Caching
84
+ - Pip dependencies are cached for faster builds
85
+ - Cache key based on requirements.txt hash
86
+
87
+ ### Artifacts
88
+ - Test results (JUnit XML)
89
+ - Coverage reports (HTML, XML)
90
+ - Security reports
91
+ - Build artifacts (on main branch)
92
+
93
+ ## Test Data
94
+
95
+ The workflows automatically create test data when example files are missing:
96
+
97
+ ### Required Files Created:
98
+ - `example_data/example_of_emails_sent_to_a_professor_before_applying.pdf`
99
+ - `example_data/combined_case_notes.csv`
100
+ - `example_data/Bold minimalist professional cover letter.docx`
101
+ - `example_data/example_complaint_letter.jpg`
102
+ - `example_data/test_allow_list_*.csv`
103
+ - `example_data/partnership_toolkit_redact_*.csv`
104
+ - `example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv`
105
+
106
+ ### Dependencies Installed:
107
+ - **System**: tesseract-ocr, poppler-utils, OpenGL libraries
108
+ - **Python**: All requirements.txt packages + pytest, reportlab, pillow
109
+
110
+ ## Workflow Status
111
+
112
+ ### Success Criteria:
113
+ - ✅ All tests pass
114
+ - ✅ No linting errors
115
+ - ✅ Security checks pass
116
+ - ✅ Coverage meets threshold (if configured)
117
+
118
+ ### Failure Handling:
119
+ - Tests are designed to skip gracefully if files are missing
120
+ - AWS tests are expected to fail without credentials
121
+ - System dependency failures are handled with fallbacks
122
+
123
+ ## Customization
124
+
125
+ ### Adding New Tests:
126
+ 1. Add test methods to `test/test.py`
127
+ 2. Update test data in `setup_test_data.py` if needed
128
+ 3. Tests will automatically run in all workflows
129
+
130
+ ### Modifying Workflows:
131
+ 1. Edit the appropriate `.yml` file
132
+ 2. Test locally first
133
+ 3. Push to trigger the workflow
134
+
135
+ ### Environment-Specific Settings:
136
+ - **Ubuntu**: Full system dependencies
137
+ - **Windows**: Python packages only
138
+ - **macOS**: Homebrew dependencies
139
+
140
+ ## Troubleshooting
141
+
142
+ ### Common Issues:
143
+
144
+ 1. **Missing Dependencies**:
145
+ - Check system dependency installation
146
+ - Verify Python package versions
147
+
148
+ 2. **Test Failures**:
149
+ - Check test data creation
150
+ - Verify file paths
151
+ - Review test output logs
152
+
153
+ 3. **AWS Test Failures**:
154
+ - Expected without credentials
155
+ - Tests are designed to handle this gracefully
156
+
157
+ 4. **System Dependency Issues**:
158
+ - Different OS have different requirements
159
+ - Check the specific OS section in workflows
160
+
161
+ ### Debug Mode:
162
+ Add `--verbose` or `-v` flags to pytest commands for more detailed output.
163
+
164
+ ## Security
165
+
166
+ - Dependencies are scanned with Safety
167
+ - Code is scanned with Bandit
168
+ - No secrets are exposed in logs
169
+ - Test data is temporary and cleaned up
170
+
171
+ ## Performance
172
+
173
+ - Tests run in parallel where possible
174
+ - Dependencies are cached
175
+ - Only necessary system packages are installed
176
+ - Test data is created efficiently
177
+
178
+ ## Monitoring
179
+
180
+ - Workflow status is visible in GitHub Actions tab
181
+ - Coverage reports are uploaded to Codecov
182
+ - Test results are available as artifacts
183
+ - Security reports are generated and stored
.github/workflows/archive_workflows/multi-os-test.yml ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Multi-OS Test
2
+
3
+ on:
4
+ push:
5
+ branches: [ main ]
6
+ pull_request:
7
+ branches: [ main ]
8
+
9
+ permissions:
10
+ contents: read
11
+ actions: read
12
+
13
+ jobs:
14
+ test:
15
+ runs-on: ${{ matrix.os }}
16
+ strategy:
17
+ matrix:
18
+ os: [ubuntu-latest, macos-latest] # windows-latest, not included as tesseract cannot be installed silently
19
+ python-version: ["3.11", "3.12", "3.13"]
20
+ exclude:
21
+ # Exclude some combinations to reduce CI time
22
+ #- os: windows-latest
23
+ # python-version: ["3.12", "3.13"]
24
+ - os: macos-latest
25
+ python-version: ["3.12", "3.13"]
26
+
27
+ steps:
28
+ - uses: actions/checkout@v4
29
+
30
+ - name: Set up Python ${{ matrix.python-version }}
31
+ uses: actions/setup-python@v4
32
+ with:
33
+ python-version: ${{ matrix.python-version }}
34
+
35
+ - name: Install system dependencies (Ubuntu)
36
+ if: matrix.os == 'ubuntu-latest'
37
+ run: |
38
+ sudo apt-get update
39
+ sudo apt-get install -y \
40
+ tesseract-ocr \
41
+ tesseract-ocr-eng \
42
+ poppler-utils \
43
+ libgl1-mesa-dri \
44
+ libglib2.0-0
45
+
46
+ - name: Install system dependencies (macOS)
47
+ if: matrix.os == 'macos-latest'
48
+ run: |
49
+ brew install tesseract poppler
50
+
51
+ - name: Install system dependencies (Windows)
52
+ if: matrix.os == 'windows-latest'
53
+ run: |
54
+ # Create tools directory
55
+ if (!(Test-Path "C:\tools")) {
56
+ mkdir C:\tools
57
+ }
58
+
59
+ # Download and install Tesseract
60
+ $tesseractUrl = "https://github.com/tesseract-ocr/tesseract/releases/download/5.5.0/tesseract-ocr-w64-setup-5.5.0.20241111.exe"
61
+ $tesseractInstaller = "C:\tools\tesseract-installer.exe"
62
+ Invoke-WebRequest -Uri $tesseractUrl -OutFile $tesseractInstaller
63
+
64
+ # Install Tesseract silently
65
+ Start-Process -FilePath $tesseractInstaller -ArgumentList "/S", "/D=C:\tools\tesseract" -Wait
66
+
67
+ # Download and extract Poppler
68
+ $popplerUrl = "https://github.com/oschwartz10612/poppler-windows/releases/download/v25.07.0-0/Release-25.07.0-0.zip"
69
+ $popplerZip = "C:\tools\poppler.zip"
70
+ Invoke-WebRequest -Uri $popplerUrl -OutFile $popplerZip
71
+
72
+ # Extract Poppler
73
+ Expand-Archive -Path $popplerZip -DestinationPath C:\tools\poppler -Force
74
+
75
+ # Add to PATH
76
+ echo "C:\tools\tesseract" >> $env:GITHUB_PATH
77
+ echo "C:\tools\poppler\poppler-25.07.0\Library\bin" >> $env:GITHUB_PATH
78
+
79
+ # Set environment variables for your application
80
+ echo "TESSERACT_FOLDER=C:\tools\tesseract" >> $env:GITHUB_ENV
81
+ echo "POPPLER_FOLDER=C:\tools\poppler\poppler-25.07.0\Library\bin" >> $env:GITHUB_ENV
82
+ echo "TESSERACT_DATA_FOLDER=C:\tools\tesseract\tessdata" >> $env:GITHUB_ENV
83
+
84
+ # Verify installation using full paths (since PATH won't be updated in current session)
85
+ & "C:\tools\tesseract\tesseract.exe" --version
86
+ & "C:\tools\poppler\poppler-25.07.0\Library\bin\pdftoppm.exe" -v
87
+
88
+ - name: Install Python dependencies
89
+ run: |
90
+ python -m pip install --upgrade pip
91
+ pip install -r requirements.txt
92
+ pip install pytest pytest-cov reportlab pillow
93
+
94
+ - name: Download spaCy model
95
+ run: |
96
+ python -m spacy download en_core_web_lg
97
+
98
+ - name: Setup test data
99
+ run: |
100
+ python .github/scripts/setup_test_data.py
101
+
102
+ - name: Run CLI tests
103
+ run: |
104
+ cd test
105
+ python test.py
106
+
107
+ - name: Run tests with pytest
108
+ run: |
109
+ pytest test/test.py -v --tb=short
.github/workflows/ci.yml ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: CI/CD Pipeline
2
+
3
+ on:
4
+ push:
5
+ branches: [ main ]
6
+ pull_request:
7
+ branches: [ main ]
8
+ #schedule:
9
+ # Run tests daily at 2 AM UTC
10
+ # - cron: '0 2 * * *'
11
+
12
+ permissions:
13
+ contents: read
14
+ actions: read
15
+ pull-requests: write
16
+ issues: write
17
+
18
+ env:
19
+ PYTHON_VERSION: "3.11"
20
+
21
+ jobs:
22
+ lint:
23
+ runs-on: ubuntu-latest
24
+ steps:
25
+ - uses: actions/checkout@v4
26
+
27
+ - name: Set up Python
28
+ uses: actions/setup-python@v4
29
+ with:
30
+ python-version: ${{ env.PYTHON_VERSION }}
31
+
32
+ - name: Install dependencies
33
+ run: |
34
+ python -m pip install --upgrade pip
35
+ pip install ruff black
36
+
37
+ - name: Run Ruff linter
38
+ run: ruff check .
39
+
40
+ - name: Run Black formatter check
41
+ run: black --check .
42
+
43
+ test-unit:
44
+ runs-on: ubuntu-latest
45
+ strategy:
46
+ matrix:
47
+ python-version: [3.11, 3.12, 3.13]
48
+
49
+ steps:
50
+ - uses: actions/checkout@v4
51
+
52
+ - name: Set up Python ${{ matrix.python-version }}
53
+ uses: actions/setup-python@v4
54
+ with:
55
+ python-version: ${{ matrix.python-version }}
56
+
57
+ - name: Cache pip dependencies
58
+ uses: actions/cache@v4
59
+ with:
60
+ path: ~/.cache/pip
61
+ key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
62
+ restore-keys: |
63
+ ${{ runner.os }}-pip-
64
+
65
+ - name: Install system dependencies
66
+ run: |
67
+ sudo apt-get update
68
+ sudo apt-get install -y \
69
+ tesseract-ocr \
70
+ tesseract-ocr-eng \
71
+ poppler-utils \
72
+ libgl1-mesa-dri \
73
+ libglib2.0-0 \
74
+ libsm6 \
75
+ libxext6 \
76
+ libxrender-dev \
77
+ libgomp1
78
+
79
+ - name: Install Python dependencies
80
+ run: |
81
+ python -m pip install --upgrade pip
82
+ pip install -r requirements_lightweight.txt
83
+ pip install pytest pytest-cov pytest-html pytest-xdist reportlab pillow
84
+
85
+ - name: Download spaCy model
86
+ run: |
87
+ python -m spacy download en_core_web_lg
88
+
89
+ - name: Setup test data
90
+ run: |
91
+ python .github/scripts/setup_test_data.py
92
+ echo "Setup script completed. Checking results:"
93
+ ls -la example_data/ || echo "example_data directory not found"
94
+
95
+ - name: Verify test data files
96
+ run: |
97
+ echo "Checking if critical test files exist:"
98
+ ls -la example_data/
99
+ echo "Checking for specific PDF files:"
100
+ ls -la example_data/*.pdf || echo "No PDF files found"
101
+ echo "Checking file sizes:"
102
+ find example_data -name "*.pdf" -exec ls -lh {} \;
103
+
104
+ - name: Clean up problematic config files
105
+ run: |
106
+ rm -f config*.py || true
107
+
108
+ - name: Run CLI tests
109
+ run: |
110
+ cd test
111
+ python test.py
112
+
113
+ - name: Run tests with pytest
114
+ run: |
115
+ pytest test/test.py -v --tb=short --junitxml=test-results.xml
116
+
117
+ - name: Run tests with coverage
118
+ run: |
119
+ pytest test/test.py --cov=. --cov-config=.coveragerc --cov-report=xml --cov-report=html --cov-report=term
120
+
121
+ #- name: Upload coverage to Codecov - not necessary
122
+ # uses: codecov/codecov-action@v3
123
+ # if: matrix.python-version == '3.11'
124
+ # with:
125
+ # file: ./coverage.xml
126
+ # flags: unittests
127
+ # name: codecov-umbrella
128
+ # fail_ci_if_error: false
129
+
130
+ - name: Upload test results
131
+ uses: actions/upload-artifact@v4
132
+ if: always()
133
+ with:
134
+ name: test-results-python-${{ matrix.python-version }}
135
+ path: |
136
+ test-results.xml
137
+ htmlcov/
138
+ coverage.xml
139
+
140
+ test-integration:
141
+ runs-on: ubuntu-latest
142
+ needs: [lint, test-unit]
143
+
144
+ steps:
145
+ - uses: actions/checkout@v4
146
+
147
+ - name: Set up Python
148
+ uses: actions/setup-python@v4
149
+ with:
150
+ python-version: ${{ env.PYTHON_VERSION }}
151
+
152
+ - name: Install dependencies
153
+ run: |
154
+ python -m pip install --upgrade pip
155
+ pip install -r requirements_lightweight.txt
156
+ pip install pytest pytest-cov reportlab pillow
157
+
158
+ - name: Install system dependencies
159
+ run: |
160
+ sudo apt-get update
161
+ sudo apt-get install -y \
162
+ tesseract-ocr \
163
+ tesseract-ocr-eng \
164
+ poppler-utils \
165
+ libgl1-mesa-dri \
166
+ libglib2.0-0
167
+
168
+ - name: Download spaCy model
169
+ run: |
170
+ python -m spacy download en_core_web_lg
171
+
172
+ - name: Setup test data
173
+ run: |
174
+ python .github/scripts/setup_test_data.py
175
+ echo "Setup script completed. Checking results:"
176
+ ls -la example_data/ || echo "example_data directory not found"
177
+
178
+ - name: Verify test data files
179
+ run: |
180
+ echo "Checking if critical test files exist:"
181
+ ls -la example_data/
182
+ echo "Checking for specific PDF files:"
183
+ ls -la example_data/*.pdf || echo "No PDF files found"
184
+ echo "Checking file sizes:"
185
+ find example_data -name "*.pdf" -exec ls -lh {} \;
186
+
187
+ - name: Run integration tests
188
+ run: |
189
+ cd test
190
+ python demo_single_test.py
191
+
192
+ - name: Test CLI help
193
+ run: |
194
+ python cli_redact.py --help
195
+
196
+ - name: Test CLI version
197
+ run: |
198
+ python -c "import sys; print(f'Python {sys.version}')"
199
+
200
+ security:
201
+ runs-on: ubuntu-latest
202
+ steps:
203
+ - uses: actions/checkout@v4
204
+
205
+ - name: Set up Python
206
+ uses: actions/setup-python@v4
207
+ with:
208
+ python-version: ${{ env.PYTHON_VERSION }}
209
+
210
+ - name: Install dependencies
211
+ run: |
212
+ python -m pip install --upgrade pip
213
+ pip install safety bandit
214
+
215
+ #- name: Run safety scan - removed as now requires login
216
+ # run: |
217
+ # safety scan -r requirements.txt
218
+
219
+ - name: Run bandit security check
220
+ run: |
221
+ bandit -r . -f json -o bandit-report.json || true
222
+
223
+ - name: Upload security report
224
+ uses: actions/upload-artifact@v4
225
+ if: always()
226
+ with:
227
+ name: security-report
228
+ path: bandit-report.json
229
+
230
+ build:
231
+ runs-on: ubuntu-latest
232
+ needs: [lint, test-unit]
233
+ if: github.event_name == 'push' && github.ref == 'refs/heads/main'
234
+
235
+ steps:
236
+ - uses: actions/checkout@v4
237
+
238
+ - name: Set up Python
239
+ uses: actions/setup-python@v4
240
+ with:
241
+ python-version: ${{ env.PYTHON_VERSION }}
242
+
243
+ - name: Install build dependencies
244
+ run: |
245
+ python -m pip install --upgrade pip
246
+ pip install build twine
247
+
248
+ - name: Build package
249
+ run: |
250
+ python -m build
251
+
252
+ - name: Check package
253
+ run: |
254
+ twine check dist/*
255
+
256
+ - name: Upload build artifacts
257
+ uses: actions/upload-artifact@v4
258
+ with:
259
+ name: dist
260
+ path: dist/
.github/workflows/simple-test.yml ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Simple Test Run
2
+
3
+ on:
4
+ push:
5
+ branches: [ dev ]
6
+ pull_request:
7
+ branches: [ dev ]
8
+
9
+ permissions:
10
+ contents: read
11
+ actions: read
12
+
13
+ jobs:
14
+ test:
15
+ runs-on: ubuntu-latest
16
+
17
+ steps:
18
+ - uses: actions/checkout@v4
19
+
20
+ - name: Set up Python 3.12
21
+ uses: actions/setup-python@v4
22
+ with:
23
+ python-version: "3.12"
24
+
25
+ - name: Install system dependencies
26
+ run: |
27
+ sudo apt-get update
28
+ sudo apt-get install -y \
29
+ tesseract-ocr \
30
+ tesseract-ocr-eng \
31
+ poppler-utils \
32
+ libgl1-mesa-dri \
33
+ libglib2.0-0
34
+
35
+ - name: Install Python dependencies
36
+ run: |
37
+ python -m pip install --upgrade pip
38
+ pip install -r requirements_lightweight.txt
39
+ pip install pytest pytest-cov reportlab pillow
40
+
41
+ - name: Download spaCy model
42
+ run: |
43
+ python -m spacy download en_core_web_lg
44
+
45
+ - name: Setup test data
46
+ run: |
47
+ python .github/scripts/setup_test_data.py
48
+ echo "Setup script completed. Checking results:"
49
+ ls -la example_data/ || echo "example_data directory not found"
50
+
51
+ - name: Verify test data files
52
+ run: |
53
+ echo "Checking if critical test files exist:"
54
+ ls -la example_data/
55
+ echo "Checking for specific PDF files:"
56
+ ls -la example_data/*.pdf || echo "No PDF files found"
57
+ echo "Checking file sizes:"
58
+ find example_data -name "*.pdf" -exec ls -lh {} \;
59
+
60
+ - name: Run CLI tests
61
+ run: |
62
+ cd test
63
+ python test.py
64
+
65
+ - name: Run tests with pytest
66
+ run: |
67
+ pytest test/test.py -v --tb=short
.github/workflows/sync_to_hf.yml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Sync to Hugging Face hub
2
+ on:
3
+ push:
4
+ branches: [main]
5
+
6
+ permissions:
7
+ contents: read
8
+
9
+ jobs:
10
+ sync-to-hub:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v4
14
+ with:
15
+ fetch-depth: 1 # Only get the latest state
16
+ lfs: true # Download actual LFS files so they can be pushed
17
+
18
+ - name: Install Git LFS
19
+ run: git lfs install
20
+
21
+ - name: Recreate repo history (single-commit force push)
22
+ run: |
23
+ # 1. Capture the message BEFORE we delete the .git folder
24
+ COMMIT_MSG=$(git log -1 --pretty=%B)
25
+ echo "Syncing commit message: $COMMIT_MSG"
26
+
27
+ # 2. DELETE the .git folder.
28
+ # This turns the repo into a standard folder of files.
29
+ rm -rf .git
30
+
31
+ # 3. Re-initialize a brand new git repo
32
+ git init -b main
33
+ git config --global user.name "$HF_USERNAME"
34
+ git config --global user.email "$HF_EMAIL"
35
+
36
+ # 4. Re-install LFS (needs to be done after git init)
37
+ git lfs install
38
+
39
+ # 5. Add the remote
40
+ git remote add hf https://$HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/$HF_USERNAME/$HF_REPO_ID
41
+
42
+ # 6. Add all files
43
+ # Since this is a fresh init, Git sees EVERY file as "New"
44
+ git add .
45
+
46
+ # 7. Commit and Force Push
47
+ git commit -m "Sync: $COMMIT_MSG"
48
+ git push --force hf main
49
+ env:
50
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
51
+ HF_USERNAME: ${{ secrets.HF_USERNAME }}
52
+ HF_EMAIL: ${{ secrets.HF_EMAIL }}
53
+ HF_REPO_ID: ${{ secrets.HF_REPO_ID }}
.github/workflows/sync_to_hf_zero_gpu.yml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Sync to Hugging Face hub Zero GPU
2
+ on:
3
+ push:
4
+ branches: [dev]
5
+
6
+ permissions:
7
+ contents: read
8
+
9
+ jobs:
10
+ sync-to-hub-zero-gpu:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v4
14
+ with:
15
+ fetch-depth: 1 # Only get the latest state
16
+ lfs: true # Download actual LFS files so they can be pushed
17
+
18
+ - name: Install Git LFS
19
+ run: git lfs install
20
+
21
+ - name: Recreate repo history (single-commit force push)
22
+ run: |
23
+ # 1. Capture the message BEFORE we delete the .git folder
24
+ COMMIT_MSG=$(git log -1 --pretty=%B)
25
+ echo "Syncing commit message: $COMMIT_MSG"
26
+
27
+ # 2. DELETE the .git folder.
28
+ # This turns the repo into a standard folder of files.
29
+ rm -rf .git
30
+
31
+ # 3. Re-initialize a brand new git repo
32
+ git init -b main
33
+ git config --global user.name "$HF_USERNAME"
34
+ git config --global user.email "$HF_EMAIL"
35
+
36
+ # 4. Re-install LFS (needs to be done after git init)
37
+ git lfs install
38
+
39
+ # 5. Add the remote
40
+ git remote add hf https://$HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/$HF_USERNAME/$HF_REPO_ID_ZERO_GPU
41
+
42
+ # 6. Add all files
43
+ # Since this is a fresh init, Git sees EVERY file as "New"
44
+ git add .
45
+
46
+ # 7. Commit and Force Push
47
+ git commit -m "Sync: $COMMIT_MSG"
48
+ git push --force hf main
49
+ env:
50
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
51
+ HF_USERNAME: ${{ secrets.HF_USERNAME }}
52
+ HF_EMAIL: ${{ secrets.HF_EMAIL }}
53
+ HF_REPO_ID_ZERO_GPU: ${{ secrets.HF_REPO_ID_ZERO_GPU }}
.gitignore ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.url
2
+ *.ipynb
3
+ *.pyc
4
+ examples/*
5
+ processing/*
6
+ input/*
7
+ output/*
8
+ tools/__pycache__/*
9
+ old_code/*
10
+ tesseract/*
11
+ poppler/*
12
+ build/*
13
+ dist/*
14
+ build_deps/*
15
+ logs/*
16
+ usage/*
17
+ feedback/*
18
+ config/*
19
+ user_guide/*
20
+ cdk/config/*
21
+ cdk/cdk.out/*
22
+ cdk/archive/*
23
+ tld/*
24
+ tmp/*
25
+ docs/*
26
+ cdk.out/*
27
+ cdk.json
28
+ cdk.context.json
29
+ .quarto/*
30
+ /.quarto/
31
+ /_site/
32
+ test/config/*
33
+ test/feedback/*
34
+ test/input/*
35
+ test/logs/*
36
+ test/output/*
37
+ test/tmp/*
38
+ test/usage/*
39
+ .ruff_cache/*
40
+ model_cache/*
41
+ sanitized_file/*
DocRedactApp.spec ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- mode: python ; coding: utf-8 -*-
2
+ from PyInstaller.utils.hooks import collect_data_files
3
+ from PyInstaller.utils.hooks import collect_all
4
+
5
+ datas = [('tesseract/', 'tesseract/'), ('poppler/poppler-24.02.0/', 'poppler/poppler-24.02.0/')]
6
+ binaries = []
7
+ hiddenimports = ['gradio_image_annotation', 'pyarrow.vendored.version', 'pydicom.encoders', 'safehttpx', 'presidio_analyzer', 'presidio_anonymizer', 'presidio_image_redactor']
8
+ datas += collect_data_files('gradio_client')
9
+ datas += collect_data_files('gradio')
10
+ datas += collect_data_files('gradio_image_annotation')
11
+ tmp_ret = collect_all('gradio_image_annotation')
12
+ datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
13
+ tmp_ret = collect_all('safehttpx')
14
+ datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
15
+ tmp_ret = collect_all('presidio_analyzer')
16
+ datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
17
+ tmp_ret = collect_all('presidio_anonymizer')
18
+ datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
19
+ tmp_ret = collect_all('presidio_image_redactor')
20
+ datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
21
+
22
+
23
+ a = Analysis(
24
+ ['app.py'],
25
+ pathex=[],
26
+ binaries=binaries,
27
+ datas=datas,
28
+ hiddenimports=hiddenimports,
29
+ hookspath=['build_deps'],
30
+ hooksconfig={},
31
+ runtime_hooks=[],
32
+ excludes=[],
33
+ noarchive=False,
34
+ optimize=0,
35
+ module_collection_mode={
36
+ 'gradio': 'py', # Collect gradio package as source .py files
37
+ }
38
+ )
39
+ pyz = PYZ(a.pure)
40
+
41
+ exe = EXE(
42
+ pyz,
43
+ a.scripts,
44
+ [],
45
+ exclude_binaries=True,
46
+ name='DocRedactApp',
47
+ debug=False,
48
+ bootloader_ignore_signals=False,
49
+ strip=False,
50
+ upx=True,
51
+ console=True,
52
+ disable_windowed_traceback=False,
53
+ argv_emulation=False,
54
+ target_arch=None,
55
+ codesign_identity=None,
56
+ entitlements_file=None,
57
+ )
58
+ coll = COLLECT(
59
+ exe,
60
+ a.binaries,
61
+ a.datas,
62
+ strip=False,
63
+ upx=True,
64
+ upx_exclude=[],
65
+ name='DocRedactApp',
66
+ )
Dockerfile ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Stage 1: Build dependencies and download models
2
+ FROM public.ecr.aws/docker/library/python:3.12.11-slim-trixie AS builder
3
+
4
+ # Install system dependencies
5
+ RUN apt-get update \
6
+ && apt-get upgrade -y \
7
+ && apt-get install -y --no-install-recommends \
8
+ g++ \
9
+ make \
10
+ cmake \
11
+ unzip \
12
+ libcurl4-openssl-dev \
13
+ git \
14
+ && apt-get clean \
15
+ && rm -rf /var/lib/apt/lists/*
16
+
17
+ WORKDIR /src
18
+
19
+ COPY requirements_lightweight.txt .
20
+
21
+ RUN pip install --verbose --no-cache-dir --target=/install -r requirements_lightweight.txt && rm requirements_lightweight.txt
22
+
23
+ # Optionally install PaddleOCR if the INSTALL_PADDLEOCR environment variable is set to True.
24
+ ARG INSTALL_PADDLEOCR=False
25
+ ENV INSTALL_PADDLEOCR=${INSTALL_PADDLEOCR}
26
+
27
+ RUN if [ "$INSTALL_PADDLEOCR" = "True" ]; then \
28
+ pip install --verbose --no-cache-dir --target=/install paddlepaddle==3.2.1 --index-url https://www.paddlepaddle.org.cn/packages/stable/cpu/; \
29
+ pip install --verbose --no-cache-dir --target=/install paddleocr==3.3.0; \
30
+ fi
31
+
32
+ ARG INSTALL_VLM=False
33
+ ENV INSTALL_VLM=${INSTALL_VLM}
34
+
35
+ # Optionally install VLM if the INSTALL_VLM environment variable is set to True. Use index-url https://download.pytorch.org/whl/cu129 for GPU version of PyTorch.
36
+ RUN if [ "$INSTALL_VLM" = "True" ]; then \
37
+ pip install --verbose --no-cache-dir --target=/install torch==2.8.0 --index-url https://download.pytorch.org/whl/cpu; \
38
+ pip install --verbose --no-cache-dir --target=/install torchvision --index-url https://download.pytorch.org/whl/cpu; \
39
+ pip install --verbose --no-cache-dir --target=/install "transformers<=4.57.2" "accelerate<=1.11.0" "bitsandbytes<=0.48.1" sentencepiece==0.2.1; \
40
+ fi
41
+
42
+ # ===================================================================
43
+ # Stage 2: A common 'base' for both Lambda and Gradio
44
+ # ===================================================================
45
+ FROM public.ecr.aws/docker/library/python:3.12.11-slim-trixie AS base
46
+
47
+ # Set build-time and runtime environment variable for whether to run in Gradio mode or Lambda mode
48
+ ARG APP_MODE=gradio
49
+ ENV APP_MODE=${APP_MODE}
50
+
51
+ # Set build-time and runtime environment variable for whether to run in FastAPI mode
52
+ ARG RUN_FASTAPI=False
53
+ ENV RUN_FASTAPI=${RUN_FASTAPI}
54
+
55
+ # Install runtime system dependencies
56
+ RUN apt-get update && apt-get install -y --no-install-recommends \
57
+ tesseract-ocr poppler-utils libgl1 libglib2.0-0 \
58
+ && apt-get clean && rm -rf /var/lib/apt/lists/*
59
+
60
+ ENV APP_HOME=/home/user
61
+
62
+ # Set env variables for Gradio & other apps
63
+ ENV GRADIO_TEMP_DIR=/tmp/gradio_tmp/ \
64
+ TLDEXTRACT_CACHE=/tmp/tld/ \
65
+ MPLCONFIGDIR=/tmp/matplotlib_cache/ \
66
+ GRADIO_OUTPUT_FOLDER=$APP_HOME/app/output/ \
67
+ GRADIO_INPUT_FOLDER=$APP_HOME/app/input/ \
68
+ FEEDBACK_LOGS_FOLDER=$APP_HOME/app/feedback/ \
69
+ ACCESS_LOGS_FOLDER=$APP_HOME/app/logs/ \
70
+ USAGE_LOGS_FOLDER=$APP_HOME/app/usage/ \
71
+ CONFIG_FOLDER=$APP_HOME/app/config/ \
72
+ XDG_CACHE_HOME=/tmp/xdg_cache/user_1000 \
73
+ TESSERACT_DATA_FOLDER=/usr/share/tessdata \
74
+ GRADIO_SERVER_NAME=0.0.0.0 \
75
+ GRADIO_SERVER_PORT=7860 \
76
+ PATH=$APP_HOME/.local/bin:$PATH \
77
+ PYTHONPATH=$APP_HOME/app \
78
+ PYTHONUNBUFFERED=1 \
79
+ PYTHONDONTWRITEBYTECODE=1 \
80
+ GRADIO_ALLOW_FLAGGING=never \
81
+ GRADIO_NUM_PORTS=1 \
82
+ GRADIO_ANALYTICS_ENABLED=False \
83
+ DEFAULT_CONCURRENCY_LIMIT=3
84
+
85
+ # Copy Python packages from the builder stage
86
+ COPY --from=builder /install /usr/local/lib/python3.12/site-packages/
87
+ COPY --from=builder /install/bin /usr/local/bin/
88
+
89
+ # Copy your application code and entrypoint
90
+ COPY . ${APP_HOME}/app
91
+ COPY entrypoint.sh ${APP_HOME}/app/entrypoint.sh
92
+ # Fix line endings and set execute permissions
93
+ RUN sed -i 's/\r$//' ${APP_HOME}/app/entrypoint.sh \
94
+ && chmod +x ${APP_HOME}/app/entrypoint.sh
95
+
96
+ WORKDIR ${APP_HOME}/app
97
+
98
+ # ===================================================================
99
+ # FINAL Stage 3: The Lambda Image (runs as root for simplicity)
100
+ # ===================================================================
101
+ FROM base AS lambda
102
+ # Set runtime ENV for Lambda mode
103
+ ENV APP_MODE=lambda
104
+ ENTRYPOINT ["/home/user/app/entrypoint.sh"]
105
+ CMD ["lambda_entrypoint.lambda_handler"]
106
+
107
+ # ===================================================================
108
+ # FINAL Stage 4: The Gradio Image (runs as a secure, non-root user)
109
+ # ===================================================================
110
+ FROM base AS gradio
111
+ # Set runtime ENV for Gradio mode
112
+ ENV APP_MODE=gradio
113
+
114
+ # Create non-root user
115
+ RUN useradd -m -u 1000 user
116
+
117
+ # Create the base application directory and set its ownership
118
+ RUN mkdir -p ${APP_HOME}/app && chown user:user ${APP_HOME}/app
119
+
120
+ # Create required sub-folders within the app directory and set their permissions
121
+ # This ensures these specific directories are owned by 'user'
122
+ RUN mkdir -p \
123
+ ${APP_HOME}/app/output \
124
+ ${APP_HOME}/app/input \
125
+ ${APP_HOME}/app/logs \
126
+ ${APP_HOME}/app/usage \
127
+ ${APP_HOME}/app/feedback \
128
+ ${APP_HOME}/app/config \
129
+ && chown user:user \
130
+ ${APP_HOME}/app/output \
131
+ ${APP_HOME}/app/input \
132
+ ${APP_HOME}/app/logs \
133
+ ${APP_HOME}/app/usage \
134
+ ${APP_HOME}/app/feedback \
135
+ ${APP_HOME}/app/config \
136
+ && chmod 755 \
137
+ ${APP_HOME}/app/output \
138
+ ${APP_HOME}/app/input \
139
+ ${APP_HOME}/app/logs \
140
+ ${APP_HOME}/app/usage \
141
+ ${APP_HOME}/app/feedback \
142
+ ${APP_HOME}/app/config
143
+
144
+ # Now handle the /tmp and /var/tmp directories and their subdirectories, paddle, spacy, tessdata
145
+ RUN mkdir -p /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache /tmp /var/tmp ${XDG_CACHE_HOME} \
146
+ && chown user:user /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache ${XDG_CACHE_HOME} \
147
+ && chmod 1777 /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache \
148
+ && chmod 700 ${XDG_CACHE_HOME} \
149
+ && mkdir -p ${APP_HOME}/.paddlex \
150
+ && chown user:user ${APP_HOME}/.paddlex \
151
+ && chmod 755 ${APP_HOME}/.paddlex \
152
+ && mkdir -p ${APP_HOME}/.local/share/spacy/data \
153
+ && chown user:user ${APP_HOME}/.local/share/spacy/data \
154
+ && chmod 755 ${APP_HOME}/.local/share/spacy/data \
155
+ && mkdir -p /usr/share/tessdata \
156
+ && chown user:user /usr/share/tessdata \
157
+ && chmod 755 /usr/share/tessdata
158
+
159
+ # Fix apply user ownership to all files in the home directory
160
+ RUN chown -R user:user /home/user
161
+
162
+ # Set permissions for Python executable
163
+ RUN chmod 755 /usr/local/bin/python
164
+
165
+ # Declare volumes (NOTE: runtime mounts will override permissions — handle with care)
166
+ VOLUME ["/tmp/matplotlib_cache"]
167
+ VOLUME ["/tmp/gradio_tmp"]
168
+ VOLUME ["/tmp/tld"]
169
+ VOLUME ["/home/user/app/output"]
170
+ VOLUME ["/home/user/app/input"]
171
+ VOLUME ["/home/user/app/logs"]
172
+ VOLUME ["/home/user/app/usage"]
173
+ VOLUME ["/home/user/app/feedback"]
174
+ VOLUME ["/home/user/app/config"]
175
+ VOLUME ["/home/user/.paddlex"]
176
+ VOLUME ["/home/user/.local/share/spacy/data"]
177
+ VOLUME ["/usr/share/tessdata"]
178
+ VOLUME ["/tmp"]
179
+ VOLUME ["/var/tmp"]
180
+
181
+ USER user
182
+
183
+ EXPOSE $GRADIO_SERVER_PORT
184
+
185
+ ENTRYPOINT ["/home/user/app/entrypoint.sh"]
186
+ CMD ["python", "app.py"]
README.md ADDED
@@ -0,0 +1,1261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Document redaction
3
+ emoji: 📝
4
+ colorFrom: blue
5
+ colorTo: yellow
6
+ sdk: docker
7
+ app_file: app.py
8
+ pinned: true
9
+ license: agpl-3.0
10
+ short_description: OCR / redact PDF documents and tabular data
11
+ ---
12
+ # Document redaction
13
+
14
+ version: 1.6.2
15
+
16
+ Redact personally identifiable information (PII) from documents (pdf, png, jpg), Word files (docx), or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a full walkthrough of all the features in the app.
17
+
18
+ To extract text from documents, the 'Local' options are PikePDF for PDFs with selectable text, and OCR with Tesseract. Use AWS Textract to extract more complex elements e.g. handwriting, signatures, or unclear text. PaddleOCR and VLM support is also provided (see the installation instructions below).
19
+
20
+ For PII identification, 'Local' (based on spaCy) gives good results if you are looking for common names or terms, or a custom list of terms to redact (see Redaction settings). AWS Comprehend gives better results at a small cost.
21
+
22
+ Additional options on the 'Redaction settings' tab include the type of information to redact (e.g. people, places), custom terms to include/exclude from redaction, fuzzy matching, language settings, and whole page redaction. After redaction is complete, you can view and modify suggested redactions on the 'Review redactions' tab to quickly create a final redacted document.
23
+
24
+ NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs.
25
+
26
+ ---
27
+
28
+ ## 🚀 Quick Start - Installation and first run
29
+
30
+ Follow these instructions to get the document redaction application running on your local machine.
31
+
32
+ ### 1. Prerequisites: System Dependencies
33
+
34
+ This application relies on two external tools for OCR (Tesseract) and PDF processing (Poppler). Please install them on your system before proceeding.
35
+
36
+ ---
37
+
38
+
39
+ #### **On Windows**
40
+
41
+ Installation on Windows requires downloading installers and adding the programs to your system's PATH.
42
+
43
+ 1. **Install Tesseract OCR:**
44
+ * Download the installer from the official Tesseract at [UB Mannheim page](https://github.com/UB-Mannheim/tesseract/wiki) (e.g., `tesseract-ocr-w64-setup-v5.X.X...exe`).
45
+ * Run the installer.
46
+ * **IMPORTANT:** During installation, ensure you select the option to "Add Tesseract to system PATH for all users" or a similar option. This is crucial for the application to find the Tesseract executable.
47
+
48
+
49
+ 2. **Install Poppler:**
50
+ * Download the latest Poppler binary for Windows. A common source is the [Poppler for Windows](https://github.com/oschwartz10612/poppler-windows) GitHub releases page. Download the `.zip` file (e.g., `poppler-25.07.0-win.zip`).
51
+ * Extract the contents of the zip file to a permanent location on your computer, for example, `C:\Program Files\poppler\`.
52
+ * You must add the `bin` folder from your Poppler installation to your system's PATH environment variable.
53
+ * Search for "Edit the system environment variables" in the Windows Start Menu and open it.
54
+ * Click the "Environment Variables..." button.
55
+ * In the "System variables" section, find and select the `Path` variable, then click "Edit...".
56
+ * Click "New" and add the full path to the `bin` directory inside your Poppler folder (e.g., `C:\Program Files\poppler\poppler-24.02.0\bin`).
57
+ * Click OK on all windows to save the changes.
58
+
59
+ To verify, open a new Command Prompt and run `tesseract --version` and `pdftoppm -v`. If they both return version information, you have successfully installed the prerequisites.
60
+
61
+ ---
62
+
63
+ #### **On Linux (Debian/Ubuntu)**
64
+
65
+ Open your terminal and run the following command to install Tesseract and Poppler:
66
+
67
+ ```bash
68
+ sudo apt-get update && sudo apt-get install -y tesseract-ocr poppler-utils
69
+ ```
70
+
71
+ #### **On Linux (Fedora/CentOS/RHEL)**
72
+
73
+ Open your terminal and use the `dnf` or `yum` package manager:
74
+
75
+ ```bash
76
+ sudo dnf install -y tesseract poppler-utils
77
+ ```
78
+ ---
79
+
80
+
81
+ ### 2. Installation: Code and Python Packages
82
+
83
+ Once the system prerequisites are installed, you can set up the Python environment.
84
+
85
+ #### Step 1: Clone the Repository
86
+
87
+ Open your terminal or Git Bash and clone this repository:
88
+ ```bash
89
+ git clone https://github.com/seanpedrick-case/doc_redaction.git
90
+ cd doc_redaction
91
+ ```
92
+
93
+ #### Step 2: Create and Activate a Virtual Environment (Recommended)
94
+
95
+ It is highly recommended to use a virtual environment to isolate project dependencies and avoid conflicts with other Python projects.
96
+
97
+ ```bash
98
+ # Create the virtual environment
99
+ python -m venv venv
100
+
101
+ # Activate it
102
+ # On Windows:
103
+ .\venv\Scripts\activate
104
+
105
+ # On macOS/Linux:
106
+ source venv/bin/activate
107
+ ```
108
+
109
+ #### Step 3: Install Python Dependencies
110
+
111
+ ##### Lightweight version (without PaddleOCR and VLM support)
112
+
113
+ This project uses `pyproject.toml` to manage dependencies. You can install everything with a single pip command. This process will also download the required Spacy models and other packages directly from their URLs.
114
+
115
+ ```bash
116
+ pip install .
117
+ ```
118
+
119
+ Alternatively, you can install from the `requirements_lightweight.txt` file:
120
+ ```bash
121
+ pip install -r requirements_lightweight.txt
122
+ ```
123
+
124
+ ##### Full version (with Paddle and VLM support)
125
+
126
+ Run the following command to install the additional dependencies:
127
+
128
+ ```bash
129
+ pip install .[paddle,vlm]
130
+ ```
131
+
132
+ Alternatively, you can use the full `requirements.txt` file, that contains references to the PaddleOCR and related Torch/transformers dependencies (for cuda 12.9):
133
+ ```bash
134
+ pip install -r requirements.txt
135
+ ```
136
+
137
+ Note that the versions of both PaddleOCR and Torch installed by default are the CPU-only versions. If you want to install the equivalent GPU versions, you will need to run the following commands:
138
+ ```bash
139
+ pip install paddlepaddle-gpu==3.2.1 --index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/
140
+ ```
141
+
142
+ **Note:** It is difficult to get paddlepaddle gpu working in an environment alongside torch. You may well need to reinstall the cpu version to ensure compatibility, and run paddlepaddle-gpu in a separate environment without torch installed. If you get errors related to .dll files following paddle gpu install, you may need to install the latest c++ redistributables. For Windows, you can find them [here](https://learn.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist?view=msvc-170)
143
+
144
+ ```bash
145
+ pip install torch==2.8.0 --index-url https://download.pytorch.org/whl/cu129
146
+ pip install torchvision --index-url https://download.pytorch.org/whl/cu129
147
+ ```
148
+
149
+ ### 3. Run the Application
150
+
151
+ With all dependencies installed, you can now start the Gradio application.
152
+
153
+ ```bash
154
+ python app.py
155
+ ```
156
+
157
+ After running the command, the application will start, and you will see a local URL in your terminal (usually `http://127.0.0.1:7860`).
158
+
159
+ Open this URL in your web browser to use the document redaction tool
160
+
161
+ #### Command line interface
162
+
163
+ If instead you want to run redactions or other app functions in CLI mode, run the following for instructions:
164
+
165
+ ```bash
166
+ python cli_redact.py --help
167
+ ```
168
+
169
+ ---
170
+
171
+
172
+ ### 4. ⚙️ Configuration (Optional)
173
+
174
+ You can customise the application's behavior by creating a configuration file. This allows you to change settings without modifying the source code, such as enabling AWS features, changing logging behavior, or pointing to local Tesseract/Poppler installations. A full overview of all the potential settings you can modify in the app_config.env file can be seen in tools/config.py, with explanation on the documentation website for [the github repo](https://seanpedrick-case.github.io/doc_redaction/)
175
+
176
+ To get started:
177
+ 1. Locate the `example_config.env` file in the root of the project.
178
+ 2. Create a new file named `app_config.env` inside the `config/` directory (i.e., `config/app_config.env`).
179
+ 3. Copy the contents from `example_config.env` into your new `config/app_config.env` file.
180
+ 4. Modify the values in `config/app_config.env` to suit your needs. The application will automatically load these settings on startup.
181
+
182
+ If you do not create this file, the application will run with default settings.
183
+
184
+ #### Configuration Breakdown
185
+
186
+ Here is an overview of the most important settings, separated by whether they are for local use or require AWS.
187
+
188
+ ---
189
+
190
+ #### **Local & General Settings (No AWS Required)**
191
+
192
+ These settings are useful for all users, regardless of whether you are using AWS.
193
+
194
+ * `TESSERACT_FOLDER` / `POPPLER_FOLDER`
195
+ * Use these if you installed Tesseract or Poppler to a custom location on **Windows** and did not add them to the system PATH.
196
+ * Provide the path to the respective installation folders (for Poppler, point to the `bin` sub-directory).
197
+ * **Examples:** `POPPLER_FOLDER=C:/Program Files/poppler-24.02.0/bin/` `TESSERACT_FOLDER=tesseract/`
198
+
199
+ * `SHOW_LANGUAGE_SELECTION=True`
200
+ * Set to `True` to display a language selection dropdown in the UI for OCR processing.
201
+
202
+ * `CHOSEN_LOCAL_OCR_MODEL=tesseract`
203
+ * Choose the backend for local OCR. Options are `tesseract`, `paddle`, or `hybrid`. "tesseract" is the default, and is recommended. "hybrid" is a combination of the two - a first pass over the document will be done with Tesseract, and then a second pass will be done with PaddleOCR on words with low confidence. "paddle" will only return whole-line text extraction, and so will only work for OCR, not redaction.
204
+
205
+ * `SESSION_OUTPUT_FOLDER=False`
206
+ * If `True`, redacted files will be saved in unique subfolders within the `output/` directory for each session.
207
+
208
+ * `DISPLAY_FILE_NAMES_IN_LOGS=False`
209
+ * For privacy, file names are not recorded in usage logs by default. Set to `True` to include them.
210
+
211
+ ---
212
+
213
+ #### **AWS-Specific Settings**
214
+
215
+ These settings are only relevant if you intend to use AWS services like Textract for OCR and Comprehend for PII detection.
216
+
217
+ * `RUN_AWS_FUNCTIONS=True`
218
+ * **This is the master switch.** You must set this to `True` to enable any AWS functionality. If it is `False`, all other AWS settings will be ignored.
219
+
220
+ * **UI Options:**
221
+ * `SHOW_AWS_TEXT_EXTRACTION_OPTIONS=True`: Adds "AWS Textract" as an option in the text extraction dropdown.
222
+ * `SHOW_AWS_PII_DETECTION_OPTIONS=True`: Adds "AWS Comprehend" as an option in the PII detection dropdown.
223
+
224
+ * **Core AWS Configuration:**
225
+ * `AWS_REGION=example-region`: Set your AWS region (e.g., `us-east-1`).
226
+ * `DOCUMENT_REDACTION_BUCKET=example-bucket`: The name of the S3 bucket the application will use for temporary file storage and processing.
227
+
228
+ * **AWS Logging:**
229
+ * `SAVE_LOGS_TO_DYNAMODB=True`: If enabled, usage and feedback logs will be saved to DynamoDB tables.
230
+ * `ACCESS_LOG_DYNAMODB_TABLE_NAME`, `USAGE_LOG_DYNAMODB_TABLE_NAME`, etc.: Specify the names of your DynamoDB tables for logging.
231
+
232
+ * **Advanced AWS Textract Features:**
233
+ * `SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS=True`: Enables UI components for large-scale, asynchronous document processing via Textract.
234
+ * `TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET=example-bucket-output`: A separate S3 bucket for the final output of asynchronous Textract jobs.
235
+ * `LOAD_PREVIOUS_TEXTRACT_JOBS_S3=True`: If enabled, the app will try to load the status of previously submitted asynchronous jobs from S3.
236
+
237
+ * **Cost Tracking (for internal accounting):**
238
+ * `SHOW_COSTS=True`: Displays an estimated cost for AWS operations. Can be enabled even if AWS functions are off.
239
+ * `GET_COST_CODES=True`: Enables a dropdown for users to select a cost code before running a job.
240
+ * `COST_CODES_PATH=config/cost_codes.csv`: The local path to a CSV file containing your cost codes.
241
+ * `ENFORCE_COST_CODES=True`: Makes selecting a cost code mandatory before starting a redaction.
242
+
243
+ Now that you have the app installed, what follows is a guide on how to use it for basic and advanced redaction.
244
+
245
+ # User guide
246
+
247
+ ## Table of contents
248
+
249
+ ### Getting Started
250
+ - [Built-in example data](#built-in-example-data)
251
+ - [Basic redaction](#basic-redaction)
252
+ - [Customising redaction options](#customising-redaction-options)
253
+ - [Custom allow, deny, and page redaction lists](#custom-allow-deny-and-page-redaction-lists)
254
+ - [Allow list example](#allow-list-example)
255
+ - [Deny list example](#deny-list-example)
256
+ - [Full page redaction list example](#full-page-redaction-list-example)
257
+ - [Redacting additional types of personal information](#redacting-additional-types-of-personal-information)
258
+ - [Redacting only specific pages](#redacting-only-specific-pages)
259
+ - [Handwriting and signature redaction](#handwriting-and-signature-redaction)
260
+ - [Reviewing and modifying suggested redactions](#reviewing-and-modifying-suggested-redactions)
261
+ - [Redacting Word, tabular data files (CSV/XLSX) or copy and pasted text](#redacting-word-tabular-data-files-xlsxcsv-or-copy-and-pasted-text)
262
+ - [Identifying and redacting duplicate pages](#identifying-and-redacting-duplicate-pages)
263
+
264
+ ### Advanced user guide
265
+ - [Fuzzy search and redaction](#fuzzy-search-and-redaction)
266
+ - [Export redactions to and import from Adobe Acrobat](#export-to-and-import-from-adobe)
267
+ - [Using _for_review.pdf files with Adobe Acrobat](#using-_for_reviewpdf-files-with-adobe-acrobat)
268
+ - [Exporting to Adobe Acrobat](#exporting-to-adobe-acrobat)
269
+ - [Importing from Adobe Acrobat](#importing-from-adobe-acrobat)
270
+ - [Using the AWS Textract document API](#using-the-aws-textract-document-api)
271
+ - [Using AWS Textract and Comprehend when not running in an AWS environment](#using-aws-textract-and-comprehend-when-not-running-in-an-aws-environment)
272
+ - [Modifying existing redaction review files](#modifying-existing-redaction-review-files)
273
+ - [Merging redaction review files](#merging-redaction-review-files)
274
+
275
+ ### Features for expert users/system administrators
276
+ - [Advanced OCR options (Hybrid OCR)](#advanced-ocr-options-hybrid-ocr)
277
+ - [Command Line Interface (CLI)](#command-line-interface-cli)
278
+
279
+ ## Built-in example data
280
+
281
+ The app now includes built-in example files that you can use to quickly test different features. These examples are automatically loaded and can be accessed directly from the interface without needing to download files separately.
282
+
283
+ ### Using built-in examples
284
+
285
+ **For PDF/image redaction:** On the 'Redact PDFs/images' tab, you'll see a section titled "Try an example - Click on an example below and then the 'Extract text and redact document' button". Simply click on any of the available examples to load them with pre-configured settings:
286
+
287
+ - **PDF with selectable text redaction** - Uses local text extraction with standard PII detection
288
+ - **Image redaction with local OCR** - Processes an image file using OCR
289
+ - **PDF redaction with custom entities** - Demonstrates custom entity selection (Titles, Person, Dates)
290
+ - **PDF redaction with AWS services and signature detection** - Shows AWS Textract with signature extraction (if AWS is enabled)
291
+ - **PDF redaction with custom deny list and whole page redaction** - Demonstrates advanced redaction features
292
+
293
+ Once you have clicked on an example, you can click the 'Extract text and redact document' button to load the example into the app and redact it.
294
+
295
+ **For tabular data:** On the 'Word or Excel/csv files' tab, you'll find examples for both redaction and duplicate detection:
296
+
297
+ - **CSV file redaction** - Shows how to redact specific columns in tabular data
298
+ - **Word document redaction** - Demonstrates Word document processing
299
+ - **Excel file duplicate detection** - Shows how to find duplicate rows in spreadsheet data
300
+
301
+ Once you have clicked on an example, you can click the 'Redact text/data files' button to load the example into the app and redact it. For the duplicate detection example, you can click the 'Find duplicate cells/rows' button to load the example into the app and find duplicates.
302
+
303
+ **For duplicate page detection:** On the 'Identify duplicate pages' tab, you'll find examples for finding duplicate content in documents:
304
+
305
+ - **Find duplicate pages of text in document OCR outputs** - Uses page-level analysis with a similarity threshold of 0.95 and minimum word count of 10
306
+ - **Find duplicate text lines in document OCR outputs** - Uses line-level analysis with a similarity threshold of 0.95 and minimum word count of 3
307
+
308
+ Once you have clicked on an example, you can click the 'Identify duplicate pages/subdocuments' button to load the example into the app and find duplicate content.
309
+
310
+ ### External example files (optional)
311
+
312
+ If you prefer to use your own example files or want to follow along with specific tutorials, you can still download these external example files:
313
+
314
+ - [Example of files sent to a professor before applying](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/example_of_emails_sent_to_a_professor_before_applying.pdf)
315
+ - [Example complaint letter (jpg)](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/example_complaint_letter.jpg)
316
+ - [Partnership Agreement Toolkit (for signatures and more advanced usage)](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/Partnership-Agreement-Toolkit_0_0.pdf)
317
+ - [Dummy case note data](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/combined_case_notes.csv)
318
+
319
+ ## Basic redaction
320
+
321
+ The document redaction app can detect personally-identifiable information (PII) in documents. Documents can be redacted directly, or suggested redactions can be reviewed and modified using a graphical user interface. Basic document redaction can be performed quickly using the default options.
322
+
323
+ Download the example PDFs above to your computer. Open up the redaction app with the link provided by email.
324
+
325
+ ![Upload files](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/quick_start/file_upload_highlight.PNG)
326
+
327
+ ### Upload files to the app
328
+
329
+ The 'Redact PDFs/images tab' currently accepts PDFs and image files (JPG, PNG) for redaction. Click on the 'Drop files here or Click to Upload' area of the screen, and select one of the three different [example files](#example-data-files) (they should all be stored in the same folder if you want them to be redacted at the same time).
330
+
331
+ ### Text extraction
332
+
333
+ You can modify default text extraction methods by clicking on the 'Change default text extraction method...' box.
334
+
335
+ Here you can select one of the three text extraction options:
336
+ - **'Local model - selectable text'** - This will read text directly from PDFs that have selectable text to redact (using PikePDF). This is fine for most PDFs, but will find nothing if the PDF does not have selectable text, and it is not good for handwriting or signatures. If it encounters an image file, it will send it onto the second option below.
337
+ - **'Local OCR model - PDFs without selectable text'** - This option will use a simple Optical Character Recognition (OCR) model (Tesseract) to pull out text from a PDF/image that it 'sees'. This can handle most typed text in PDFs/images without selectable text, but struggles with handwriting/signatures. If you are interested in the latter, then you should use the third option if available.
338
+ - **'AWS Textract service - all PDF types'** - Only available for instances of the app running on AWS. AWS Textract is a service that performs OCR on documents within their secure service. This is a more advanced version of OCR compared to the local option, and carries a (relatively small) cost. Textract excels in complex documents based on images, or documents that contain a lot of handwriting and signatures.
339
+
340
+ ### Enable AWS Textract signature extraction
341
+ If you chose the AWS Textract service above, you can choose if you want handwriting and/or signatures redacted by default. Choosing signatures here will have a cost implication, as identifying signatures will cost ~£2.66 ($3.50) per 1,000 pages vs ~£1.14 ($1.50) per 1,000 pages without signature detection.
342
+
343
+ ![AWS Textract handwriting and signature options](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/quick_start/textract_handwriting_signatures.PNG)
344
+
345
+ **NOTE:** it is also possible to enable form extraction, layout extraction, and table extraction with AWS Textract. This is not enabled by default, but it is possible for your system admin to enable this feature in the config file.
346
+
347
+ ### PII redaction method
348
+
349
+ If you are running with the AWS service enabled, here you will also have a choice for PII redaction method:
350
+ - **'Only extract text - (no redaction)'** - If you are only interested in getting the text out of the document for further processing (e.g. to find duplicate pages, or to review text on the Review redactions page)
351
+ - **'Local'** - This uses the spacy package to rapidly detect PII in extracted text. This method is often sufficient if you are just interested in redacting specific terms defined in a custom list.
352
+ - **'AWS Comprehend'** - This method calls an AWS service to provide more accurate identification of PII in extracted text.
353
+
354
+ ### Optional - costs and time estimation
355
+ If the option is enabled (by your system admin, in the config file), you will see a cost and time estimate for the redaction process. 'Existing Textract output file found' will be checked automatically if previous Textract text extraction files exist in the output folder, or have been [previously uploaded by the user](#aws-textract-outputs) (saving time and money for redaction).
356
+
357
+ ![Cost and time estimation](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/quick_start/costs_and_time.PNG)
358
+
359
+ ### Optional - cost code selection
360
+ If the option is enabled (by your system admin, in the config file), you may be prompted to select a cost code before continuing with the redaction task.
361
+
362
+ ![Cost code selection](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/quick_start/cost_code_selection.PNG)
363
+
364
+ The relevant cost code can be found either by: 1. Using the search bar above the data table to find relevant cost codes, then clicking on the relevant row, or 2. typing it directly into the dropdown to the right, where it should filter as you type.
365
+
366
+ ### Optional - Submit whole documents to Textract API
367
+ If this option is enabled (by your system admin, in the config file), you will have the option to submit whole documents in quick succession to the AWS Textract service to get extracted text outputs quickly (faster than using the 'Redact document' process described here). This feature is described in more detail in the [advanced user guide](#using-the-aws-textract-document-api).
368
+
369
+ ![Textract document API](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/quick_start/textract_document_api.PNG)
370
+
371
+ ### Redact the document
372
+
373
+ Click 'Redact document'. After loading in the document, the app should be able to process about 30 pages per minute (depending on the redaction methods chosen above). When ready, you should see a message saying that processing is complete, with output files appearing in the bottom right.
374
+
375
+ ### Redaction outputs
376
+
377
+ ![Redaction outputs](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/quick_start/redaction_outputs.PNG)
378
+
379
+ - **'...redacted.pdf'** files contain the original pdf with suggested redacted text deleted and replaced by a black box on top of the document.
380
+ - **'...redactions_for_review.pdf'** files contain the original PDF with redaction boxes overlaid but the original text still visible underneath. This file is designed for use in Adobe Acrobat and other PDF viewers where you can see the suggested redactions without the text being permanently removed. This is particularly useful for reviewing redactions before finalising them.
381
+ - **'...ocr_results.csv'** files contain the line-by-line text outputs from the entire document. This file can be useful for later searching through for any terms of interest in the document (e.g. using Excel or a similar program).
382
+ - **'...review_file.csv'** files are the review files that contain details and locations of all of the suggested redactions in the document. This file is key to the [review process](#reviewing-and-modifying-suggested-redactions), and should be downloaded to use later for this.
383
+
384
+ ### Additional AWS Textract / local OCR outputs
385
+
386
+ If you have used the AWS Textract option for extracting text, you may also see a '..._textract.json' file. This file contains all the relevant extracted text information that comes from the AWS Textract service. You can keep this file and upload it at a later date alongside your input document, which will enable you to skip calling AWS Textract every single time you want to do a redaction task, as follows:
387
+
388
+ ![Document upload alongside Textract](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/quick_start/document_upload_with_textract.PNG)
389
+
390
+ #### Additional outputs in the log file outputs
391
+
392
+ On the Redaction settings tab, near the bottom of the page there is a section called 'Log file outputs'. This section contains the following files:
393
+
394
+ You may see a '..._ocr_results_with_words... .json' file. This file works in the same way as the AWS Textract .json results described above, and can be uploaded alongside an input document to save time on text extraction in future in the same way.
395
+
396
+ Also you will see a 'decision_process_table.csv' file. This file contains a table of the decisions made by the app for each page of the document. This can be useful for debugging and understanding the decisions made by the app.
397
+
398
+ Additionally, if the option is enabled by your system administrator, on this tab you may see an image of the output from the OCR model used to extract the text from the document, an image ending with page number and '_visualisations.jpg'. A separate image will be created for each page of the document like the one below. This can be useful for seeing at a glance whether the text extraction process for a page was successful, and whether word-level bounding boxes are correctly positioned.
399
+
400
+ ![Text analysis output](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/example_complaint_letter_1_textract_visualisations.jpg)
401
+
402
+ ### Downloading output files from previous redaction tasks
403
+
404
+ If you are logged in via AWS Cognito and you lose your app page for some reason (e.g. from a crash, reloading), it is possible to recover your previous output files, provided the server has not been shut down since you redacted the document. If enabled, this feature can be found at the bottom of the front tab, called 'View and download all output files from this session'. If you open this and click on 'Refresh files in output folder' you should see a file directory of all files. If you click on the box next to a given file, it should appear below for you to download.
405
+
406
+ ![View all output files](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/quick_start/view_all_output_files.PNG)
407
+
408
+ ### Basic redaction summary
409
+
410
+ We have covered redacting documents with the default redaction options. The '...redacted.pdf' file output may be enough for your purposes. But it is very likely that you will need to customise your redaction options, which we will cover below.
411
+
412
+ ## Customising redaction options
413
+
414
+ On the 'Redaction settings' page, there are a number of options that you can tweak to better match your use case and needs.
415
+
416
+ ### Custom allow, deny, and page redaction lists
417
+
418
+ The app allows you to specify terms that should never be redacted (an allow list), terms that should always be redacted (a deny list), and also to provide a list of page numbers for pages that should be fully redacted.
419
+
420
+ ![Custom allow, deny, and page redaction lists](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/allow_list/allow_deny_full_page_list.PNG)
421
+
422
+ #### Allow list example
423
+
424
+ It may be the case that specific terms that are frequently redacted are not of interest to you for redaction.
425
+
426
+ In the redacted outputs of the 'Example of files sent to a professor before applying' PDF, you can see that it is frequently redacting references to Dr Hyde's lab in the main body of the text. Let's say that references to Dr Hyde were not considered personal information in this context. You can exclude this term from redaction (and others) by providing an 'allow list' file. This is simply a csv that contains the case sensitive terms to exclude in the first column, in our example, 'Hyde' and 'Muller glia'. The example file is provided [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/allow_list/allow_list.csv).
427
+
428
+ To import this to use with your redaction tasks, go to the 'Redaction settings' tab, click on the 'Import allow list file' button halfway down, and select the csv file you have created. It should be loaded for next time you hit the redact button. Go back to the first tab and do this.
429
+
430
+ #### Deny list example
431
+
432
+ Say you wanted to remove specific terms from a document. In this app you can do this by providing a custom deny list as a csv. Like for the allow list described above, this should be a one-column csv without a column header. The app will suggest each individual term in the list with exact spelling as whole words. So it won't select text from within words. To enable this feature, the 'CUSTOM' tag needs to be chosen as a redaction entity [(the process for adding/removing entity types to redact is described below)](#redacting-additional-types-of-personal-information).
433
+
434
+ Here is an example using the [Partnership Agreement Toolkit file](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/Partnership-Agreement-Toolkit_0_0.pdf). This is an [example of a custom deny list file](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/allow_list/partnership_toolkit_redact_custom_deny_list.csv). 'Sister', 'Sister City'
435
+ 'Sister Cities', 'Friendship City' have been listed as specific terms to redact. You can see the outputs of this redaction process on the review page:
436
+
437
+ ![Deny list redaction Partnership file](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/allow_list/deny_list_partnership_example.PNG).
438
+
439
+ You can see that the app has highlighted all instances of these terms on the page shown. You can then consider each of these terms for modification or removal on the review page [explained here](#reviewing-and-modifying-suggested-redactions).
440
+
441
+ #### Full page redaction list example
442
+
443
+ There may be full pages in a document that you want to redact. The app also provides the capability of redacting pages completely based on a list of input page numbers in a csv. The format of the input file is the same as that for the allow and deny lists described above - a one-column csv without a column header. An [example of this is here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/allow_list/partnership_toolkit_redact_some_pages.csv). You can see an example of the redacted page on the review page:
444
+
445
+ ![Whole page partnership redaction](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/allow_list/whole_page_partnership_example.PNG).
446
+
447
+ Using the above approaches to allow, deny, and full page redaction lists will give you an output [like this](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/allow_list/Partnership-Agreement-Toolkit_0_0_redacted.pdf).
448
+
449
+ #### Adding to the loaded allow, deny, and whole page lists in-app
450
+
451
+ If you open the accordion below the allow list options called 'Manually modify custom allow...', you should be able to see a few tables with options to add new rows:
452
+
453
+ ![Manually modify allow or deny list](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/allow_list/manually_modify.PNG)
454
+
455
+ If the table is empty, you can add a new row by clicking on the '+' item below each table header. If there is existing data, you may need to click on the three dots to the right and select 'Add row below'. Type the item you wish to keep/remove in the cell, and then (important) press enter to add this new item to the allow/deny/whole page list. Your output tables should look something like below.
456
+
457
+ ![Manually modify allow or deny list filled](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/allow_list/manually_modify_filled.PNG)
458
+
459
+ ### Redacting additional types of personal information
460
+
461
+ You may want to redact additional types of information beyond the defaults, or you may not be interested in default suggested entity types. There are dates in the example complaint letter. Say we wanted to redact those dates also?
462
+
463
+ Under the 'Redaction settings' tab, go to 'Entities to redact (click close to down arrow for full list)'. Different dropdowns are provided according to whether you are using the Local service to redact PII, or the AWS Comprehend service. Click within the empty box close to the dropdown arrow and you should see a list of possible 'entities' to redact. Select 'DATE_TIME' and it should appear in the main list. To remove items, click on the 'x' next to their name.
464
+
465
+ ![Redacting additional types of information dropdown](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/additional_entities/additional_entities_select.PNG)
466
+
467
+ Now, go back to the main screen and click 'Redact Document' again. You should now get a redacted version of 'Example complaint letter' that has the dates and times removed.
468
+
469
+ If you want to redact different files, I suggest you refresh your browser page to start a new session and unload all previous data.
470
+
471
+ ## Redacting only specific pages
472
+
473
+ Say also we are only interested in redacting page 1 of the loaded documents. On the Redaction settings tab, select 'Lowest page to redact' as 1, and 'Highest page to redact' also as 1. When you next redact your documents, only the first page will be modified. The output files should now have a suffix similar to '..._1_1.pdf', indicating the lowest and highest page numbers that were redacted.
474
+
475
+ ![Selecting specific pages to redact](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/allow_list/select_pages.PNG)
476
+
477
+ ## Handwriting and signature redaction
478
+
479
+ The file [Partnership Agreement Toolkit (for signatures and more advanced usage)](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/Partnership-Agreement-Toolkit_0_0.pdf) is provided as an example document to test AWS Textract + redaction with a document that has signatures in. If you have access to AWS Textract in the app, try removing all entity types from redaction on the Redaction settings and clicking the big X to the right of 'Entities to redact'.
480
+
481
+ To ensure that handwriting and signatures are enabled (enabled by default), on the front screen go to the 'AWS Textract signature detection' section to enable/disable the following options:
482
+
483
+ ![Handwriting and signatures](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/textract_handwriting_signatures.PNG)
484
+
485
+ The outputs should show handwriting/signatures redacted (see pages 5 - 7), which you can inspect and modify on the 'Review redactions' tab.
486
+
487
+ ![Handwriting and signatures redacted example](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/refs/heads/main/review_redactions/Signatures%20and%20handwriting%20found.PNG)
488
+
489
+ ## Reviewing and modifying suggested redactions
490
+
491
+ Sometimes the app will suggest redactions that are incorrect, or will miss personal information entirely. The app allows you to review and modify suggested redactions to compensate for this. You can do this on the 'Review redactions' tab.
492
+
493
+ We will go through ways to review suggested redactions with an example. On the first tab 'PDFs/images' upload the ['Example of files sent to a professor before applying.pdf'](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/example_of_emails_sent_to_a_professor_before_applying.pdf) file. Let's stick with the 'Local model - selectable text' option, and click 'Redact document'. Once the outputs are created, go to the 'Review redactions' tab.
494
+
495
+ On the 'Review redactions' tab you have a visual interface that allows you to inspect and modify redactions suggested by the app. There are quite a few options to look at, so we'll go from top to bottom.
496
+
497
+ ![Review redactions](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/review_redactions.PNG)
498
+
499
+ ### Uploading documents for review
500
+
501
+ The top area has a file upload area where you can upload files for review. In the left box, upload the original PDF file. Click '1. Upload original PDF'. In the right box, you can upload the '..._review_file.csv' that is produced by the redaction process.
502
+
503
+ Optionally, you can upload a '..._ocr_result_with_words' file here, that will allow you to search through the text and easily [add new redactions based on word search](#searching-and-adding-custom-redactions). You can also upload one of the '..._ocr_output.csv' file here that comes out of a redaction task, so that you can navigate the extracted text from the document. Click the button '2. Upload Review or OCR csv files' to load in these files.
504
+
505
+ Now you can review and modify the suggested redactions using the interface described below.
506
+
507
+ ![Search extracted text](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/search_extracted_text.PNG)
508
+
509
+ You can upload the three review files in the box (unredacted document, '..._review_file.csv' and '..._ocr_output.csv' file) before clicking '**Review redactions based on original PDF...**', as in the image below:
510
+
511
+ ![Upload three files for review](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/upload_three_files.PNG)
512
+
513
+ **NOTE:** ensure you upload the ***unredacted*** document here and not the redacted version, otherwise you will be checking over a document that already has redaction boxes applied!
514
+
515
+ ### Page navigation
516
+
517
+ You can change the page viewed either by clicking 'Previous page' or 'Next page', or by typing a specific page number in the 'Current page' box and pressing Enter on your keyboard. Each time you switch page, it will save redactions you have made on the page you are moving from, so you will not lose changes you have made.
518
+
519
+ You can also navigate to different pages by clicking on rows in the tables under 'Search suggested redactions' to the right, or 'search all extracted text' (if enabled) beneath that.
520
+
521
+ ### The document viewer pane
522
+
523
+ On the selected page, each redaction is highlighted with a box next to its suggested redaction label (e.g. person, email).
524
+
525
+ ![Document view pane](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/document_viewer_pane.PNG)
526
+
527
+ There are a number of different options to add and modify redaction boxes and page on the document viewer pane. To zoom in and out of the page, use your mouse wheel. To move around the page while zoomed, you need to be in modify mode. Scroll to the bottom of the document viewer to see the relevant controls. You should see a box icon, a hand icon, and two arrows pointing counter-clockwise and clockwise.
528
+
529
+ ![Change redaction mode](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/change_review_mode.PNG)
530
+
531
+ Click on the hand icon to go into modify mode. When you click and hold on the document viewer, this will allow you to move around the page when zoomed in. To rotate the page, you can click on either of the round arrow buttons to turn in that direction.
532
+
533
+ **NOTE:** When you switch page, the viewer will stay in your selected orientation, so if it looks strange, just rotate the page again and hopefully it will look correct!
534
+
535
+ #### Modify existing redactions (hand icon)
536
+
537
+ After clicking on the hand icon, the interface allows you to modify existing redaction boxes. When in this mode, you can click and hold on an existing box to move it.
538
+
539
+ ![Modify existing redaction box](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/modify_existing_redaction_box.PNG)
540
+
541
+ Click on one of the small boxes at the edges to change the size of the box. To delete a box, click on it to highlight it, then press delete on your keyboard. Alternatively, double click on a box and click 'Remove' on the box that appears.
542
+
543
+ ![Remove existing redaction box](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/existing_redaction_box_remove.PNG)
544
+
545
+ #### Add new redaction boxes (box icon)
546
+
547
+ To change to 'add redaction boxes' mode, scroll to the bottom of the page. Click on the box icon, and your cursor will change into a crosshair. Now you can add new redaction boxes where you wish. A popup will appear when you create a new box so you can select a label and colour for the new box.
548
+
549
+ #### 'Locking in' new redaction box format
550
+
551
+ It is possible to lock in a chosen format for new redaction boxes so that you don't have the popup appearing each time. When you make a new box, select the options for your 'locked' format, and then click on the lock icon on the left side of the popup, which should turn blue.
552
+
553
+ ![Lock redaction box format](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/new_redaction_box_lock_mode.PNG)
554
+
555
+ You can now add new redaction boxes without a popup appearing. If you want to change or 'unlock' your chosen box format, you can click on the new icon that has appeared at the bottom of the document viewer pane that looks a little like a gift tag. You can then change the defaults, or click on the lock icon again to 'unlock' the new box format - then popups will appear again each time you create a new box.
556
+
557
+ ![Change or unlock redaction box format](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/change_review_mode_with_lock.PNG)
558
+
559
+ ### Apply redactions to PDF and Save changes on current page
560
+
561
+ Once you have reviewed all the redactions in your document and you are happy with the outputs, you can click 'Apply revised redactions to PDF' to create a new '_redacted.pdf' output alongside a new '_review_file.csv' output.
562
+
563
+ If you are working on a page and haven't saved for a while, you can click 'Save changes on current page to file' to ensure that they are saved to an updated 'review_file.csv' output.
564
+
565
+ ![Review modified outputs](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/review_mod_outputs.PNG)
566
+
567
+ ### Selecting and removing redaction boxes using the 'Search suggested redactions' table
568
+
569
+ The table shows a list of all the suggested redactions in the document alongside the page, label, and text (if available).
570
+
571
+ ![Search suggested redaction area](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/list_find_labels.PNG)
572
+
573
+ If you click on one of the rows in this table, you will be taken to the page of the redaction. Clicking on a redaction row on the same page will change the colour of the redaction box to blue to help you locate it in the document viewer (just when using the app, not in redacted output PDFs).
574
+
575
+ ![Search suggested redaction area](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/review_row_highlight.PNG)
576
+
577
+ You can choose a specific entity type to see which pages the entity is present on. If you want to go to the page specified in the table, you can click on a cell in the table and the review page will be changed to that page.
578
+
579
+ To filter the 'Search suggested redactions' table you can:
580
+ 1. Click on one of the dropdowns (Redaction category, Page, Text), and select an option, or
581
+ 2. Write text in the 'Filter' box just above the table. Click the blue box to apply the filter to the table.
582
+
583
+ Once you have filtered the table, or selected a row from the table, you have a few options underneath on what you can do with the filtered rows:
584
+
585
+ - Click the **Exclude all redactions in table** button to remove all redactions visible in the table from the document. **Important:** ensure that you have clicked the blue tick icon next to the search box before doing this, or you will remove all redactions from the document. If you do end up doing this, click the 'Undo last element removal' button below to restore the redactions.
586
+ - Click the **Exclude specific redaction row** button to remove only the redaction from the last row you clicked on from the document. The currently selected row is visible below.
587
+ - Click the **Exclude all redactions with the same text as selected row** button to remove all redactions from the document that are exactly the same as the selected row text.
588
+
589
+ **NOTE**: After excluding redactions using any of the above options, click the 'Reset filters' button below to ensure that the dropdowns and table return to seeing all remaining redactions in the document.
590
+
591
+ If you made a mistake, click the 'Undo last element removal' button to restore the Search suggested redactions table to its previous state (can only undo the last action).
592
+
593
+ ### Searching and Adding Custom Redactions
594
+
595
+ After a document has been processed, you may need to redact specific terms, names, or phrases that the automatic PII (Personally Identifiable Information) detection might have missed. The **"Search text and redact"** tab gives you the power to find and redact any text within your document manually.
596
+
597
+ #### How to Use the Search and Redact Feature
598
+
599
+ The workflow is designed to be simple: **Search → Select → Redact**.
600
+
601
+ ---
602
+
603
+ #### **Step 1: Search for Text**
604
+
605
+ 1. Navigate to the **"Search text and redact"** tab.
606
+ 2. The main table will initially be populated with all the text extracted from the document for a page, broken down by word.
607
+ 3. To narrow this down, use the **"Multi-word text search"** box to type the word or phrase you want to find (this will search the whole document). If you want to do a regex-based search, tick the 'Enable regex pattern matching' box under 'Search options' below (Note this will only be able to search for patterns in text within each cell).
608
+ 4. Click the **"Search"** button or press Enter.
609
+ 5. The table below will update to show only the rows containing text that matches your search query.
610
+
611
+ > **Tip:** You can also filter the results by page number using the **"Page"** dropdown. To clear all filters and see the full text again, click the **"Reset table to original state"** button.
612
+
613
+ ---
614
+
615
+ #### **Step 2: Select and Review a Match**
616
+
617
+ When you click on any row in the search results table:
618
+
619
+ * The document preview on the left will automatically jump to that page, allowing you to see the word in its original context.
620
+ * The details of your selection will appear in the smaller **"Selected row"** table for confirmation.
621
+
622
+ ---
623
+
624
+ #### **Step 3: Choose Your Redaction Method**
625
+
626
+ You have several powerful options for redacting the text you've found:
627
+
628
+ * **Redact a Single, Specific Instance:**
629
+ * Click on the exact row in the table you want to redact.
630
+ * Click the **`Redact specific text row`** button.
631
+ * Only that single instance will be redacted.
632
+
633
+ * **Redact All Instances of a Word/Phrase:**
634
+ * Let's say you want to redact the project name "Project Alpha" everywhere it appears.
635
+ * Find and select one instance of "Project Alpha" in the table.
636
+ * Click the **`Redact all words with same text as selected row`** button.
637
+ * The application will find and redact every single occurrence of "Project Alpha" throughout the entire document.
638
+
639
+ * **Redact All Current Search Results:**
640
+ * Perform a search (e.g., for a specific person's name).
641
+ * If you are confident that every result shown in the filtered table should be redacted, click the **`Redact all text in table`** button.
642
+ * This will apply a redaction to all currently visible items in the table in one go.
643
+
644
+ ---
645
+
646
+ #### **Customising Your New Redactions**
647
+
648
+ Before you click one of the redact buttons, you can customize the appearance and label of the new redactions under the **"Search options"** accordion:
649
+
650
+ * **Label for new redactions:** Change the text that appears on the redaction box (default is "Redaction"). You could change this to "CONFIDENTIAL" or "CUSTOM".
651
+ * **Colour for labels:** Set a custom color for the redaction box by providing an RGB value. The format must be three numbers (0-255) in parentheses, for example:
652
+ * ` (255, 0, 0) ` for Red
653
+ * ` (0, 0, 0) ` for Black
654
+ * ` (255, 255, 0) ` for Yellow
655
+
656
+ #### **Undoing a Mistake**
657
+
658
+ If you make a mistake, you can reverse the last redaction action you performed on this tab.
659
+
660
+ * Click the **`Undo latest redaction`** button. This will revert the last set of redactions you added (whether it was a single row, all of a certain text, or all search results).
661
+
662
+ > **Important:** This undo button only works for the *most recent* action. It maintains a single backup state, so it cannot undo actions that are two or more steps in the past.
663
+
664
+ ### Navigating through the document using the 'Search all extracted text'
665
+
666
+ The 'search all extracted text' table will contain text if you have just redacted a document, or if you have uploaded a '..._ocr_output.csv' file alongside a document file and review file on the Review redactions tab as [described above](#uploading-documents-for-review).
667
+
668
+ You can navigate through the document using this table. When you click on a row, the Document viewer pane to the left will change to the selected page.
669
+
670
+ ![Search suggested redaction area](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/select_extracted_text.PNG)
671
+
672
+ You can search through the extracted text by using the search bar just above the table, which should filter as you type. To apply the filter and 'cut' the table, click on the blue tick inside the box next to your search term. To return the table to its original content, click the button below the table 'Reset OCR output table filter'.
673
+
674
+ ![Search suggested redaction area](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/search_extracted_text.PNG)
675
+
676
+ ## Redacting Word, tabular data files (XLSX/CSV) or copy and pasted text
677
+
678
+ ### Word or tabular data files (XLSX/CSV)
679
+
680
+ The app can be used to redact Word (.docx), or tabular data files such as xlsx or csv files. For this to work properly, your data file needs to be in a simple table format, with a single table starting from the first cell (A1), and no other information in the sheet. Similarly for .xlsx files, each sheet in the file that you want to redact should be in this simple format.
681
+
682
+ To demonstrate this, we can use [the example csv file 'combined_case_notes.csv'](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/combined_case_notes.csv), which is a small dataset of dummy social care case notes. Go to the 'Open text or Excel/csv files' tab. Drop the file into the upload area. After the file is loaded, you should see the suggested columns for redaction in the box underneath. You can select and deselect columns to redact as you wish from this list.
683
+
684
+ ![csv upload](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/tabular_files/file_upload_csv_columns.PNG)
685
+
686
+ If you were instead to upload an xlsx file, you would see also a list of all the sheets in the xlsx file that can be redacted. The 'Select columns' area underneath will suggest a list of all columns in the file across all sheets.
687
+
688
+ ![xlsx upload](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/tabular_files/file_upload_xlsx_columns.PNG)
689
+
690
+ Once you have chosen your input file and sheets/columns to redact, you can choose the redaction method. 'Local' will use the same local model as used for documents on the first tab. 'AWS Comprehend' will give better results, at a slight cost.
691
+
692
+ When you click Redact text/data files, you will see the progress of the redaction task by file and sheet, and you will receive a csv output with the redacted data.
693
+
694
+ ### Choosing output anonymisation format
695
+ You can also choose the anonymisation format of your output results. Open the tab 'Anonymisation output format' to see the options. By default, any detected PII will be replaced with the word 'REDACTED' in the cell. You can choose one of the following options as the form of replacement for the redacted text:
696
+ - replace with 'REDACTED': Replaced by the word 'REDACTED' (default)
697
+ - replace with <ENTITY_NAME>: Replaced by e.g. 'PERSON' for people, 'EMAIL_ADDRESS' for emails etc.
698
+ - redact completely: Text is removed completely and replaced by nothing.
699
+ - hash: Replaced by a unique long ID code that is consistent with entity text. I.e. a particular name will always have the same ID code.
700
+ - mask: Replace with stars '*'.
701
+
702
+ ### Redacting copy and pasted text
703
+ You can also write open text into an input box and redact that using the same methods as described above. To do this, write or paste text into the 'Enter open text' box that appears when you open the 'Redact open text' tab. Then select a redaction method, and an anonymisation output format as described above. The redacted text will be printed in the output textbox, and will also be saved to a simple csv file in the output file box.
704
+
705
+ ![Text analysis output](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/tabular_files/text_anonymisation_outputs.PNG)
706
+
707
+ ### Redaction log outputs
708
+ A list of the suggested redaction outputs from the tabular data / open text data redaction is available on the Redaction settings page under 'Log file outputs'.
709
+
710
+
711
+ ## Identifying and redacting duplicate pages
712
+
713
+ The files for this section are stored [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/duplicate_page_find_in_app/).
714
+
715
+ Some redaction tasks involve removing duplicate pages of text that may exist across multiple documents. This feature helps you find and remove duplicate content that may exist in single or multiple documents. It can identify everything from single identical pages to multi-page sections (subdocuments). The process involves three main steps: configuring the analysis, reviewing the results in the interactive interface, and then using the generated files to perform the redactions.
716
+
717
+ ### Duplicate page detection in documents
718
+
719
+ This section covers finding duplicate pages across PDF documents using OCR output files.
720
+
721
+ ![Example duplicate page inputs](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/duplicate_page_input_interface_new.PNG)
722
+
723
+ **Step 1: Upload and Configure the Analysis**
724
+ First, navigate to the "Identify duplicate pages" tab. Upload all the ocr_output.csv files you wish to compare into the file area. These files are generated every time you run a redaction task and contain the text for each page of a document.
725
+
726
+ For our example, you can upload the four 'ocr_output.csv' files provided in the example folder into the file area. Click 'Identify duplicate pages' and you will see a number of files returned. In case you want to see the original PDFs, they are available [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/duplicate_page_find_in_app/input_pdfs/).
727
+
728
+ The default options will search for matching subdocuments of any length. Before running the analysis, you can configure these matching parameters to tell the tool what you're looking for:
729
+
730
+ ![Duplicate matching parameters](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/duplicate_matching_parameters.PNG)
731
+
732
+ *Matching Parameters*
733
+ - **Similarity Threshold:** A score from 0 to 1. Pages or sequences of pages with a calculated text similarity above this value will be considered a match. The default of 0.9 (90%) is a good starting point for finding near-identical pages.
734
+ - **Min Word Count:** Pages with fewer words than this value will be completely ignored during the comparison. This is extremely useful for filtering out blank pages, title pages, or boilerplate pages that might otherwise create noise in the results. The default is 10.
735
+ - **Choosing a Matching Strategy:** You have three main options to find duplicate content.
736
+ - *'Subdocument' matching (default):* Use this to find the longest possible sequence of matching pages. The tool will find an initial match and then automatically expand it forward page-by-page until the consecutive match breaks. This is the best method for identifying complete copied chapters or sections of unknown length. This is enabled by default by ticking the "Enable 'subdocument' matching" box. This setting overrides the options described below.
737
+ - *Minimum length subdocument matching:* Use this to find sequences of consecutively matching pages with a minimum page length. For example, setting the slider to 3 will only return sections that are at least 3 pages long. How to enable: Untick the "Enable 'subdocument' matching" box and set the "Minimum consecutive pages" slider to a value greater than 1.
738
+ - *Single Page Matching:* Use this to find all individual page pairs that are similar to each other. Leave the "Enable 'subdocument' matching" box unchecked and keep the "Minimum consecutive pages" slider at 1.
739
+
740
+ Once your parameters are set, click the "Identify duplicate pages/subdocuments" button.
741
+
742
+ **Step 2: Review Results in the Interface**
743
+ After the analysis is complete, the results will be displayed directly in the interface.
744
+
745
+ *Analysis Summary:* A table will appear showing a summary of all the matches found. The columns will change depending on the matching strategy you chose. For subdocument matches, it will show the start and end pages of the matched sequence.
746
+
747
+ *Interactive Preview:* This is the most important part of the review process. Click on any row in the summary table. The full text of the matching page(s) will appear side-by-side in the "Full Text Preview" section below, allowing you to instantly verify the accuracy of the match.
748
+
749
+ ![Duplicate review interface](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/duplicate_page_output_review_overview.PNG)
750
+
751
+ **Step 3: Download and Use the Output Files**
752
+ The analysis also generates a set of downloadable files for your records and for performing redactions.
753
+
754
+
755
+ - page_similarity_results.csv: This is a detailed report of the analysis you just ran. It shows a breakdown of the pages from each file that are most similar to each other above the similarity threshold. You can compare the text in the two columns 'Page_1_Text' and 'Page_2_Text'. For single-page matches, it will list each pair of matching pages. For subdocument matches, it will list the start and end pages of each matched sequence, along with the total length of the match.
756
+
757
+ ![Page similarity file example](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/page_similarity_example.PNG)
758
+
759
+ - [Original_Filename]_pages_to_redact.csv: For each input document that was found to contain duplicate content, a separate redaction list is created. This is a simple, one-column CSV file containing a list of all page numbers that should be removed. To use these files, you can either upload the original document (i.e. the PDF) on the 'Review redactions' tab, and then click on the 'Apply relevant duplicate page output to document currently under review' button. You should see the whole pages suggested for redaction on the 'Review redactions' tab. Alternatively, you can reupload the file into the whole page redaction section as described in the ['Full page redaction list example' section](#full-page-redaction-list-example).
760
+
761
+ ![Example duplicate page redaction list](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/duplicate_page_output_interface_new.PNG)
762
+
763
+ If you want to combine the results from this redaction process with previous redaction tasks for the same PDF, you could merge review file outputs following the steps described in [Merging existing redaction review files](#merging-existing-redaction-review-files) above.
764
+
765
+ ### Duplicate detection in tabular data
766
+
767
+ The app also includes functionality to find duplicate cells or rows in CSV, Excel, or Parquet files. This is particularly useful for cleaning datasets where you need to identify and remove duplicate entries.
768
+
769
+ **Step 1: Upload files and configure analysis**
770
+
771
+ Navigate to the 'Word or Excel/csv files' tab and scroll down to the "Find duplicate cells in tabular data" section. Upload your tabular files (CSV, Excel, or Parquet) and configure the analysis parameters:
772
+
773
+ - **Similarity threshold**: Score (0-1) to consider cells a match. 1 = perfect match
774
+ - **Minimum word count**: Cells with fewer words than this value are ignored
775
+ - **Do initial clean of text**: Remove URLs, HTML tags, and non-ASCII characters
776
+ - **Remove duplicate rows**: Automatically remove duplicate rows from deduplicated files
777
+ - **Select Excel sheet names**: Choose which sheets to analyze (for Excel files)
778
+ - **Select text columns**: Choose which columns contain text to analyze
779
+
780
+ **Step 2: Review results**
781
+
782
+ After clicking "Find duplicate cells/rows", the results will be displayed in a table showing:
783
+ - File1, Row1, File2, Row2
784
+ - Similarity_Score
785
+ - Text1, Text2 (the actual text content being compared)
786
+
787
+ Click on any row to see more details about the duplicate match in the preview boxes below.
788
+
789
+ **Step 3: Remove duplicates**
790
+
791
+ Select a file from the dropdown and click "Remove duplicate rows from selected file" to create a cleaned version with duplicates removed. The cleaned file will be available for download.
792
+
793
+ # Advanced user guide
794
+
795
+ This advanced user guide covers features that require system administration access or command-line usage. These features are typically used by system administrators or advanced users who need more control over the redaction process.
796
+
797
+ ## Fuzzy search and redaction
798
+
799
+ The files for this section are stored [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/fuzzy_search/).
800
+
801
+ Sometimes you may be searching for terms that are slightly misspelled throughout a document, for example names. The document redaction app gives the option for searching for long phrases that may contain spelling mistakes, a method called 'fuzzy matching'.
802
+
803
+ To do this, go to the Redaction Settings, and the 'Select entity types to redact' area. In the box below relevant to your chosen redaction method (local or AWS Comprehend), select 'CUSTOM_FUZZY' from the list. Next, we can select the maximum number of spelling mistakes allowed in the search (up to nine). Here, you can either type in a number or use the small arrows to the right of the box. Change this option to 3. This will allow for a maximum of three 'changes' in text needed to match to the desired search terms.
804
+
805
+ The other option we can leave as is (should fuzzy search match on entire phrases in deny list) - this option would allow you to fuzzy search on each individual word in the search phrase (apart from stop words).
806
+
807
+ Next, we can upload a deny list on the same page to do the fuzzy search. A relevant deny list file can be found [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/fuzzy_search/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv) - you can upload it following [these steps](#deny-list-example). You will notice that the suggested deny list has spelling mistakes compared to phrases found in the example document.
808
+
809
+ ![Deny list example with spelling mistakes](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/fuzzy_search/img/fuzzy_deny_list_example.PNG)
810
+
811
+ Upload the [Partnership-Agreement-Toolkit file](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/Partnership-Agreement-Toolkit_0_0.pdf) into the 'Redact document' area on the first tab. Now, press the 'Redact document' button.
812
+
813
+ Using this deny list with spelling mistakes, the app will fuzzy match these terms to the correct text in the document. After redaction is complete, go to the Review redactions tab to check the outputs. You should see that the phrases in the deny list have been successfully matched.
814
+
815
+ ![Fuzzy match review outputs](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/fuzzy_search/img/fuzzy_search_review.PNG)
816
+
817
+ ## Export to and import from Adobe
818
+
819
+ Files for this section are stored [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/export_to_adobe/).
820
+
821
+ The Document Redaction app has enhanced features for working with Adobe Acrobat. You can now export suggested redactions to Adobe, import Adobe comment files into the app, and use the new `_for_review.pdf` files directly in Adobe Acrobat.
822
+
823
+ ### Using _for_review.pdf files with Adobe Acrobat
824
+
825
+ The app now generates `...redactions_for_review.pdf` files that contain the original PDF with redaction boxes overlaid but the original text still visible underneath. These files are specifically designed for use in Adobe Acrobat and other PDF viewers where you can:
826
+
827
+ - See the suggested redactions without the text being permanently removed
828
+ - Review redactions before finalising them
829
+ - Use Adobe Acrobat's built-in redaction tools to modify or apply the redactions
830
+ - Export the final redacted version directly from Adobe
831
+
832
+ Simply open the `...redactions_for_review.pdf` file in Adobe Acrobat to begin reviewing and modifying the suggested redactions.
833
+
834
+ ### Exporting to Adobe Acrobat
835
+
836
+ To convert suggested redactions to Adobe format, you need to have the original PDF and a review file csv in the input box at the top of the Review redactions page.
837
+
838
+ ![Input area for files for Adobe export](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/export_to_adobe/img/adobe_export_input_area.PNG)
839
+
840
+ Then, you can find the export to Adobe option at the bottom of the Review redactions tab. Adobe comment files will be output here.
841
+
842
+ ![Adobe export/import options](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/export_to_adobe/img/export_to_adobe_interface.PNG)
843
+
844
+ Once the input files are ready, you can click on the 'Convert review file to Adobe comment format'. You should see a file appear in the output box with a '.xfdf' file type. To use this in Adobe, after download to your computer, you should be able to double click on it, and a pop-up box will appear asking you to find the PDF file associated with it. Find the original PDF file used for your redaction task. The file should be opened up in Adobe Acrobat with the suggested redactions.
845
+
846
+ ![Suggested redactions in Adobe Acrobat](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/export_to_adobe/img/adobe_redact_example.PNG)
847
+
848
+ ### Importing from Adobe Acrobat
849
+
850
+ The app also allows you to import .xfdf files from Adobe Acrobat. To do this, go to the same Adobe import/export area as described above at the bottom of the Review Redactions tab. In this box, you need to upload a .xfdf Adobe comment file, along with the relevant original PDF for redaction.
851
+
852
+ ![Adobe import interface](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/export_to_adobe/img/import_from_adobe_interface.PNG)
853
+
854
+ When you click the 'convert .xfdf comment file to review_file.csv' button, the app should take you up to the top of the screen where the new review file has been created and can be downloaded.
855
+
856
+ ![Outputs from Adobe import](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/export_to_adobe/img/import_from_adobe_interface_outputs.PNG)
857
+
858
+ ## Using the AWS Textract document API
859
+
860
+ This option can be enabled by your system admin, in the config file ('SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS' environment variable, and subsequent variables). Using this, you will have the option to submit whole documents in quick succession to the AWS Textract service to get extracted text outputs quickly (faster than using the 'Redact document' process described here).
861
+
862
+ ### Starting a new Textract API job
863
+
864
+ To use this feature, first upload a document file in the file input box [in the usual way](#upload-files-to-the-app) on the first tab of the app. Under AWS Textract signature detection you can select whether or not you would like to analyse signatures or not (with a [cost implication](#optional---select-signature-extraction)).
865
+
866
+ Then, open the section under the heading 'Submit whole document to AWS Textract API...'.
867
+
868
+ ![Textract document API menu](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/quick_start/textract_document_api.PNG)
869
+
870
+ Click 'Analyse document with AWS Textract API call'. After a few seconds, the job should be submitted to the AWS Textract service. The box 'Job ID to check status' should now have an ID filled in. If it is not already filled with previous jobs (up to seven days old), the table should have a row added with details of the new API job.
871
+
872
+ Click the button underneath, 'Check status of Textract job and download', to see progress on the job. Processing will continue in the background until the job is ready, so it is worth periodically clicking this button to see if the outputs are ready. In testing, and as a rough estimate, it seems like this process takes about five seconds per page. However, this has not been tested with very large documents. Once ready, the '_textract.json' output should appear below.
873
+
874
+ ### Textract API job outputs
875
+
876
+ The '_textract.json' output can be used to speed up further redaction tasks as [described previously](#optional---costs-and-time-estimation), the 'Existing Textract output file found' flag should now be ticked.
877
+
878
+ ![Textract document API initial outputs](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/textract_api/textract_api_initial_outputs.PNG)
879
+
880
+ You can now easily get the '..._ocr_output.csv' redaction output based on this '_textract.json' (described in [Redaction outputs](#redaction-outputs)) by clicking on the button 'Convert Textract job outputs to OCR results'. You can now use this file e.g. for [identifying duplicate pages](#identifying-and-redacting-duplicate-pages), or for redaction review.
881
+
882
+
883
+
884
+ ## Modifying existing redaction review files
885
+ You can find the folder containing the files discussed in this section [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/merge_review_files/).
886
+
887
+ As well as serving as inputs to the document redaction app's review function, the 'review_file.csv' output can be modified inside or outside of the app. This gives you the flexibility to change redaction details outside of the app.
888
+
889
+ ### Inside the app
890
+ You can now modify redaction review files directly in the app on the 'Review redactions' tab. Open the accordion 'View and edit review data' under the file input area. You can edit review file data cells here - press Enter to apply changes. You should see the effect on the current page if you click the 'Save changes on current page to file' button to the right.
891
+
892
+ ### Outside the app
893
+ If you open up a 'review_file' csv output using a spreadsheet software program such as Microsoft Excel you can easily modify redaction properties. Open the file '[Partnership-Agreement-Toolkit_0_0_redacted.pdf_review_file_local.csv](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/merge_review_files/Partnership-Agreement-Toolkit_0_0.pdf_review_file_local.csv)', and you should see a spreadsheet with just four suggested redactions (see below). The following instructions are for using Excel.
894
+
895
+ ![Review file before](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/merge_review_files/img/review_file_before.PNG)
896
+
897
+ The first thing we can do is remove the first row - 'et' is suggested as a person, but is obviously not a genuine instance of personal information. Right click on the row number and select delete on this menu. Next, let's imagine that what the app identified as a 'phone number' was in fact another type of number and so we wanted to change the label. Simply click on the relevant label cell and change it to 'SECURITY_NUMBER'. You could also use 'Find & Select' -> 'Replace' from the top ribbon menu if you wanted to change a number of labels simultaneously.
898
+
899
+ How about we wanted to change the colour of the 'email address' entry on the redaction review tab of the redaction app? The colours in a review file are based on an RGB scale with three numbers ranging from 0-255. [You can find suitable colours here](https://rgbcolorpicker.com). Using this scale, if I wanted my review box to be pure blue, I can change the cell value to (0,0,255).
900
+
901
+ Imagine that a redaction box was slightly too small, and I didn't want to use the in-app options to change the size. In the review file csv, we can modify e.g. the ymin and ymax values for any box to increase the extent of the redaction box. For the 'email address' entry, let's decrease ymin by 5, and increase ymax by 5.
902
+
903
+ I have saved an output file following the above steps as '[Partnership-Agreement-Toolkit_0_0_redacted.pdf_review_file_local_mod.csv](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/merge_review_files/outputs/Partnership-Agreement-Toolkit_0_0.pdf_review_file_local_mod.csv)' in the same folder that the original was found. Let's upload this file to the app along with the original pdf to see how the redactions look now.
904
+
905
+ ![Review file after modification](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/merge_review_files/img/partnership_redactions_after.PNG)
906
+
907
+ We can see from the above that we have successfully removed a redaction box, changed labels, colours, and redaction box sizes.
908
+
909
+ ## Merging redaction review files
910
+
911
+ Say you have run multiple redaction tasks on the same document, and you want to merge all of these redactions together. You could do this in your spreadsheet editor, but this could be fiddly especially if dealing with multiple review files or large numbers of redactions. The app has a feature to combine multiple review files together to create a 'merged' review file.
912
+
913
+ ![Merging review files in the user interface](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/merge_review_files/img/merge_review_files_interface.PNG)
914
+
915
+ You can find this option at the bottom of the 'Redaction Settings' tab. Upload multiple review files here to get a single output 'merged' review_file. In the examples file, merging the 'review_file_custom.csv' and 'review_file_local.csv' files give you an output containing redaction boxes from both. This combined review file can then be uploaded into the review tab following the usual procedure.
916
+
917
+ ![Merging review files outputs in spreadsheet](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/merge_review_files/img/merged_review_file_outputs_csv.PNG)
918
+
919
+ # Features for expert users/system administrators
920
+ This advanced user guide covers features that require system administration access or command-line usage. These options are not enabled by default but can be configured by your system administrator, and are not available to users who are just using the graphical user interface. These features are typically used by system administrators or advanced users who need more control over the redaction process.
921
+
922
+ ## Using AWS Textract and Comprehend when not running in an AWS environment
923
+
924
+ AWS Textract and Comprehend give much better results for text extraction and document redaction than the local model options in the app. The most secure way to access them in the Redaction app is to run the app in a secure AWS environment with relevant permissions. Alternatively, you could run the app on your own system while logged in to AWS SSO with relevant permissions.
925
+
926
+ However, it is possible to access these services directly via API from outside an AWS environment by creating IAM users and access keys with relevant permissions to access AWS Textract and Comprehend services. Please check with your IT and data security teams that this approach is acceptable for your data before trying the following approaches.
927
+
928
+ To do the following, in your AWS environment you will need to create a new user with permissions for "textract:AnalyzeDocument", "textract:DetectDocumentText", and "comprehend:DetectPiiEntities". Under security credentials, create new access keys - note down the access key and secret key.
929
+
930
+ ### Direct access by passing AWS access keys through app
931
+ The Redaction Settings tab now has boxes for entering the AWS access key and secret key. If you paste the relevant keys into these boxes before performing redaction, you should be able to use these services in the app.
932
+
933
+ ### Picking up AWS access keys through an .env file
934
+ The app also has the capability of picking up AWS access key details through a .env file located in a '/config/aws_config.env' file (default), or alternative .env file location specified by the environment variable AWS_CONFIG_PATH. The env file should look like the following with just two lines:
935
+
936
+ AWS_ACCESS_KEY= your-access-key
937
+ AWS_SECRET_KEY= your-secret-key
938
+
939
+ The app should then pick up these keys when trying to access the AWS Textract and Comprehend services during redaction.
940
+
941
+ ## Advanced OCR options
942
+
943
+ The app supports advanced OCR options that combine multiple OCR engines for improved accuracy. These options are not enabled by default but can be configured by changing the app_config.env file in your '/config' folder, or system environment variables in your system.
944
+
945
+ ### Available OCR models
946
+
947
+ - **Tesseract** (default): The standard OCR engine that works well for most documents. Provides good word-level bounding box accuracy.
948
+ - **PaddleOCR**: More accurate for whole line text extraction, but word-level bounding boxes may be less precise. Best for documents with clear, well-formatted text.
949
+ - **Hybrid-paddle**: Combines Tesseract and PaddleOCR - uses Tesseract for initial extraction, then PaddleOCR for re-extraction of low-confidence text regions.
950
+ - **Hybrid-vlm**: Combines Tesseract with Vision Language Models (VLM) - uses Tesseract for initial extraction, then a VLM model (default: Dots.OCR) for re-extraction of low-confidence text.
951
+ - **Hybrid-paddle-vlm**: Combines PaddleOCR with Vision Language Models - uses PaddleOCR first, then a VLM model for low-confidence regions.
952
+
953
+ ### Enabling advanced OCR options
954
+
955
+ To enable these options, you need to modify the app_config.env file in your '/config' folder and set the following environment variables:
956
+
957
+ **Basic OCR model selection:**
958
+ ```
959
+ SHOW_LOCAL_OCR_MODEL_OPTIONS = "True"
960
+ ```
961
+
962
+ **To enable PaddleOCR options (paddle, hybrid-paddle):**
963
+ ```
964
+ SHOW_PADDLE_MODEL_OPTIONS = "True"
965
+ ```
966
+
967
+ **To enable Vision Language Model options (hybrid-vlm, hybrid-paddle-vlm):**
968
+ ```
969
+ SHOW_VLM_MODEL_OPTIONS = "True"
970
+ ```
971
+
972
+ Once enabled, users will see a "Change default local OCR model" section in the redaction settings where they can choose between the available models based on what has been enabled.
973
+
974
+ ### OCR configuration parameters
975
+
976
+ The following parameters can be configured by your system administrator to fine-tune OCR behavior:
977
+
978
+ #### Hybrid OCR settings
979
+
980
+ - **SHOW_HYBRID_MODELS** (default: False): If enabled, hybrid OCR options will be shown in the UI.
981
+ - **HYBRID_OCR_CONFIDENCE_THRESHOLD** (default: 80): Tesseract confidence score below which the secondary OCR engine (PaddleOCR or VLM) will be used for re-extraction. Lower values mean more text will be re-extracted.
982
+ - **HYBRID_OCR_PADDING** (default: 1): Padding (in pixels) added to word bounding boxes before re-extraction with the secondary engine.
983
+ - **SAVE_EXAMPLE_HYBRID_IMAGES** (default: False): If enabled, saves comparison images showing Tesseract vs. secondary engine results when using hybrid modes.
984
+ - **SAVE_PAGE_OCR_VISUALISATIONS** (default: False): If enabled, saves images with detected bounding boxes overlaid for debugging purposes.
985
+
986
+ #### Tesseract settings
987
+
988
+ - **TESSERACT_SEGMENTATION_LEVEL** (default: 11): Tesseract PSM (Page Segmentation Mode) level. Valid values are 0-13. Higher values provide more detailed segmentation but may be slower.
989
+
990
+ #### PaddleOCR settings
991
+
992
+ - **SHOW_PADDLE_MODEL_OPTIONS** (default: False): If enabled, PaddleOCR options will be shown in the UI.
993
+ - **PADDLE_USE_TEXTLINE_ORIENTATION** (default: False): If enabled, PaddleOCR will detect and correct text line orientation.
994
+ - **PADDLE_DET_DB_UNCLIP_RATIO** (default: 1.2): Controls the expansion ratio of detected text regions. Higher values expand the detection area more.
995
+ - **CONVERT_LINE_TO_WORD_LEVEL** (default: False): If enabled, converts PaddleOCR line-level results to word-level for better precision in bounding boxes (not perfect, but pretty good).
996
+ - **LOAD_PADDLE_AT_STARTUP** (default: False): If enabled, loads the PaddleOCR model when the application starts, reducing latency for first use but increasing startup time.
997
+
998
+ #### Image preprocessing
999
+
1000
+ - **PREPROCESS_LOCAL_OCR_IMAGES** (default: True): If enabled, images are preprocessed before OCR. This can improve accuracy but may slow down processing.
1001
+ - **SAVE_PREPROCESS_IMAGES** (default: False): If enabled, saves the preprocessed images for debugging purposes.
1002
+
1003
+ #### Vision Language Model (VLM) settings
1004
+
1005
+ When VLM options are enabled, the following settings are available:
1006
+
1007
+ - **SHOW_VLM_MODEL_OPTIONS** (default: False): If enabled, VLM options will be shown in the UI.
1008
+ - **SELECTED_MODEL** (default: "Dots.OCR"): The VLM model to use. Options include: "Nanonets-OCR2-3B", "Dots.OCR", "Qwen3-VL-2B-Instruct", "Qwen3-VL-4B-Instruct", "Qwen3-VL-8B-Instruct", "PaddleOCR-VL". Generally, the Qwen3-VL-8B-Instruct model is the most accurate, and vlm/inference server inference is based on using this model, but is also the slowest. Qwen3-VL-4B-Instruct can also work quite well on easier documents.
1009
+ - **MAX_SPACES_GPU_RUN_TIME** (default: 60): Maximum seconds to run GPU operations on Hugging Face Spaces.
1010
+ - **MAX_NEW_TOKENS** (default: 30): Maximum number of tokens to generate for VLM responses.
1011
+ - **MAX_INPUT_TOKEN_LENGTH** (default: 4096): Maximum number of tokens that can be input to the VLM.
1012
+ - **VLM_MAX_IMAGE_SIZE** (default: 1000000): Maximum total pixels (width × height) for images. Larger images are resized while maintaining aspect ratio.
1013
+ - **VLM_MAX_DPI** (default: 300.0): Maximum DPI for images. Higher DPI images are resized accordingly.
1014
+ - **USE_FLASH_ATTENTION** (default: False): If enabled, uses flash attention for improved VLM performance.
1015
+ - **SAVE_VLM_INPUT_IMAGES** (default: False): If enabled, saves input images sent to VLM for debugging.
1016
+
1017
+ #### General settings
1018
+
1019
+ - **MODEL_CACHE_PATH** (default: "./model_cache"): Directory where OCR models are cached.
1020
+ - **OVERWRITE_EXISTING_OCR_RESULTS** (default: False): If enabled, always creates new OCR results instead of loading from existing JSON files.
1021
+
1022
+ ### Using an alternative OCR model
1023
+
1024
+ If the SHOW_LOCAL_OCR_MODEL_OPTIONS, SHOW_PADDLE_MODEL_OPTIONS, and SHOW_INFERENCE_SERVER_OPTIONS are set to 'True' in your app_config.env file, you should see the following options available under 'Change default redaction settings...' on the front tab. The different OCR options can be used in different contexts.
1025
+
1026
+ - **Tesseract (option 'tesseract')**: Best for documents with clear, well-formatted text, providing a good balance of speed and accuracy with precise word-level bounding boxes. But struggles a lot with handwriting or 'noisy' documents (e.g. scanned documents).
1027
+ - **PaddleOCR (option 'paddle')**: More powerful than Tesseract, but slower. Does a decent job with unclear typed text on scanned documents. Also, bounding boxes may not all be accurate as they will be calculated from the line-level bounding boxes produced by Paddle after analysis.
1028
+ - **VLM (option 'vlm')**: Recommended for use with the Qwen-3-VL 8B model (can set this with the SELECTED_MODEL environment variable in config.py). This model is extremely good at identifying difficult to read handwriting and noisy documents. However, it is much slower than the above options.
1029
+ Other models are available as you can see in the tools/run_vlm.py code file. This will conduct inference with the transformers package, and quantise with bitsandbytes if the QUANTISE_VLM_MODELS environment variable is set to True. Inference with this package is *much* slower than with e.g. llama.cpp or vllm servers, which can be used with the inference-server options described below.
1030
+ - **Inference server (option 'inference-server')**: This can be used with OpenAI compatible API endpoints, for example [llama-cpp using llama-server](https://github.com/ggml-org/llama.cpp), or [vllm](https://docs.vllm.ai/en/stable). Both of these options will be much faster for inference than the VLM 'in-app' model calls described above, and produce results of a similar quality, but you will need to be able to set up the server separately.
1031
+
1032
+ #### Hybrid options
1033
+
1034
+ If the SHOW_HYBRID_MODELS environment variable is set to 'True' in your app_config.env file, you will see the hybrid model options available. The hybrid models call a smaller model (paddleOCR) to first identify bounding box position and text, and then pass text sections with low confidence to a more performant model (served in app or via an inference server such as llama.cpp or vllm) to suggest for replacement. **Note:** I have not found that the results from this analysis is significantly better than that from e.g. Paddle or VLM/inference server analysis alone (particularly when using Qwen 3 VL), but are provided for comparison.
1035
+
1036
+ - **Hybrid-paddle-vlm**: This uses PaddleOCR's line-level detection with a VLM's advanced recognition capabilities. PaddleOCR is better at identifying bounding boxes for difficult documents, and so this is probably the most usable of the three options, if you can get both Paddle and the VLM model working in the same environment.
1037
+ - **Hybrid-paddle-inference-server**: This uses PaddleOCR's line-level detection with an inference server's advanced recognition capabilities. This is the same as the Hybrid-paddle-vlm option, but uses an inference server instead of a VLM model. This allows for the use of GGUF or AWQ/GPTQ quantised models via llama.cpp or vllm servers.
1038
+
1039
+ ### Inference server options
1040
+
1041
+ If using a local inference server, I would suggest using [llama.cpp](https://github.com/ggml-org/llama.cpp) as it is much faster than transformers/torch inference, and it will offload to cpu/ram automatically rather than failing as vllm tends to do. Here is the run command I use for my llama server locally (in a WSL or Linux environment) to get deterministic results (you need at least 16GB of VRAM to run with all GPU layers assigned to your graphics card to use the following model):
1042
+
1043
+ ```
1044
+ llama-server \
1045
+ -hf unsloth/Qwen3-VL-30B-A3B-Instruct-GGUF:UD-Q4_K_XL \
1046
+ --n-gpu-layers 99 \
1047
+ --jinja \
1048
+ --temp 0 \
1049
+ --top-k 1 \
1050
+ --top-p 1 \
1051
+ --min-p 1 \
1052
+ --frequency-penalty 1 \
1053
+ --presence-penalty 1 \
1054
+ --flash-attn on \
1055
+ --ctx-size 8192 \
1056
+ --host 0.0.0.0 \
1057
+ --port 7862 \
1058
+ --image-min-tokens 1600 \
1059
+ --image-max-tokens 2301 \
1060
+ --no-warmup \
1061
+ --n-cpu-moe 13
1062
+ ```
1063
+
1064
+ If running llama.cpp on the same computer as the doc redaction app, you can then set the following variable in config/app_config.env to run:
1065
+
1066
+ ```
1067
+ SHOW_INFERENCE_SERVER_OPTIONS=True
1068
+ INFERENCE_SERVER_API_URL=http://localhost:7862
1069
+ ```
1070
+
1071
+ The above setup with host = 0.0.0.0 allows you to access this server from other computers in your home network. Find your internal ip for the computer hosting llama server (e.g. using ipconfig in Windows), and then replace 'localhost' in the above variable with this value.
1072
+
1073
+ ### Identifying people and signatures with VLMs
1074
+
1075
+ If VLM or inference server options are enabled, you can also use the VLM to identify photos of people's faces and signatures in the document, and redact them accordingly.
1076
+
1077
+ On the 'Redaction Settings' tab, select the CUSTOM_VLM_PERSON and CUSTOM_VLM_SIGNATURE entities. When you conduct an OCR task with the VLM or inference server, it will identify the bounding boxes for photos of people's faces and signatures in the document, and redact them accordingly if a redaction option is selected.
1078
+
1079
+
1080
+ ## Command Line Interface (CLI)
1081
+
1082
+ The app includes a comprehensive command-line interface (`cli_redact.py`) that allows you to perform redaction, deduplication, and AWS Textract operations directly from the terminal. This is particularly useful for batch processing, automation, and integration with other systems.
1083
+
1084
+ ### Getting started with the CLI
1085
+
1086
+ To use the CLI, you need to:
1087
+
1088
+ 1. Open a terminal window
1089
+ 2. Navigate to the app folder containing `cli_redact.py`
1090
+ 3. Activate your virtual environment (conda or venv)
1091
+ 4. Run commands using `python cli_redact.py` followed by your options
1092
+
1093
+ ### Basic CLI syntax
1094
+
1095
+ ```bash
1096
+ python cli_redact.py --task [redact|deduplicate|textract] --input_file [file_path] [additional_options]
1097
+ ```
1098
+
1099
+ ### Redaction examples
1100
+
1101
+ **Basic PDF redaction with default settings:**
1102
+ ```bash
1103
+ python cli_redact.py --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf
1104
+ ```
1105
+
1106
+ **Extract text only (no redaction) with whole page redaction:**
1107
+ ```bash
1108
+ python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --redact_whole_page_file example_data/partnership_toolkit_redact_some_pages.csv --pii_detector None
1109
+ ```
1110
+
1111
+ **Redact with custom entities and allow list:**
1112
+ ```bash
1113
+ python cli_redact.py --input_file example_data/graduate-job-example-cover-letter.pdf --allow_list_file example_data/test_allow_list_graduate.csv --local_redact_entities TITLES PERSON DATE_TIME
1114
+ ```
1115
+
1116
+ **Redact with fuzzy matching and custom deny list:**
1117
+ ```bash
1118
+ python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --deny_list_file example_data/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv --local_redact_entities CUSTOM_FUZZY --page_min 1 --page_max 3 --fuzzy_mistakes 3
1119
+ ```
1120
+
1121
+ **Redact with AWS services:**
1122
+ ```bash
1123
+ python cli_redact.py --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf --ocr_method "AWS Textract" --pii_detector "AWS Comprehend"
1124
+ ```
1125
+
1126
+ **Redact specific pages with signature extraction:**
1127
+ ```bash
1128
+ python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --page_min 6 --page_max 7 --ocr_method "AWS Textract" --handwrite_signature_extraction "Extract handwriting" "Extract signatures"
1129
+ ```
1130
+
1131
+ ### Tabular data redaction
1132
+
1133
+ **Anonymize CSV file with specific columns:**
1134
+ ```bash
1135
+ python cli_redact.py --input_file example_data/combined_case_notes.csv --text_columns "Case Note" "Client" --anon_strategy replace_redacted
1136
+ ```
1137
+
1138
+ **Anonymize Excel file:**
1139
+ ```bash
1140
+ python cli_redact.py --input_file example_data/combined_case_notes.xlsx --text_columns "Case Note" "Client" --excel_sheets combined_case_notes --anon_strategy redact
1141
+ ```
1142
+
1143
+ **Anonymize Word document:**
1144
+ ```bash
1145
+ python cli_redact.py --input_file "example_data/Bold minimalist professional cover letter.docx" --anon_strategy replace_redacted
1146
+ ```
1147
+
1148
+ ### Duplicate detection
1149
+
1150
+ **Find duplicate pages in OCR files:**
1151
+ ```bash
1152
+ python cli_redact.py --task deduplicate --input_file example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv --duplicate_type pages --similarity_threshold 0.95
1153
+ ```
1154
+
1155
+ **Find duplicates at line level:**
1156
+ ```bash
1157
+ python cli_redact.py --task deduplicate --input_file example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv --duplicate_type pages --similarity_threshold 0.95 --combine_pages False --min_word_count 3
1158
+ ```
1159
+
1160
+ **Find duplicate rows in tabular data:**
1161
+ ```bash
1162
+ python cli_redact.py --task deduplicate --input_file example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv --duplicate_type tabular --text_columns "text" --similarity_threshold 0.95
1163
+ ```
1164
+
1165
+ ### AWS Textract operations
1166
+
1167
+ **Submit document for analysis:**
1168
+ ```bash
1169
+ python cli_redact.py --task textract --textract_action submit --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf
1170
+ ```
1171
+
1172
+ **Submit with signature extraction:**
1173
+ ```bash
1174
+ python cli_redact.py --task textract --textract_action submit --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --extract_signatures
1175
+ ```
1176
+
1177
+ **Retrieve results by job ID:**
1178
+ ```bash
1179
+ python cli_redact.py --task textract --textract_action retrieve --job_id 12345678-1234-1234-1234-123456789012
1180
+ ```
1181
+
1182
+ **List recent jobs:**
1183
+ ```bash
1184
+ python cli_redact.py --task textract --textract_action list
1185
+ ```
1186
+
1187
+ ### Common CLI options
1188
+
1189
+ #### General options
1190
+
1191
+ - `--task`: Choose between "redact", "deduplicate", or "textract"
1192
+ - `--input_file`: Path to input file(s) - can specify multiple files separated by spaces
1193
+ - `--output_dir`: Directory for output files (default: output/)
1194
+ - `--input_dir`: Directory for input files (default: input/)
1195
+ - `--language`: Language of document content (e.g., "en", "es", "fr")
1196
+ - `--username`: Username for session tracking
1197
+ - `--pii_detector`: Choose PII detection method ("Local", "AWS Comprehend", or "None")
1198
+ - `--local_redact_entities`: Specify local entities to redact (space-separated list)
1199
+ - `--aws_redact_entities`: Specify AWS Comprehend entities to redact (space-separated list)
1200
+ - `--aws_access_key` / `--aws_secret_key`: AWS credentials for cloud services
1201
+ - `--aws_region`: AWS region for cloud services
1202
+ - `--s3_bucket`: S3 bucket name for cloud operations
1203
+ - `--cost_code`: Cost code for tracking usage
1204
+
1205
+ #### PDF/Image redaction options
1206
+
1207
+ - `--ocr_method`: Choose text extraction method ("AWS Textract", "Local OCR", or "Local text")
1208
+ - `--chosen_local_ocr_model`: Local OCR model to use (e.g., "tesseract", "paddle", "hybrid-paddle", "hybrid-vlm")
1209
+ - `--page_min` / `--page_max`: Process only specific page range (0 for max means all pages)
1210
+ - `--images_dpi`: DPI for image processing (default: 300.0)
1211
+ - `--preprocess_local_ocr_images`: Preprocess images before OCR (True/False)
1212
+ - `--compress_redacted_pdf`: Compress the final redacted PDF (True/False)
1213
+ - `--return_pdf_end_of_redaction`: Return PDF at end of redaction process (True/False)
1214
+ - `--allow_list_file` / `--deny_list_file`: Paths to custom allow/deny list CSV files
1215
+ - `--redact_whole_page_file`: Path to CSV file listing pages to redact completely
1216
+ - `--handwrite_signature_extraction`: Handwriting and signature extraction options for Textract ("Extract handwriting", "Extract signatures")
1217
+ - `--extract_forms`: Extract forms during Textract analysis (flag)
1218
+ - `--extract_tables`: Extract tables during Textract analysis (flag)
1219
+ - `--extract_layout`: Extract layout during Textract analysis (flag)
1220
+
1221
+ #### Tabular/Word anonymization options
1222
+
1223
+ - `--anon_strategy`: Anonymization strategy (e.g., "redact", "redact completely", "replace_redacted", "encrypt", "hash")
1224
+ - `--text_columns`: List of column names to anonymize (space-separated)
1225
+ - `--excel_sheets`: Specific Excel sheet names to process (space-separated)
1226
+ - `--fuzzy_mistakes`: Number of spelling mistakes allowed in fuzzy matching (default: 1)
1227
+ - `--match_fuzzy_whole_phrase_bool`: Match fuzzy whole phrase (True/False)
1228
+ - `--do_initial_clean`: Perform initial text cleaning for tabular data (True/False)
1229
+
1230
+ #### Duplicate detection options
1231
+
1232
+ - `--duplicate_type`: Type of duplicate detection ("pages" for OCR files or "tabular" for CSV/Excel)
1233
+ - `--similarity_threshold`: Similarity threshold (0-1) to consider content as duplicates (default: 0.95)
1234
+ - `--min_word_count`: Minimum word count for text to be considered (default: 10)
1235
+ - `--min_consecutive_pages`: Minimum number of consecutive pages to consider as a match (default: 1)
1236
+ - `--greedy_match`: Use greedy matching strategy for consecutive pages (True/False)
1237
+ - `--combine_pages`: Combine text from same page number within a file (True/False)
1238
+ - `--remove_duplicate_rows`: Remove duplicate rows from output (True/False)
1239
+
1240
+ #### Textract batch operations options
1241
+
1242
+ - `--textract_action`: Action to perform ("submit", "retrieve", or "list")
1243
+ - `--job_id`: Textract job ID for retrieve action
1244
+ - `--extract_signatures`: Extract signatures during Textract analysis (flag)
1245
+ - `--textract_bucket`: S3 bucket name for Textract operations
1246
+ - `--poll_interval`: Polling interval in seconds for job status (default: 30)
1247
+ - `--max_poll_attempts`: Maximum polling attempts before timeout (default: 120)
1248
+
1249
+ ### Output files
1250
+
1251
+ The CLI generates the same output files as the GUI:
1252
+ - `...redacted.pdf`: Final redacted document
1253
+ - `...redactions_for_review.pdf`: Document with redaction boxes for review
1254
+ - `...review_file.csv`: Detailed redaction information
1255
+ - `...ocr_results.csv`: Extracted text results
1256
+ - `..._textract.json`: AWS Textract results (if applicable)
1257
+
1258
+ For more advanced options and configuration, refer to the help text by running:
1259
+ ```bash
1260
+ python cli_redact.py --help
1261
+ ```
_quarto.yml ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ project:
2
+ type: website
3
+ output-dir: docs # Common for GitHub Pages
4
+ render:
5
+ - "*.qmd"
6
+
7
+ website:
8
+ title: "Document Redaction App"
9
+ page-navigation: true # Often enabled for floating TOC to highlight current section
10
+ back-to-top-navigation: true
11
+ search: true
12
+ navbar:
13
+ left:
14
+ - href: index.qmd
15
+ text: Home
16
+ - href: src/user_guide.qmd
17
+ text: User guide
18
+ - href: src/faq.qmd
19
+ text: User FAQ
20
+ - href: src/installation_guide.qmd
21
+ text: App installation guide (with CDK)
22
+ - href: src/app_settings.qmd
23
+ text: App settings management guide
24
+
25
+ format:
26
+ html:
27
+ theme: cosmo
28
+ css: styles.css
app.py ADDED
The diff for this file is too large to render. See raw diff
 
cdk/__init__.py ADDED
File without changes
cdk/app.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""CDK entry point for the redaction app deployment.

Regenerates the resource-check context file, loads it into the CDK app,
writes a starter config.env, and synthesizes the regional stack plus an
optional us-east-1 CloudFront stack.
"""

import os

from aws_cdk import App, Environment
from cdk_config import AWS_ACCOUNT_ID, AWS_REGION, RUN_USEAST_STACK, USE_CLOUDFRONT
from cdk_functions import create_basic_config_env, load_context_from_file
from cdk_stack import CdkStack, CdkStackCloudfront  # , CdkStackMain

# Pre-deployment helpers: check_and_set_context() inspects existing AWS
# resources and writes its findings to CONTEXT_FILE, which is loaded below.
from check_resources import CONTEXT_FILE, check_and_set_context

# Initialize the CDK app
app = App()

# --- ENHANCED CONTEXT GENERATION AND LOADING ---
# 1. Always ensure the old context file is removed before generation, so
#    synthesis never runs against stale resource information.
if os.path.exists(CONTEXT_FILE):
    try:
        os.remove(CONTEXT_FILE)
        print(f"Removed stale context file: {CONTEXT_FILE}")
    except OSError as e:
        print(f"Warning: Could not remove old context file {CONTEXT_FILE}: {e}")
        # Proceed anyway, check_and_set_context might handle overwriting

# 2. Always run the pre-check script to generate fresh context
print("Running pre-check script to generate application context...")
try:
    check_and_set_context()
    # Guard against the checker returning without producing its output file.
    if not os.path.exists(CONTEXT_FILE):
        raise RuntimeError(
            f"check_and_set_context() finished, but {CONTEXT_FILE} was not created."
        )
    print(f"Context generated successfully at {CONTEXT_FILE}.")
except Exception as e:
    raise RuntimeError(f"Failed to generate context via check_and_set_context(): {e}")

# Load the freshly generated context into the CDK app before any stack is built.
if os.path.exists(CONTEXT_FILE):
    load_context_from_file(app, CONTEXT_FILE)
else:
    raise RuntimeError(f"Could not find {CONTEXT_FILE}.")

# Create basic config.env file that user can use to run the app later. Input is the folder it is saved into.
create_basic_config_env("config")

# Define the environment for the regional stack (where ALB resides)
aws_env_regional = Environment(account=AWS_ACCOUNT_ID, region=AWS_REGION)

# Earlier split-stack layout kept for reference (subnets stack + main stack):
# regional_stack = CdkStack(app,
# "RedactionStackSubnets",
# env=aws_env_regional,
# cross_region_references=True)

# regional_stack_main = CdkStackMain(app,
# "RedactionStackMain",
# env=aws_env_regional,
# private_subnets=regional_stack.params["private_subnets"],
# private_route_tables=regional_stack.params["private_route_tables"],
# public_subnets=regional_stack.params["public_subnets"],
# public_route_tables=regional_stack.params["public_route_tables"],
# cross_region_references=True)

# Single combined regional stack (ALB, security groups, compute, etc.).
regional_stack = CdkStack(
    app, "RedactionStack", env=aws_env_regional, cross_region_references=True
)

# Config values arrive as strings, hence the string comparison to "True".
if USE_CLOUDFRONT == "True" and RUN_USEAST_STACK == "True":
    # Define the environment for the CloudFront stack (always us-east-1 for CF-level resources like WAFv2 WebACLs for CF)
    aws_env_us_east_1 = Environment(account=AWS_ACCOUNT_ID, region="us-east-1")

    # Create the CloudFront stack, passing the outputs from the regional stack
    cloudfront_stack = CdkStackCloudfront(
        app,
        "RedactionStackCloudfront",
        env=aws_env_us_east_1,
        alb_arn=regional_stack.params["alb_arn_output"],
        alb_sec_group_id=regional_stack.params["alb_security_group_id"],
        alb_dns_name=regional_stack.params["alb_dns_name"],
        cross_region_references=True,
    )


# Synthesize the CloudFormation template
app.synth(validate_on_synthesis=True)
cdk/cdk_config.py ADDED
@@ -0,0 +1,362 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+
4
+ from dotenv import load_dotenv
5
+
6
+ # Set or retrieve configuration variables for CDK redaction deployment
7
+
8
+
9
def convert_string_to_boolean(value: str) -> bool:
    """
    Convert a string to a boolean, handling various formats.

    Booleans are returned unchanged. Strings are normalised (stripped and
    lower-cased) before comparison, so "True", "TRUE", " true " and "tRuE"
    are all accepted, as are "1"/"0".

    Args:
        value: A bool, or a string representing a boolean.

    Returns:
        The corresponding boolean value.

    Raises:
        ValueError: If the value is not a recognisable boolean string.
    """
    if isinstance(value, bool):
        return value
    normalised = str(value).strip().lower()
    if normalised in ("true", "1"):
        return True
    if normalised in ("false", "0"):
        return False
    raise ValueError(f"Invalid boolean value: {value}")
19
+
20
+
21
def get_or_create_env_var(var_name: str, default_value: str, print_val: bool = False):
    """
    Fetch an environment variable, seeding it with a default when unset.

    Args:
        var_name: Name of the environment variable.
        default_value: Value stored and returned when the variable is absent.
        print_val: When True, print the resolved value.

    Returns:
        The variable's value (existing or freshly-set default).
    """
    if var_name not in os.environ:
        # Persist the default so later lookups (and child processes) see it.
        os.environ[var_name] = default_value
    value = os.environ[var_name]

    if print_val is True:
        print(f"The value of {var_name} is {value}")

    return value
37
+
38
+
39
def ensure_folder_exists(output_folder: str):
    """Create the given folder if it is missing, reporting what happened."""
    already_there = os.path.exists(output_folder)
    if already_there:
        print(f"The {output_folder} folder already exists.")
    else:
        # exist_ok guards against a race where the folder appears between
        # the existence check and the creation call.
        os.makedirs(output_folder, exist_ok=True)
        print(f"Created the {output_folder} folder.")
48
+
49
+
50
def add_folder_to_path(folder_path: str):
    """
    If folder_path exists, prepend its absolute path to the PATH environment
    variable (unless it is already present). Only relevant for locally-built
    executables of this app, where pyinstaller bundles tools such as
    tesseract and poppler in an _internal folder that must be on PATH.
    """
    if not (os.path.exists(folder_path) and os.path.isdir(folder_path)):
        print(f"Folder not found at {folder_path} - not added to PATH")
        return

    print(folder_path, "folder exists.")

    # Resolve to an absolute path before comparing against PATH entries.
    absolute_path = os.path.abspath(folder_path)
    path_entries = os.environ["PATH"].split(os.pathsep)

    if absolute_path in path_entries:
        print(f"Directory {folder_path} already exists in PATH.")
    else:
        os.environ["PATH"] = absolute_path + os.pathsep + os.environ["PATH"]
70
+
71
+
72
+ ###
73
+ # LOAD CONFIG FROM ENV FILE
74
+ ###
75
+ CONFIG_FOLDER = get_or_create_env_var("CONFIG_FOLDER", "config/")
76
+
77
+ ensure_folder_exists(CONFIG_FOLDER)
78
+
79
+ # If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. 'config/cdk_config.env'
80
+ CDK_CONFIG_PATH = get_or_create_env_var(
81
+ "CDK_CONFIG_PATH", "config/cdk_config.env"
82
+ ) # e.g. config/cdk_config.env
83
+
84
+ if CDK_CONFIG_PATH:
85
+ if os.path.exists(CDK_CONFIG_PATH):
86
+ print(f"Loading CDK variables from config file {CDK_CONFIG_PATH}")
87
+ load_dotenv(CDK_CONFIG_PATH)
88
+ else:
89
+ print("CDK config file not found at location:", CDK_CONFIG_PATH)
90
+
91
+ ###
92
+ # AWS OPTIONS
93
+ ###
94
+ AWS_REGION = get_or_create_env_var("AWS_REGION", "")
95
+ AWS_ACCOUNT_ID = get_or_create_env_var("AWS_ACCOUNT_ID", "")
96
+
97
+ ###
98
+ # CDK OPTIONS
99
+ ###
100
+ CDK_PREFIX = get_or_create_env_var("CDK_PREFIX", "")
101
+ CONTEXT_FILE = get_or_create_env_var(
102
+ "CONTEXT_FILE", "cdk.context.json"
103
+ ) # Define the CDK output context file name
104
+ CDK_FOLDER = get_or_create_env_var(
105
+ "CDK_FOLDER", ""
106
+ ) # FULL_PATH_TO_CDK_FOLDER_HERE (with forward slash)
107
+ RUN_USEAST_STACK = get_or_create_env_var("RUN_USEAST_STACK", "False")
108
+
109
+ ### VPC and connections
110
+ VPC_NAME = get_or_create_env_var("VPC_NAME", "")
111
+ NEW_VPC_DEFAULT_NAME = get_or_create_env_var("NEW_VPC_DEFAULT_NAME", f"{CDK_PREFIX}vpc")
112
+ NEW_VPC_CIDR = get_or_create_env_var("NEW_VPC_CIDR", "") # "10.0.0.0/24"
113
+
114
+
115
+ EXISTING_IGW_ID = get_or_create_env_var("EXISTING_IGW_ID", "")
116
+ SINGLE_NAT_GATEWAY_ID = get_or_create_env_var("SINGLE_NAT_GATEWAY_ID", "")
117
+
118
+ ### SUBNETS / ROUTE TABLES / NAT GATEWAY
119
+ PUBLIC_SUBNETS_TO_USE = get_or_create_env_var(
120
+ "PUBLIC_SUBNETS_TO_USE", ""
121
+ ) # e.g. ['PublicSubnet1', 'PublicSubnet2']
122
+ PUBLIC_SUBNET_CIDR_BLOCKS = get_or_create_env_var(
123
+ "PUBLIC_SUBNET_CIDR_BLOCKS", ""
124
+ ) # e.g. ["10.0.1.0/24", "10.0.2.0/24"]
125
+ PUBLIC_SUBNET_AVAILABILITY_ZONES = get_or_create_env_var(
126
+ "PUBLIC_SUBNET_AVAILABILITY_ZONES", ""
127
+ ) # e.g. ["eu-east-1b", "eu-east1b"]
128
+
129
+ PRIVATE_SUBNETS_TO_USE = get_or_create_env_var(
130
+ "PRIVATE_SUBNETS_TO_USE", ""
131
+ ) # e.g. ['PrivateSubnet1', 'PrivateSubnet2']
132
+ PRIVATE_SUBNET_CIDR_BLOCKS = get_or_create_env_var(
133
+ "PRIVATE_SUBNET_CIDR_BLOCKS", ""
134
+ ) # e.g. ["10.0.1.0/24", "10.0.2.0/24"]
135
+ PRIVATE_SUBNET_AVAILABILITY_ZONES = get_or_create_env_var(
136
+ "PRIVATE_SUBNET_AVAILABILITY_ZONES", ""
137
+ ) # e.g. ["eu-east-1b", "eu-east1b"]
138
+
139
+ ROUTE_TABLE_BASE_NAME = get_or_create_env_var(
140
+ "ROUTE_TABLE_BASE_NAME", f"{CDK_PREFIX}PrivateRouteTable"
141
+ )
142
+ NAT_GATEWAY_EIP_NAME = get_or_create_env_var(
143
+ "NAT_GATEWAY_EIP_NAME", f"{CDK_PREFIX}NatGatewayEip"
144
+ )
145
+ NAT_GATEWAY_NAME = get_or_create_env_var("NAT_GATEWAY_NAME", f"{CDK_PREFIX}NatGateway")
146
+
147
+ # IAM roles
148
+ AWS_MANAGED_TASK_ROLES_LIST = get_or_create_env_var(
149
+ "AWS_MANAGED_TASK_ROLES_LIST",
150
+ '["AmazonCognitoReadOnly", "service-role/AmazonECSTaskExecutionRolePolicy", "AmazonS3FullAccess", "AmazonTextractFullAccess", "ComprehendReadOnly", "AmazonDynamoDBFullAccess", "service-role/AWSAppSyncPushToCloudWatchLogs"]',
151
+ )
152
+ POLICY_FILE_LOCATIONS = get_or_create_env_var(
153
+ "POLICY_FILE_LOCATIONS", ""
154
+ ) # e.g. '["config/sts_permissions.json"]'
155
+ POLICY_FILE_ARNS = get_or_create_env_var("POLICY_FILE_ARNS", "")
156
+
157
+ # GITHUB REPO
158
+ GITHUB_REPO_USERNAME = get_or_create_env_var("GITHUB_REPO_USERNAME", "seanpedrick-case")
159
+ GITHUB_REPO_NAME = get_or_create_env_var("GITHUB_REPO_NAME", "doc_redaction")
160
+ GITHUB_REPO_BRANCH = get_or_create_env_var("GITHUB_REPO_BRANCH", "main")
161
+
162
+ ### CODEBUILD
163
+ CODEBUILD_ROLE_NAME = get_or_create_env_var(
164
+ "CODEBUILD_ROLE_NAME", f"{CDK_PREFIX}CodeBuildRole"
165
+ )
166
+ CODEBUILD_PROJECT_NAME = get_or_create_env_var(
167
+ "CODEBUILD_PROJECT_NAME", f"{CDK_PREFIX}CodeBuildProject"
168
+ )
169
+
170
+ ### ECR
171
+ ECR_REPO_NAME = get_or_create_env_var(
172
+ "ECR_REPO_NAME", "doc-redaction"
173
+ ) # Beware - cannot have underscores and must be lower case
174
+ ECR_CDK_REPO_NAME = get_or_create_env_var(
175
+ "ECR_CDK_REPO_NAME", f"{CDK_PREFIX}{ECR_REPO_NAME}".lower()
176
+ )
177
+
178
+ ### S3
179
+ S3_LOG_CONFIG_BUCKET_NAME = get_or_create_env_var(
180
+ "S3_LOG_CONFIG_BUCKET_NAME", f"{CDK_PREFIX}s3-logs".lower()
181
+ ) # S3 bucket names need to be lower case
182
+ S3_OUTPUT_BUCKET_NAME = get_or_create_env_var(
183
+ "S3_OUTPUT_BUCKET_NAME", f"{CDK_PREFIX}s3-output".lower()
184
+ )
185
+
186
+ ### KMS KEYS FOR S3 AND SECRETS MANAGER
187
+ USE_CUSTOM_KMS_KEY = get_or_create_env_var("USE_CUSTOM_KMS_KEY", "1")
188
+ CUSTOM_KMS_KEY_NAME = get_or_create_env_var(
189
+ "CUSTOM_KMS_KEY_NAME", f"alias/{CDK_PREFIX}kms-key".lower()
190
+ )
191
+
192
+ ### ECS
193
+ FARGATE_TASK_DEFINITION_NAME = get_or_create_env_var(
194
+ "FARGATE_TASK_DEFINITION_NAME", f"{CDK_PREFIX}FargateTaskDefinition"
195
+ )
196
+ TASK_DEFINITION_FILE_LOCATION = get_or_create_env_var(
197
+ "TASK_DEFINITION_FILE_LOCATION", CDK_FOLDER + CONFIG_FOLDER + "task_definition.json"
198
+ )
199
+
200
+ CLUSTER_NAME = get_or_create_env_var("CLUSTER_NAME", f"{CDK_PREFIX}Cluster")
201
+ ECS_SERVICE_NAME = get_or_create_env_var("ECS_SERVICE_NAME", f"{CDK_PREFIX}ECSService")
202
+ ECS_TASK_ROLE_NAME = get_or_create_env_var(
203
+ "ECS_TASK_ROLE_NAME", f"{CDK_PREFIX}TaskRole"
204
+ )
205
+ ECS_TASK_EXECUTION_ROLE_NAME = get_or_create_env_var(
206
+ "ECS_TASK_EXECUTION_ROLE_NAME", f"{CDK_PREFIX}ExecutionRole"
207
+ )
208
+ ECS_SECURITY_GROUP_NAME = get_or_create_env_var(
209
+ "ECS_SECURITY_GROUP_NAME", f"{CDK_PREFIX}SecurityGroupECS"
210
+ )
211
+ ECS_LOG_GROUP_NAME = get_or_create_env_var(
212
+ "ECS_LOG_GROUP_NAME", f"/ecs/{ECS_SERVICE_NAME}-logs".lower()
213
+ )
214
+
215
+ ECS_TASK_CPU_SIZE = get_or_create_env_var("ECS_TASK_CPU_SIZE", "1024")
216
+ ECS_TASK_MEMORY_SIZE = get_or_create_env_var("ECS_TASK_MEMORY_SIZE", "4096")
217
+ ECS_USE_FARGATE_SPOT = get_or_create_env_var("USE_FARGATE_SPOT", "False")
218
+ ECS_READ_ONLY_FILE_SYSTEM = get_or_create_env_var("ECS_READ_ONLY_FILE_SYSTEM", "True")
219
+
220
+ ### Cognito
221
+ COGNITO_USER_POOL_NAME = get_or_create_env_var(
222
+ "COGNITO_USER_POOL_NAME", f"{CDK_PREFIX}UserPool"
223
+ )
224
+ COGNITO_USER_POOL_CLIENT_NAME = get_or_create_env_var(
225
+ "COGNITO_USER_POOL_CLIENT_NAME", f"{CDK_PREFIX}UserPoolClient"
226
+ )
227
+ COGNITO_USER_POOL_CLIENT_SECRET_NAME = get_or_create_env_var(
228
+ "COGNITO_USER_POOL_CLIENT_SECRET_NAME", f"{CDK_PREFIX}ParamCognitoSecret"
229
+ )
230
+ COGNITO_USER_POOL_DOMAIN_PREFIX = get_or_create_env_var(
231
+ "COGNITO_USER_POOL_DOMAIN_PREFIX", "redaction-app-domain"
232
+ ) # Should change this to something unique or you'll probably hit an error
233
+
234
+ COGNITO_REFRESH_TOKEN_VALIDITY = int(
235
+ get_or_create_env_var("COGNITO_REFRESH_TOKEN_VALIDITY", "480")
236
+ ) # Minutes
237
+ COGNITO_ID_TOKEN_VALIDITY = int(
238
+ get_or_create_env_var("COGNITO_ID_TOKEN_VALIDITY", "60")
239
+ ) # Minutes
240
+ COGNITO_ACCESS_TOKEN_VALIDITY = int(
241
+ get_or_create_env_var("COGNITO_ACCESS_TOKEN_VALIDITY", "60")
242
+ ) # Minutes
243
+
244
+ # Application load balancer
245
+ ALB_NAME = get_or_create_env_var(
246
+ "ALB_NAME", f"{CDK_PREFIX}Alb"[-32:]
247
+ ) # Application load balancer name can be max 32 characters, so taking the last 32 characters of the suggested name
248
+ ALB_NAME_SECURITY_GROUP_NAME = get_or_create_env_var(
249
+ "ALB_SECURITY_GROUP_NAME", f"{CDK_PREFIX}SecurityGroupALB"
250
+ )
251
+ ALB_TARGET_GROUP_NAME = get_or_create_env_var(
252
+ "ALB_TARGET_GROUP_NAME", f"{CDK_PREFIX}-tg"[-32:]
253
+ ) # Max 32 characters
254
EXISTING_LOAD_BALANCER_ARN = get_or_create_env_var("EXISTING_LOAD_BALANCER_ARN", "")
# Fixed: this previously read the "EXISTING_LOAD_BALANCER_ARN" variable by
# mistake (copy-paste), which — because the ARN variable was just seeded to ""
# above — always resolved the DNS setting to "" instead of the placeholder.
EXISTING_LOAD_BALANCER_DNS = get_or_create_env_var(
    "EXISTING_LOAD_BALANCER_DNS", "placeholder_load_balancer_dns.net"
)
258
+
259
+ ## CLOUDFRONT
260
+ USE_CLOUDFRONT = get_or_create_env_var("USE_CLOUDFRONT", "True")
261
+ CLOUDFRONT_PREFIX_LIST_ID = get_or_create_env_var(
262
+ "CLOUDFRONT_PREFIX_LIST_ID", "pl-93a247fa"
263
+ )
264
+ CLOUDFRONT_GEO_RESTRICTION = get_or_create_env_var(
265
+ "CLOUDFRONT_GEO_RESTRICTION", ""
266
+ ) # A country that Cloudfront restricts access to. See here: https://docs.aws.amazon.com/AmazonCloudFront/latest/DeveloperGuide/georestrictions.html
267
+ CLOUDFRONT_DISTRIBUTION_NAME = get_or_create_env_var(
268
+ "CLOUDFRONT_DISTRIBUTION_NAME", f"{CDK_PREFIX}CfDist"
269
+ )
270
+ CLOUDFRONT_DOMAIN = get_or_create_env_var(
271
+ "CLOUDFRONT_DOMAIN", "cloudfront_placeholder.net"
272
+ )
273
+
274
+
275
+ # Certificate for Application load balancer (optional, for HTTPS and logins through the ALB)
276
+ ACM_SSL_CERTIFICATE_ARN = get_or_create_env_var("ACM_SSL_CERTIFICATE_ARN", "")
277
+ SSL_CERTIFICATE_DOMAIN = get_or_create_env_var(
278
+ "SSL_CERTIFICATE_DOMAIN", ""
279
+ ) # e.g. example.com or www.example.com
280
+
281
+ # This should be the CloudFront domain, the domain linked to your ACM certificate, or the DNS of your application load balancer in console afterwards
282
+ if USE_CLOUDFRONT == "True":
283
+ COGNITO_REDIRECTION_URL = get_or_create_env_var(
284
+ "COGNITO_REDIRECTION_URL", "https://" + CLOUDFRONT_DOMAIN
285
+ )
286
+ elif SSL_CERTIFICATE_DOMAIN:
287
+ COGNITO_REDIRECTION_URL = get_or_create_env_var(
288
+ "COGNITO_REDIRECTION_URL", "https://" + SSL_CERTIFICATE_DOMAIN
289
+ )
290
+ else:
291
+ COGNITO_REDIRECTION_URL = get_or_create_env_var(
292
+ "COGNITO_REDIRECTION_URL", "https://" + EXISTING_LOAD_BALANCER_DNS
293
+ )
294
+
295
+ # Custom headers e.g. if routing traffic through Cloudfront
296
+ CUSTOM_HEADER = get_or_create_env_var(
297
+ "CUSTOM_HEADER", ""
298
+ ) # Retrieving or setting CUSTOM_HEADER
299
+ CUSTOM_HEADER_VALUE = get_or_create_env_var(
300
+ "CUSTOM_HEADER_VALUE", ""
301
+ ) # Retrieving or setting CUSTOM_HEADER_VALUE
302
+
303
+ # Firewall on top of load balancer
304
+ LOAD_BALANCER_WEB_ACL_NAME = get_or_create_env_var(
305
+ "LOAD_BALANCER_WEB_ACL_NAME", f"{CDK_PREFIX}alb-web-acl"
306
+ )
307
+
308
+ # Firewall on top of CloudFront
309
+ WEB_ACL_NAME = get_or_create_env_var("WEB_ACL_NAME", f"{CDK_PREFIX}cloudfront-web-acl")
310
+
311
+ ###
312
+ # File I/O options
313
+ ###
314
+
315
OUTPUT_FOLDER = get_or_create_env_var("GRADIO_OUTPUT_FOLDER", "output/")  # 'output/'
INPUT_FOLDER = get_or_create_env_var("GRADIO_INPUT_FOLDER", "input/")  # 'input/'

# Allow for files to be saved in a temporary folder for increased security in some instances
if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
    # Use mkdtemp() rather than TemporaryDirectory() as a context manager:
    # the latter deletes the directory as soon as the `with` block exits,
    # leaving OUTPUT_FOLDER/INPUT_FOLDER pointing at a path that no longer
    # exists by the time the app tries to write to it.
    temp_dir = tempfile.mkdtemp()
    print(f"Temporary directory created at: {temp_dir}")

    if OUTPUT_FOLDER == "TEMP":
        OUTPUT_FOLDER = temp_dir + "/"
    if INPUT_FOLDER == "TEMP":
        INPUT_FOLDER = temp_dir + "/"
328
+
329
+ ###
330
+ # LOGGING OPTIONS
331
+ ###
332
+
333
+ SAVE_LOGS_TO_CSV = get_or_create_env_var("SAVE_LOGS_TO_CSV", "True")
334
+
335
+ ### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
336
+ SAVE_LOGS_TO_DYNAMODB = get_or_create_env_var("SAVE_LOGS_TO_DYNAMODB", "True")
337
+ ACCESS_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
338
+ "ACCESS_LOG_DYNAMODB_TABLE_NAME", f"{CDK_PREFIX}dynamodb-access-logs".lower()
339
+ )
340
+ FEEDBACK_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
341
+ "FEEDBACK_LOG_DYNAMODB_TABLE_NAME", f"{CDK_PREFIX}dynamodb-feedback-logs".lower()
342
+ )
343
+ USAGE_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
344
+ "USAGE_LOG_DYNAMODB_TABLE_NAME", f"{CDK_PREFIX}dynamodb-usage-logs".lower()
345
+ )
346
+
347
+ ###
348
+ # REDACTION OPTIONS
349
+ ###
350
+
351
+ # Get some environment variables and Launch the Gradio app
352
+ COGNITO_AUTH = get_or_create_env_var("COGNITO_AUTH", "0")
353
+
354
+ GRADIO_SERVER_PORT = int(get_or_create_env_var("GRADIO_SERVER_PORT", "7860"))
355
+
356
+ ###
357
+ # WHOLE DOCUMENT API OPTIONS
358
+ ###
359
+
360
+ DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = get_or_create_env_var(
361
+ "DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS", "7"
362
+ ) # How many days into the past should whole document Textract jobs be displayed? After that, the data is not deleted from the Textract jobs csv, but it is just filtered out. Included to align with S3 buckets where the file outputs will be automatically deleted after X days.
cdk/cdk_functions.py ADDED
@@ -0,0 +1,1482 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ipaddress
2
+ import json
3
+ import os
4
+ from typing import Any, Dict, List, Optional, Tuple
5
+
6
+ import boto3
7
+ import pandas as pd
8
+ from aws_cdk import App, CfnOutput, CfnTag, Tags
9
+ from aws_cdk import aws_cognito as cognito
10
+ from aws_cdk import aws_ec2 as ec2
11
+ from aws_cdk import aws_elasticloadbalancingv2 as elb
12
+ from aws_cdk import aws_elasticloadbalancingv2_actions as elb_act
13
+ from aws_cdk import aws_iam as iam
14
+ from aws_cdk import aws_wafv2 as wafv2
15
+ from botocore.exceptions import ClientError
16
+ from cdk_config import (
17
+ ACCESS_LOG_DYNAMODB_TABLE_NAME,
18
+ AWS_REGION,
19
+ FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
20
+ NAT_GATEWAY_EIP_NAME,
21
+ POLICY_FILE_LOCATIONS,
22
+ PRIVATE_SUBNET_AVAILABILITY_ZONES,
23
+ PRIVATE_SUBNET_CIDR_BLOCKS,
24
+ PRIVATE_SUBNETS_TO_USE,
25
+ PUBLIC_SUBNET_AVAILABILITY_ZONES,
26
+ PUBLIC_SUBNET_CIDR_BLOCKS,
27
+ PUBLIC_SUBNETS_TO_USE,
28
+ S3_LOG_CONFIG_BUCKET_NAME,
29
+ S3_OUTPUT_BUCKET_NAME,
30
+ USAGE_LOG_DYNAMODB_TABLE_NAME,
31
+ )
32
+ from constructs import Construct
33
+ from dotenv import set_key
34
+
35
+
36
+ # --- Function to load context from file ---
37
def load_context_from_file(app: App, file_path: str):
    """Load JSON key/value pairs from file_path into the CDK app's context."""
    if not os.path.exists(file_path):
        print(f"Context file not found: {file_path}")
        return

    with open(file_path, "r") as context_file:
        loaded_context = json.load(context_file)

    for context_key, context_value in loaded_context.items():
        app.node.set_context(context_key, context_value)
    print(f"Loaded context from {file_path}")
46
+
47
+
48
+ # --- Helper to parse environment variables into lists ---
49
+ def _get_env_list(env_var_name: str) -> List[str]:
50
+ """Parses a comma-separated environment variable into a list of strings."""
51
+ value = env_var_name[1:-1].strip().replace('"', "").replace("'", "")
52
+ if not value:
53
+ return []
54
+ # Split by comma and filter out any empty strings that might result from extra commas
55
+ return [s.strip() for s in value.split(",") if s.strip()]
56
+
57
+
58
# 1. Try to load CIDR/AZs from environment variables
# NOTE: _get_env_list must be given the variable's VALUE. The original code
# passed the literal variable NAME string for the CIDR/AZ settings (e.g.
# _get_env_list("PUBLIC_SUBNET_CIDR_BLOCKS")), so the parsed lists contained
# fragments of the name rather than the configured values.
if PUBLIC_SUBNETS_TO_USE:
    PUBLIC_SUBNETS_TO_USE = _get_env_list(PUBLIC_SUBNETS_TO_USE)
if PRIVATE_SUBNETS_TO_USE:
    PRIVATE_SUBNETS_TO_USE = _get_env_list(PRIVATE_SUBNETS_TO_USE)

if PUBLIC_SUBNET_CIDR_BLOCKS:
    PUBLIC_SUBNET_CIDR_BLOCKS = _get_env_list(PUBLIC_SUBNET_CIDR_BLOCKS)
if PUBLIC_SUBNET_AVAILABILITY_ZONES:
    PUBLIC_SUBNET_AVAILABILITY_ZONES = _get_env_list(PUBLIC_SUBNET_AVAILABILITY_ZONES)
if PRIVATE_SUBNET_CIDR_BLOCKS:
    PRIVATE_SUBNET_CIDR_BLOCKS = _get_env_list(PRIVATE_SUBNET_CIDR_BLOCKS)
if PRIVATE_SUBNET_AVAILABILITY_ZONES:
    PRIVATE_SUBNET_AVAILABILITY_ZONES = _get_env_list(PRIVATE_SUBNET_AVAILABILITY_ZONES)

if POLICY_FILE_LOCATIONS:
    POLICY_FILE_LOCATIONS = _get_env_list(POLICY_FILE_LOCATIONS)
77
+
78
+
79
def check_for_existing_role(role_name: str):
    """
    Check whether an IAM role with the given name exists.

    Args:
        role_name: Name of the IAM role to look up.

    Returns:
        A tuple (exists: bool, role_arn: str, ""): (True, arn, "") when the
        role exists, (False, "", "") when it does not.

    Raises:
        Exception: For any failure other than the role not existing.
    """
    # Create the client OUTSIDE the try block: previously it was created
    # inside, so a failed client creation made the
    # `except iam.exceptions.NoSuchEntityException` clause raise
    # UnboundLocalError. The new local name also avoids shadowing the
    # module-level `aws_iam as iam` import.
    iam_client = boto3.client("iam")
    try:
        response = iam_client.get_role(RoleName=role_name)
        role_arn = response["Role"]["Arn"]

        print("Response Role:", role_arn)

        return True, role_arn, ""
    except iam_client.exceptions.NoSuchEntityException:
        return False, "", ""
    except Exception as e:
        # Chain the original exception so the root cause is preserved.
        raise Exception(f"Getting information on IAM role failed due to: {e}") from e
94
+
95
+
96
+ from typing import List
97
+
98
+ # Assume POLICY_FILE_LOCATIONS is defined globally or passed as a default
99
+ # For example:
100
+ # POLICY_FILE_LOCATIONS = ["./policies/my_read_policy.json", "./policies/my_write_policy.json"]
101
+
102
+
103
def add_statement_to_policy(role: iam.IRole, policy_document: Dict[str, Any]):
    """
    Attach each statement of a parsed IAM policy document to a CDK Role.

    Args:
        role: The CDK Role construct to attach policies to.
        policy_document: A Python dictionary representing an IAM policy document.
    """
    statements = policy_document.get("Statement")
    # A valid policy document must carry a list under "Statement".
    if not isinstance(statements, list):
        print("Warning: Policy document does not contain a 'Statement' list. Skipping.")
        return

    for statement_dict in statements:
        # Statements are processed independently: one malformed entry is
        # logged and skipped without aborting the rest.
        try:
            role.add_to_policy(iam.PolicyStatement.from_json(statement_dict))
            print(f" - Added statement: {statement_dict.get('Sid', 'No Sid')}")
        except Exception as e:
            print(
                f"Warning: Could not process policy statement: {statement_dict}. Error: {e}"
            )
130
+
131
+
132
def add_custom_policies(
    scope: Construct,  # Not strictly used here, but good practice if you expand to ManagedPolicies
    role: iam.IRole,
    policy_file_locations: Optional[List[str]] = None,
    custom_policy_text: Optional[str] = None,
) -> iam.IRole:
    """
    Loads custom policies from JSON files or a string and attaches them to a CDK Role.

    Args:
        scope: The scope in which to define constructs (if needed, e.g., for iam.ManagedPolicy).
        role: The CDK Role construct to attach policies to.
        policy_file_locations: List of file paths to JSON policy documents.
        custom_policy_text: A JSON string representing a policy document.

    Returns:
        The modified CDK Role construct.
    """
    if policy_file_locations is None:
        policy_file_locations = []

    # Tracks which input is currently being processed so the outer exception
    # handler can name it in its error message.
    current_source = "unknown source"  # For error messages

    try:
        if policy_file_locations:
            print(f"Attempting to add policies from files to role {role.node.id}...")
            for path in policy_file_locations:
                current_source = f"file: {path}"
                # Each file is handled independently: a missing or malformed
                # file is logged and skipped rather than aborting the run.
                try:
                    with open(path, "r") as f:
                        policy_document = json.load(f)
                    print(f"Processing policy from {current_source}...")
                    add_statement_to_policy(role, policy_document)
                except FileNotFoundError:
                    print(f"Warning: Policy file not found at {path}. Skipping.")
                except json.JSONDecodeError as e:
                    print(
                        f"Warning: Invalid JSON in policy file {path}: {e}. Skipping."
                    )
                except Exception as e:
                    print(
                        f"An unexpected error occurred processing policy from {path}: {e}. Skipping."
                    )

        if custom_policy_text:
            current_source = "custom policy text string"
            print(
                f"Attempting to add policy from custom text to role {role.node.id}..."
            )
            try:
                # *** FIX: Parse the JSON string into a Python dictionary ***
                policy_document = json.loads(custom_policy_text)
                print(f"Processing policy from {current_source}...")
                add_statement_to_policy(role, policy_document)
            except json.JSONDecodeError as e:
                print(f"Warning: Invalid JSON in custom_policy_text: {e}. Skipping.")
            except Exception as e:
                print(
                    f"An unexpected error occurred processing policy from custom_policy_text: {e}. Skipping."
                )

        # You might want a final success message, but individual processing messages are also good.
        print(f"Finished processing custom policies for role {role.node.id}.")

    except Exception as e:
        # NOTE(review): failures here are only printed, never re-raised, so a
        # failed policy attachment will not fail the synth — confirm this
        # best-effort behaviour is intended.
        print(
            f"An unhandled error occurred during policy addition for {current_source}: {e}"
        )

    return role
202
+
203
+
204
+ # Import the S3 Bucket class if you intend to return a CDK object later
205
+ # from aws_cdk import aws_s3 as s3
206
+
207
+
208
def check_s3_bucket_exists(
    bucket_name: str,
):  # Return type hint depends on what you return
    """
    Check whether an S3 bucket with the given name exists and is accessible.

    Args:
        bucket_name: The name of the S3 bucket to check.

    Returns:
        A tuple of (exists: bool, bucket name or None). A 404 means the
        bucket does not exist. A 403 is also reported as not existing,
        because head_bucket has been observed returning 403 for buckets
        that do not exist at all, not just for permission problems.
    """
    s3_client = boto3.client("s3")
    try:
        # head_bucket confirms both existence and that we can reach it.
        s3_client.head_bucket(Bucket=bucket_name)
    except ClientError as e:
        status_code = e.response["Error"]["Code"]
        if status_code == "404":
            print(f"Bucket '{bucket_name}' does not exist.")
            return False, None
        if status_code == "403":
            print(
                f"Bucket '{bucket_name}' returned 403, which indicates it may exist but is not accessible due to permissions, or that it doesn't exist. Returning False for existence just in case."
            )
            return False, bucket_name  # It exists, even if not accessible
        # Any other client error is unexpected — surface it to the caller.
        print(
            f"An unexpected AWS ClientError occurred checking bucket '{bucket_name}': {e}"
        )
        raise  # Re-raise the original exception
    except Exception as e:
        print(
            f"An unexpected non-ClientError occurred checking bucket '{bucket_name}': {e}"
        )
        raise  # Re-raise the original exception
    else:
        print(f"Bucket '{bucket_name}' exists and is accessible.")
        return True, bucket_name  # Return True and the bucket name
264
+
265
+ # Example usage in your check_resources.py:
266
+ # exists, bucket_name_if_exists = check_s3_bucket_exists(log_bucket_name)
267
+ # context_data[f"exists:{log_bucket_name}"] = exists
268
+ # # You don't necessarily need to store the name in context if using from_bucket_name
269
+
270
+
271
+ # Delete an S3 bucket
272
def delete_s3_bucket(bucket_name: str):
    """
    Empty an S3 bucket (all object versions and delete markers) and delete it.

    Args:
        bucket_name: Name of the bucket to remove.

    Returns:
        {"Status": "SUCCESS"} on success, otherwise
        {"Status": "FAILED", "Reason": <error text>}.
    """
    s3 = boto3.client("s3")

    try:
        # Use a paginator: list_object_versions returns at most 1000 entries
        # per call, so a single un-paginated call would miss objects in
        # larger buckets and the final delete_bucket would then fail.
        paginator = s3.get_paginator("list_object_versions")
        for page in paginator.paginate(Bucket=bucket_name):
            versions = page.get("Versions", []) + page.get("DeleteMarkers", [])
            for version in versions:
                s3.delete_object(
                    Bucket=bucket_name,
                    Key=version["Key"],
                    VersionId=version["VersionId"],
                )

        # Delete the (now empty) bucket
        s3.delete_bucket(Bucket=bucket_name)
        return {"Status": "SUCCESS"}
    except Exception as e:
        return {"Status": "FAILED", "Reason": str(e)}
289
+
290
+
291
+ # Function to get subnet ID from subnet name
292
def get_subnet_id(vpc, ec2_client, subnet_name: str):
    """
    Look up a subnet ID by its Name tag within the given VPC.

    Args:
        vpc: An object exposing a ``vpc_id`` attribute (e.g. a CDK VPC).
            (The previous ``str`` annotations on vpc/ec2_client were wrong.)
        ec2_client: A boto3 EC2 client.
        subnet_name: Value of the subnet's ``Name`` tag to match.

    Returns:
        The matching subnet ID string, or None if no subnet matches.
    """
    response = ec2_client.describe_subnets(
        Filters=[{"Name": "vpc-id", "Values": [vpc.vpc_id]}]
    )

    for subnet in response["Subnets"]:
        # Subnets with no tags omit the "Tags" key entirely, so use .get()
        # to avoid the KeyError the previous subnet["Tags"] access raised.
        for tag in subnet.get("Tags") or []:
            if tag["Key"] == "Name" and tag["Value"] == subnet_name:
                return subnet["SubnetId"]

    return None
305
+
306
+
307
def check_ecr_repo_exists(repo_name: str) -> tuple[bool, dict]:
    """
    Check whether an ECR repository with the given name exists.

    Args:
        repo_name: The name of the ECR repository to check.

    Returns:
        A (found, repository-description) tuple; the dict is empty when the
        repository is missing or an unexpected non-client error occurred.
    """
    ecr_client = boto3.client("ecr")
    try:
        print("ecr repo_name to check:", repo_name)
        described = ecr_client.describe_repositories(repositoryNames=[repo_name])
        repositories = described["repositories"]
        # A successful describe call with a non-empty list means it exists.
        return len(repositories) > 0, repositories[0]
    except ClientError as e:
        # Only "repository not found" is an expected outcome here.
        if e.response["Error"]["Code"] == "RepositoryNotFoundException":
            return False, {}
        # Anything else (e.g. permissions) should surface to the caller.
        raise
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return False, {}
334
+
335
+
336
def check_codebuild_project_exists(
    project_name: str,
):  # Adjust return type hint as needed
    """
    Checks if a CodeBuild project with the given name exists.

    Args:
        project_name: The name of the CodeBuild project to check.

    Returns:
        A tuple:
        - The first element is True if the project exists, False otherwise.
        - The second element is the project's ARN (a string) if found,
          None otherwise.
    """
    codebuild_client = boto3.client("codebuild")
    try:
        # Use batch_get_projects with a list containing the single project name
        response = codebuild_client.batch_get_projects(names=[project_name])

        # The response for batch_get_projects includes 'projects' (found)
        # and 'projectsNotFound' (not found).
        if response["projects"]:
            # If the project is found in the 'projects' list
            print(f"CodeBuild project '{project_name}' found.")
            return (
                True,
                response["projects"][0]["arn"],
            )  # Return True and the project ARN (not the full details dict)
        elif (
            response["projectsNotFound"]
            and project_name in response["projectsNotFound"]
        ):
            # If the project name is explicitly in the 'projectsNotFound' list
            print(f"CodeBuild project '{project_name}' not found.")
            return False, None
        else:
            # This case is less expected for a single name lookup,
            # but could happen if there's an internal issue or the response
            # structure is slightly different than expected for an error.
            # It's safer to assume it wasn't found if not in 'projects'.
            print(
                f"CodeBuild project '{project_name}' not found (not in 'projects' list)."
            )
            return False, None

    except ClientError as e:
        # Catch specific ClientErrors. batch_get_projects might not throw
        # 'InvalidInputException' for a non-existent project name if the
        # name format is valid. It typically just lists it in projectsNotFound.
        # However, other ClientErrors are possible (e.g., permissions).
        print(
            f"An AWS ClientError occurred checking CodeBuild project '{project_name}': {e}"
        )
        # Decide how to handle other ClientErrors - raising might be safer
        raise  # Re-raise the original exception
    except Exception as e:
        print(
            f"An unexpected non-ClientError occurred checking CodeBuild project '{project_name}': {e}"
        )
        # Decide how to handle other errors
        raise  # Re-raise the original exception
398
+
399
+
400
def get_vpc_id_by_name(vpc_name: str) -> Tuple[Optional[str], list]:
    """
    Finds a VPC ID by its 'Name' tag, and lists NAT Gateways in that VPC.

    Previously this returned a (vpc_id, nat_gateways) tuple on success but a
    bare None when the VPC was missing, which crashed any caller that
    unpacked the result. Both paths now return a two-tuple.

    Args:
        vpc_name: Value of the VPC's 'Name' tag.

    Returns:
        (vpc_id, nat_gateways): vpc_id is None and nat_gateways is an empty
        list when no VPC with that Name tag exists.
    """
    ec2_client = boto3.client("ec2")
    try:
        response = ec2_client.describe_vpcs(
            Filters=[{"Name": "tag:Name", "Values": [vpc_name]}]
        )
    except Exception as e:
        print(f"An unexpected error occurred finding VPC '{vpc_name}': {e}")
        raise

    if not (response and response["Vpcs"]):
        print(f"VPC '{vpc_name}' not found.")
        return None, []

    vpc_id = response["Vpcs"][0]["VpcId"]
    print(f"VPC '{vpc_name}' found with ID: {vpc_id}")

    # Look for NAT Gateways in this VPC. This is best-effort: a failure here
    # should not prevent the caller from using the VPC ID.
    nat_gateways = []
    try:
        nat_response = ec2_client.describe_nat_gateways(
            Filters=[
                {"Name": "vpc-id", "Values": [vpc_id]},
                # Optional: Add a tag filter if you consistently tag your NATs
                # {'Name': 'tag:Name', 'Values': [f"{prefix}-nat-gateway"]}
            ]
        )
        nat_gateways = nat_response.get("NatGateways", [])
    except Exception as e:
        print(f"Warning: Could not describe NAT Gateways in VPC '{vpc_id}': {e}")

    return vpc_id, nat_gateways
442
+
443
+
444
# --- Helper to fetch all existing subnets in a VPC once ---
def _get_existing_subnets_in_vpc(vpc_id: str) -> Dict[str, Any]:
    """
    Fetch every subnet in a VPC and index the results for later validation.

    Returns a dictionary with:
      - 'by_name': {name_tag: {'id', 'cidr', 'name'}}
      - 'by_id': {subnet_id: {'id', 'cidr', 'name'}}
      - 'cidr_networks': list of ipaddress network objects for overlap checks
    """
    ec2_client = boto3.client("ec2")
    indexed: Dict[str, Any] = {
        "by_name": {},
        "by_id": {},
        "cidr_networks": [],
    }
    try:
        response = ec2_client.describe_subnets(
            Filters=[{"Name": "vpc-id", "Values": [vpc_id]}]
        )
        records = response.get("Subnets", [])
        for record in records:
            subnet_id = record["SubnetId"]
            cidr_block = record.get("CidrBlock")
            # The 'Name' tag is what callers use for lookup-by-name.
            name_tag = next(
                (t["Value"] for t in record.get("Tags", []) if t["Key"] == "Name"),
                None,
            )

            entry = {"id": subnet_id, "cidr": cidr_block, "name": name_tag}
            if name_tag:
                indexed["by_name"][name_tag] = entry
            indexed["by_id"][subnet_id] = entry

            if cidr_block:
                try:
                    indexed["cidr_networks"].append(
                        ipaddress.ip_network(cidr_block, strict=False)
                    )
                except ValueError:
                    print(
                        f"Warning: Existing subnet {subnet_id} has an invalid CIDR: {cidr_block}. Skipping for overlap check."
                    )

        print(
            f"Fetched {len(response.get('Subnets', []))} existing subnets from VPC '{vpc_id}'."
        )
    except Exception as e:
        print(
            f"Error describing existing subnets in VPC '{vpc_id}': {e}. Cannot perform full validation."
        )
        raise  # This lookup is a prerequisite for validation; propagate.

    return indexed
496
+
497
+
498
# --- Modified validate_subnet_creation_parameters to take pre-fetched data ---
def validate_subnet_creation_parameters(
    vpc_id: str,
    proposed_subnets_data: List[
        Dict[str, str]
    ],  # e.g., [{'name': 'my-public-subnet', 'cidr': '10.0.0.0/24', 'az': 'us-east-1a'}]
    existing_aws_subnets_data: Dict[
        str, Any
    ],  # Pre-fetched data from _get_existing_subnets_in_vpc
) -> None:
    """
    Validate proposed subnet names and CIDR blocks before synthesis.

    Each proposed subnet is checked against the others in the batch and
    against the pre-fetched AWS data for duplicate names and overlapping
    CIDR ranges. A name that already exists in AWS is only reported, not
    fatal; CIDR overlaps and malformed input raise.

    Args:
        vpc_id: The VPC ID (used in log/error messages only).
        proposed_subnets_data: Dicts with 'name', 'cidr' and 'az' keys.
        existing_aws_subnets_data: Output of _get_existing_subnets_in_vpc.

    Raises:
        ValueError: On incomplete entries, duplicate proposed names, invalid
            CIDR syntax, or any CIDR overlap (in-batch or with AWS).
    """
    if not proposed_subnets_data:
        print("No proposed subnet data provided for validation. Skipping.")
        return

    print(
        f"--- Starting pre-synth validation for VPC '{vpc_id}' with proposed subnets ---"
    )

    print("Existing subnet data:", pd.DataFrame(existing_aws_subnets_data["by_name"]))

    known_names = set(existing_aws_subnets_data["by_name"].keys())
    known_networks = existing_aws_subnets_data["cidr_networks"]

    # Track what we've accepted so far, for in-batch consistency checks.
    accepted_names: set[str] = set()
    accepted_networks: List[ipaddress.IPv4Network] = []

    for idx, spec in enumerate(proposed_subnets_data):
        name = spec.get("name")
        cidr = spec.get("cidr")
        zone = spec.get("az")

        if not all([name, cidr, zone]):
            raise ValueError(
                f"Proposed subnet at index {idx} is incomplete. Requires 'name', 'cidr', and 'az'."
            )

        # 1. Duplicate name within this batch is fatal.
        if name in accepted_names:
            raise ValueError(
                f"Proposed subnet name '{name}' is duplicated within the input list."
            )
        accepted_names.add(name)

        # 2. Name collision with AWS is informational only.
        if name in known_names:
            print(f"Proposed subnet name '{name}' already exists in VPC '{vpc_id}'.")

        # Parse the candidate CIDR.
        try:
            candidate = ipaddress.ip_network(cidr, strict=False)
        except ValueError as e:
            raise ValueError(
                f"Invalid CIDR format '{cidr}' for proposed subnet '{name}': {e}"
            )

        # 3. Overlap with another subnet proposed in the same batch.
        for prior in accepted_networks:
            if candidate.overlaps(prior):
                raise ValueError(
                    f"Proposed CIDR '{cidr}' for subnet '{name}' "
                    f"overlaps with another proposed CIDR '{str(prior)}' "
                    f"within the same batch."
                )

        # 4. Overlap with a subnet that already exists in AWS.
        for aws_net in known_networks:
            if candidate.overlaps(aws_net):
                raise ValueError(
                    f"Proposed CIDR '{cidr}' for subnet '{name}' "
                    f"overlaps with an existing AWS subnet CIDR '{str(aws_net)}' "
                    f"in VPC '{vpc_id}'."
                )

        accepted_networks.append(candidate)
        print(
            f"Validation successful for proposed subnet '{name}' with CIDR '{cidr}'."
        )

    print(
        f"--- All proposed subnets passed pre-synth validation checks for VPC '{vpc_id}'. ---"
    )
599
+
600
+
601
# --- Modified check_subnet_exists_by_name (Uses pre-fetched data) ---
def check_subnet_exists_by_name(
    subnet_name: str, existing_aws_subnets_data: Dict[str, Any]
) -> Tuple[bool, Optional[str]]:
    """
    Look up a subnet by its 'Name' tag in pre-fetched subnet data.

    Args:
        subnet_name: The 'Name' tag value of the subnet to check.
        existing_aws_subnets_data: Output of _get_existing_subnets_in_vpc.

    Returns:
        (True, subnet_id) when the name is present, otherwise (False, None).
    """
    match = existing_aws_subnets_data["by_name"].get(subnet_name)
    if match:
        print(f"Subnet '{subnet_name}' found with ID: {match['id']}")
        return True, match["id"]
    print(f"Subnet '{subnet_name}' not found.")
    return False, None
625
+
626
+
627
def create_nat_gateway(
    scope: Construct,
    public_subnet_for_nat: ec2.ISubnet,  # Expects a proper ISubnet
    nat_gateway_name: str,
    nat_gateway_id_context_key: str,
) -> str:
    """
    Creates a single NAT Gateway (plus its Elastic IP) in the specified
    public subnet and exports its ID via a CfnOutput.

    It does not handle lookup from context; the calling stack should do that.

    Args:
        scope: Construct scope (typically the stack).
        public_subnet_for_nat: Public subnet to place the NAT Gateway in.
        nat_gateway_name: 'Name' tag value; also used to derive the
            CloudFormation logical ID.
        nat_gateway_id_context_key: cdk.context.json key referenced in the
            output description (documentation only — not read here).

    Returns:
        The CloudFormation Ref (token) of the NAT Gateway ID.
    """
    print(
        f"Defining a new NAT Gateway '{nat_gateway_name}' in subnet '{public_subnet_for_nat.subnet_id}'."
    )

    # Create an Elastic IP for the NAT Gateway.
    # NAT_GATEWAY_EIP_NAME is a module-level constant defined elsewhere in
    # this file, so only one such EIP can exist per stack.
    eip = ec2.CfnEIP(
        scope,
        NAT_GATEWAY_EIP_NAME,
        tags=[CfnTag(key="Name", value=NAT_GATEWAY_EIP_NAME)],
    )

    # Create the NAT Gateway
    nat_gateway_logical_id = nat_gateway_name.replace("-", "") + "NatGateway"
    nat_gateway = ec2.CfnNatGateway(
        scope,
        nat_gateway_logical_id,
        subnet_id=public_subnet_for_nat.subnet_id,  # Associate with the public subnet
        allocation_id=eip.attr_allocation_id,  # Associate with the EIP
        tags=[CfnTag(key="Name", value=nat_gateway_name)],
    )
    # The NAT GW depends on the EIP. The dependency on the subnet is implicit via subnet_id.
    nat_gateway.add_dependency(eip)

    # *** CRUCIAL: Use CfnOutput to export the ID after deployment ***
    # This is how you will get the ID to put into cdk.context.json
    CfnOutput(
        scope,
        "SingleNatGatewayIdOutput",
        value=nat_gateway.ref,
        description=f"Physical ID of the Single NAT Gateway. Add this to cdk.context.json under the key '{nat_gateway_id_context_key}'.",
        export_name=f"{scope.stack_name}-NatGatewayId",  # Make export name unique
    )

    print(
        f"CDK: Defined new NAT Gateway '{nat_gateway.ref}'. Its physical ID will be available in the stack outputs after deployment."
    )
    # Return the tokenised reference for use within this synthesis
    return nat_gateway.ref
676
+
677
+
678
def create_subnets(
    scope: Construct,
    vpc: ec2.IVpc,
    prefix: str,
    subnet_names: List[str],
    cidr_blocks: List[str],
    availability_zones: List[str],
    is_public: bool,
    internet_gateway_id: Optional[str] = None,
    single_nat_gateway_id: Optional[str] = None,
) -> Tuple[List[ec2.CfnSubnet], List[ec2.CfnRouteTable]]:
    """
    Creates subnets using L2 constructs but returns the underlying L1 Cfn
    objects for backward compatibility.

    Args:
        scope: Construct scope (typically the stack).
        vpc: The VPC to create the subnets in.
        prefix: Prefix used when building CloudFormation logical IDs.
        subnet_names: 'Name' tag values, one per subnet.
        cidr_blocks: CIDR blocks, parallel to subnet_names.
        availability_zones: AZs, parallel to subnet_names.
        is_public: True for public subnets (IGW route), False for private
            (NAT route).
        internet_gateway_id: Required when is_public is True.
        single_nat_gateway_id: Required when is_public is False.

    Returns:
        (subnets, route_tables) lists, parallel to subnet_names.

    Raises:
        ValueError: On mismatched input lists or a missing gateway ID.
    """
    # --- Validations ---
    if not (len(subnet_names) == len(cidr_blocks) == len(availability_zones) > 0):
        raise ValueError(
            "Subnet names, CIDR blocks, and Availability Zones lists must be non-empty and match in length."
        )
    if is_public and not internet_gateway_id:
        raise ValueError("internet_gateway_id must be provided for public subnets.")
    if not is_public and not single_nat_gateway_id:
        raise ValueError(
            "single_nat_gateway_id must be provided for private subnets when using a single NAT Gateway."
        )

    # --- We will populate these lists with the L1 objects to return ---
    created_subnets: List[ec2.CfnSubnet] = []
    created_route_tables: List[ec2.CfnRouteTable] = []

    subnet_type_tag = "public" if is_public else "private"

    for i, subnet_name in enumerate(subnet_names):
        logical_id = f"{prefix}{subnet_type_tag.capitalize()}Subnet{i+1}"

        # 1. Create the L2 Subnet (this is the easy part)
        subnet = ec2.Subnet(
            scope,
            logical_id,
            vpc_id=vpc.vpc_id,
            cidr_block=cidr_blocks[i],
            availability_zone=availability_zones[i],
            map_public_ip_on_launch=is_public,
        )
        Tags.of(subnet).add("Name", subnet_name)
        Tags.of(subnet).add("Type", subnet_type_tag)

        if is_public:
            # The subnet's route_table is automatically created by the L2 Subnet construct
            try:
                subnet.add_route(
                    "DefaultInternetRoute",  # A logical ID for the CfnRoute resource
                    router_id=internet_gateway_id,
                    router_type=ec2.RouterType.GATEWAY,
                    # destination_cidr_block="0.0.0.0/0" is the default for this method
                )
            except Exception as e:
                print("Could not create IGW route for public subnet due to:", e)
            print(f"CDK: Defined public L2 subnet '{subnet_name}' and added IGW route.")
        else:
            try:
                # Using .add_route() for private subnets as well for consistency
                subnet.add_route(
                    "DefaultNatRoute",  # A logical ID for the CfnRoute resource
                    router_id=single_nat_gateway_id,
                    router_type=ec2.RouterType.NAT_GATEWAY,
                )
            except Exception as e:
                # Fixed copy-paste error: this branch handles PRIVATE subnets,
                # but the old message said "public subnet".
                print("Could not create NAT gateway route for private subnet due to:", e)
            print(
                f"CDK: Defined private L2 subnet '{subnet_name}' and added NAT GW route."
            )

        route_table = subnet.route_table

        created_subnets.append(subnet)
        created_route_tables.append(route_table)

    return created_subnets, created_route_tables
758
+
759
+
760
def ingress_rule_exists(security_group, peer, port):
    """
    Check whether *security_group* already has a rule matching *peer* (and
    *port*, when given).

    NOTE(review): iterates `security_group.connections.security_groups` and
    compares `rule.peer` / `rule.connection` — presumably CDK constructs;
    confirm these attributes implement the intended equality semantics.

    Args:
        security_group: Security group object to inspect.
        peer: Peer to compare against each rule's `peer` attribute.
        port: Optional port; when falsy, only the peer is compared.

    Returns:
        True if a matching rule is found, False otherwise.
    """
    for rule in security_group.connections.security_groups:
        if port:
            # Both peer and port must match when a port is specified.
            if rule.peer == peer and rule.connection == port:
                return True
        else:
            # No port given: a peer match alone is sufficient.
            if rule.peer == peer:
                return True
    return False
769
+
770
+
771
def check_for_existing_user_pool(user_pool_name: str):
    """
    Checks if a Cognito user pool with the given name exists.

    Paginates through list_user_pools (60 per page, the API maximum), so
    pools beyond the first page are no longer missed.

    Args:
        user_pool_name: The user pool name to look for.

    Returns:
        A tuple:
        - (True, pool_id, pool_summary) when a pool with that name is found.
        - (False, "", "") otherwise.
    """
    cognito_client = boto3.client("cognito-idp")

    kwargs = {"MaxResults": 60}  # MaxResults up to 60
    while True:
        list_pools_response = cognito_client.list_user_pools(**kwargs)

        for pool in list_pools_response.get("UserPools", []):
            if pool.get("Name") == user_pool_name:
                existing_user_pool_id = pool["Id"]
                print(
                    f"Found existing user pool by name '{user_pool_name}' with ID: {existing_user_pool_id}"
                )
                return True, existing_user_pool_id, pool

        # Keep paging until the API stops returning a token.
        next_token = list_pools_response.get("NextToken")
        if not next_token:
            break
        kwargs["NextToken"] = next_token

    return False, "", ""
794
+
795
+
796
def check_for_existing_user_pool_client(user_pool_id: str, user_pool_client_name: str):
    """
    Checks if a Cognito User Pool Client with the given name exists in the
    specified User Pool.

    Fixes two defects in the previous version:
    - The first list_user_pool_clients call passed the placeholder token
      NextToken="string", which is not a valid pagination token. The first
      request now omits NextToken entirely.
    - A generic exception during the API call fell through to reading an
      undefined `response` (NameError); it now returns cleanly.

    Args:
        user_pool_id: The ID of the Cognito User Pool.
        user_pool_client_name: The name of the User Pool Client to check for.

    Returns:
        A tuple:
        - True, client_id, client_details if the client exists.
        - False, "", {} otherwise.
    """
    cognito_client = boto3.client("cognito-idp")

    kwargs = {"UserPoolId": user_pool_id, "MaxResults": 60}
    while True:
        try:
            response = cognito_client.list_user_pool_clients(**kwargs)
        except cognito_client.exceptions.ResourceNotFoundException:
            print(f"Error: User pool with ID '{user_pool_id}' not found.")
            return False, "", {}

        except cognito_client.exceptions.InvalidParameterException:
            print(f"Error: No app clients for '{user_pool_id}' found.")
            return False, "", {}

        except Exception as e:
            print("Could not check User Pool clients due to:", e)
            return False, "", {}

        for client in response.get("UserPoolClients", []):
            if client.get("ClientName") == user_pool_client_name:
                print(
                    f"Found existing user pool client '{user_pool_client_name}' with ID: {client['ClientId']}"
                )
                return True, client["ClientId"], client

        next_token = response.get("NextToken")
        if not next_token:
            break
        kwargs["NextToken"] = next_token

    return False, "", {}
840
+
841
+
842
def check_for_secret(secret_name: str, secret_value: dict = None):
    """
    Checks if a Secrets Manager secret with the given name exists.

    Note: despite its previous docstring, this function only *checks* for
    the secret — it never creates one. The `secret_value` parameter is
    retained for backward compatibility but is unused (its old default was
    also a string, not a dict, and the argument was immediately clobbered
    by the API response).

    Args:
        secret_name: The name of the Secrets Manager secret.
        secret_value: Unused; kept so existing callers do not break.

    Returns:
        A tuple:
        - (True, get_secret_value response) if the secret exists.
        - (False, {}) if it does not exist or an error occurs.
    """
    secretsmanager_client = boto3.client("secretsmanager")

    try:
        # If the secret doesn't exist, a ResourceNotFoundException is raised.
        existing_value = secretsmanager_client.get_secret_value(SecretId=secret_name)
        print("Secret already exists.")
        return True, existing_value
    except secretsmanager_client.exceptions.ResourceNotFoundException:
        print("Secret not found")
        return False, {}
    except Exception as e:
        # Handle other potential exceptions during the get operation
        print(f"Error checking for secret: {e}")
        return False, {}
868
+
869
+
870
def check_alb_exists(
    load_balancer_name: str, region_name: str = None
) -> tuple[bool, dict]:
    """
    Check whether an Application Load Balancer (ALB) with the given name
    exists.

    Args:
        load_balancer_name: The name of the ALB to check.
        region_name: AWS region to query; the default session region is
            used when None.

    Returns:
        A tuple of (exists, load_balancer_description). The description is
        the first entry of the describe_load_balancers response when found,
        otherwise an empty dict.
    """
    client_kwargs = {"region_name": region_name} if region_name else {}
    elbv2_client = boto3.client("elbv2", **client_kwargs)
    try:
        response = elbv2_client.describe_load_balancers(Names=[load_balancer_name])
        matches = response["LoadBalancers"]
        return (True, matches[0]) if matches else (False, {})
    except ClientError as err:
        # This error code means the named ALB does not exist.
        if err.response["Error"]["Code"] == "LoadBalancerNotFound":
            return False, {}
        raise
    except Exception as err:
        print(f"An unexpected error occurred: {err}")
        return False, {}
911
+
912
+
913
def check_fargate_task_definition_exists(
    task_definition_name: str, region_name: str = None
) -> tuple[bool, dict]:
    """
    Checks if a Fargate task definition with the given name exists.

    Args:
        task_definition_name: The name or ARN of the task definition to check.
        region_name: The AWS region to check in. If None, uses the default
                     session region.

    Returns:
        A tuple:
        - (True, task_definition_dict) if the task definition exists.
        - (False, {}) if it does not exist or an unexpected error occurs.
    """
    if region_name:
        ecs_client = boto3.client("ecs", region_name=region_name)
    else:
        ecs_client = boto3.client("ecs")
    try:
        response = ecs_client.describe_task_definition(
            taskDefinition=task_definition_name
        )
        # If describe_task_definition succeeds, the definition exists.
        return True, response["taskDefinition"]
    except ClientError as e:
        # A missing task definition surfaces as a ClientException. botocore
        # places the message at e.response["Error"]["Message"]; the previous
        # code read e.response["Message"], which raised a KeyError and
        # masked the intended not-found handling.
        error = e.response.get("Error", {})
        message = error.get("Message", "")
        if (
            error.get("Code") == "ClientException"
            and "Task definition" in message
            and "does not exist" in message
        ):
            return False, {}
        # Re-raise other client errors.
        raise
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return False, {}
956
+
957
+
958
def check_ecs_service_exists(
    cluster_name: str, service_name: str, region_name: str = None
) -> tuple[bool, dict]:
    """
    Check whether an ECS service with the given name exists in the
    specified cluster.

    Args:
        cluster_name: The name or ARN of the ECS cluster.
        service_name: The name of the ECS service to check.
        region_name: AWS region to query; the default session region is
            used when None.

    Returns:
        A tuple of (exists, service_description). The description is the
        first entry of the describe_services response when found, otherwise
        an empty dict.
    """
    client_kwargs = {"region_name": region_name} if region_name else {}
    ecs_client = boto3.client("ecs", **client_kwargs)
    try:
        described = ecs_client.describe_services(
            cluster=cluster_name, services=[service_name]
        )
        matches = described["services"]
        return (True, matches[0]) if matches else (False, {})
    except ClientError as err:
        # A missing cluster or a missing service both mean "does not exist".
        if err.response["Error"]["Code"] in (
            "ClusterNotFoundException",
            "ServiceNotFoundException",
        ):
            return False, {}
        raise
    except Exception as err:
        print(f"An unexpected error occurred: {err}")
        return False, {}
1003
+
1004
+
1005
def check_cloudfront_distribution_exists(
    distribution_name: str, region_name: str = None
) -> tuple[bool, dict | None]:
    """
    Checks if a CloudFront distribution with the given alias (CNAME) exists.

    CloudFront cannot filter by name server-side, so this lists the
    distributions and matches *distribution_name* against every alias of
    each distribution. The ListDistributions response exposes aliases under
    the "Aliases" key — the previous code read a non-existent "AliasSet"
    key (KeyError) and compared only the first alias.

    Note: only the first page of list_distributions is examined; accounts
    with many distributions would need a paginator.

    Args:
        distribution_name: The alias (CNAME) of the distribution to check.
        region_name: The AWS region to check in. CloudFront is a global
                     service (effectively us-east-1); included for
                     completeness.

    Returns:
        A tuple:
        - (True, distribution_summary) when a matching alias is found.
        - (False, None) otherwise.
    """
    if region_name:
        cf_client = boto3.client("cloudfront", region_name=region_name)
    else:
        cf_client = boto3.client("cloudfront")
    try:
        response = cf_client.list_distributions()
        distribution_list = response.get("DistributionList", {})
        for distribution in distribution_list.get("Items", []):
            aliases = distribution.get("Aliases", {}).get("Items", [])
            if distribution_name in aliases:
                return True, distribution
        return False, None
    except ClientError as e:
        # If the error indicates the Distribution doesn't exist, return False
        if e.response["Error"]["Code"] == "NoSuchDistribution":
            return False, None
        # Re-raise other exceptions
        raise
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return False, None
1052
+
1053
+
1054
def create_web_acl_with_common_rules(
    scope: Construct, web_acl_name: str, waf_scope: str = "CLOUDFRONT"
):
    """
    Use CDK to create a web ACL based on an AWS common rule set with overrides.

    Builds three AWS managed rule groups (common rule set, known bad inputs,
    IP reputation list) with sequential priorities, adds an IP-based rate
    limit rule (1000 requests), creates the CfnWebACL, and exports its ARN
    via a CfnOutput named "WebACLArn".

    This function expects a 'scope' argument, typically 'self' from your
    stack, as CfnWebACL requires a construct scope.

    Args:
        scope: Construct scope (typically the stack).
        web_acl_name: Name for the Web ACL resource.
        waf_scope: "CLOUDFRONT" (default) or "REGIONAL".

    Returns:
        The created wafv2.CfnWebACL construct.
    """

    # Create full list of rules
    rules = []
    aws_ruleset_names = [
        "AWSManagedRulesCommonRuleSet",
        "AWSManagedRulesKnownBadInputsRuleSet",
        "AWSManagedRulesAmazonIpReputationList",
    ]

    # Use a separate counter to assign unique priorities sequentially
    priority_counter = 1

    for aws_rule_name in aws_ruleset_names:
        current_rule_action_overrides = None

        # All managed rule groups need an override_action.
        # 'none' means use the managed rule group's default action.
        current_override_action = wafv2.CfnWebACL.OverrideActionProperty(none={})

        current_priority = priority_counter
        priority_counter += 1

        if aws_rule_name == "AWSManagedRulesCommonRuleSet":
            # Allow large request bodies through this rule set only.
            current_rule_action_overrides = [
                wafv2.CfnWebACL.RuleActionOverrideProperty(
                    name="SizeRestrictions_BODY",
                    action_to_use=wafv2.CfnWebACL.RuleActionProperty(allow={}),
                )
            ]
            # No need to set current_override_action here, it's already set above.
            # If you wanted this specific rule to have a *fixed* priority, you'd handle it differently
            # For now, it will get priority 1 from the counter.

        rule_property = wafv2.CfnWebACL.RuleProperty(
            name=aws_rule_name,
            priority=current_priority,
            statement=wafv2.CfnWebACL.StatementProperty(
                managed_rule_group_statement=wafv2.CfnWebACL.ManagedRuleGroupStatementProperty(
                    vendor_name="AWS",
                    name=aws_rule_name,
                    rule_action_overrides=current_rule_action_overrides,
                )
            ),
            visibility_config=wafv2.CfnWebACL.VisibilityConfigProperty(
                cloud_watch_metrics_enabled=True,
                metric_name=aws_rule_name,
                sampled_requests_enabled=True,
            ),
            override_action=current_override_action,  # THIS IS THE CRUCIAL PART FOR ALL MANAGED RULES
        )

        rules.append(rule_property)

    # Add the rate limit rule
    rate_limit_priority = priority_counter  # Use the next available priority
    rules.append(
        wafv2.CfnWebACL.RuleProperty(
            name="RateLimitRule",
            priority=rate_limit_priority,
            statement=wafv2.CfnWebACL.StatementProperty(
                rate_based_statement=wafv2.CfnWebACL.RateBasedStatementProperty(
                    limit=1000, aggregate_key_type="IP"
                )
            ),
            visibility_config=wafv2.CfnWebACL.VisibilityConfigProperty(
                cloud_watch_metrics_enabled=True,
                metric_name="RateLimitRule",
                sampled_requests_enabled=True,
            ),
            action=wafv2.CfnWebACL.RuleActionProperty(block={}),
        )
    )

    web_acl = wafv2.CfnWebACL(
        scope,
        "WebACL",
        name=web_acl_name,
        default_action=wafv2.CfnWebACL.DefaultActionProperty(allow={}),
        scope=waf_scope,
        visibility_config=wafv2.CfnWebACL.VisibilityConfigProperty(
            cloud_watch_metrics_enabled=True,
            metric_name="webACL",
            sampled_requests_enabled=True,
        ),
        rules=rules,
    )

    # Export the ARN so other stacks / operators can reference the ACL.
    CfnOutput(scope, "WebACLArn", value=web_acl.attr_arn)

    return web_acl
1152
+
1153
+
1154
def check_web_acl_exists(
    web_acl_name: str, scope: str, region_name: str = None
) -> tuple[bool, dict]:
    """
    Checks if a Web ACL with the given name and scope exists.

    Fixed defect: the wafv2 client has no `describe_web_acl` method (the
    previous code would raise AttributeError on a match); the correct call
    is `get_web_acl`, which additionally requires the ACL's Id from the
    list response.

    Note: only the first page of list_web_acls is examined.

    Args:
        web_acl_name: The name of the Web ACL to check.
        scope: The scope of the Web ACL ('CLOUDFRONT' or 'REGIONAL').
        region_name: The AWS region to check in. Required for REGIONAL scope.
                     If None, uses the default session region. For CLOUDFRONT,
                     the region is forced to 'us-east-1'.

    Returns:
        A tuple:
        - (True, web_acl_dict) if the Web ACL exists.
        - (False, {}) otherwise.

    Raises:
        ValueError: On an invalid scope, or REGIONAL scope without a region.
    """
    if scope not in ["CLOUDFRONT", "REGIONAL"]:
        raise ValueError("Scope must be either 'CLOUDFRONT' or 'REGIONAL'")

    if scope == "REGIONAL" and not region_name:
        raise ValueError("Region name is required for REGIONAL scope")

    if scope == "CLOUDFRONT":
        region_name = "us-east-1"  # CloudFront scope requires us-east-1

    if region_name:
        waf_client = boto3.client("wafv2", region_name=region_name)
    else:
        waf_client = boto3.client("wafv2")
    try:
        response = waf_client.list_web_acls(Scope=scope)
        if "WebACLs" in response:
            for web_acl in response["WebACLs"]:
                if web_acl["Name"] == web_acl_name:
                    # Fetch the full Web ACL. wafv2 exposes get_web_acl
                    # (not describe_web_acl) and it needs Name, Scope AND Id.
                    describe_response = waf_client.get_web_acl(
                        Name=web_acl_name, Scope=scope, Id=web_acl["Id"]
                    )
                    return True, describe_response["WebACL"]
            return False, {}
        else:
            return False, {}
    except ClientError as e:
        # Check for the error code indicating the web ACL doesn't exist.
        if e.response["Error"]["Code"] == "ResourceNotFoundException":
            return False, {}
        else:
            # Re-raise other exceptions.
            raise
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return False, {}
1209
+
1210
+
1211
def add_alb_https_listener_with_cert(
    scope: Construct,
    logical_id: str,  # A unique ID for this listener construct
    alb: elb.ApplicationLoadBalancer,
    acm_certificate_arn: Optional[
        str
    ],  # Optional: If None, no HTTPS listener will be created
    default_target_group: elb.ITargetGroup,  # Mandatory: The target group to forward traffic to
    listener_port_https: int = 443,
    listener_open_to_internet: bool = False,  # Be cautious with True, ensure ALB security group restricts access
    # --- Cognito Authentication Parameters ---
    enable_cognito_auth: bool = False,
    cognito_user_pool: Optional[cognito.IUserPool] = None,
    cognito_user_pool_client: Optional[cognito.IUserPoolClient] = None,
    cognito_user_pool_domain: Optional[
        str
    ] = None,  # E.g., "my-app-domain" for "my-app-domain.auth.region.amazoncognito.com"
    cognito_auth_scope: Optional[
        str
    ] = "openid profile email",  # Default recommended scope
    cognito_auth_on_unauthenticated_request: elb.UnauthenticatedAction = elb.UnauthenticatedAction.AUTHENTICATE,
    stickiness_cookie_duration=None,
    # --- End Cognito Parameters ---
) -> Optional[elb.ApplicationListener]:
    """
    Conditionally adds an HTTPS listener to an ALB with an ACM certificate,
    and optionally enables Cognito User Pool authentication.

    Args:
        scope (Construct): The scope in which to define this construct (e.g., your CDK Stack).
        logical_id (str): A unique logical ID for the listener construct within the stack.
        alb (elb.ApplicationLoadBalancer): The Application Load Balancer to add the listener to.
        acm_certificate_arn (Optional[str]): The ARN of the ACM certificate to attach.
                                             If None, the HTTPS listener will NOT be created.
        default_target_group (elb.ITargetGroup): The default target group for the listener to forward traffic to.
                                                 This is mandatory for a functional listener.
        listener_port_https (int): The HTTPS port to listen on (default: 443).
        listener_open_to_internet (bool): Whether the listener should allow connections from all sources.
                                          If False (recommended), ensure your ALB's security group allows
                                          inbound traffic on this port from desired sources.
        enable_cognito_auth (bool): Set to True to enable Cognito User Pool authentication.
        cognito_user_pool (Optional[cognito.IUserPool]): The Cognito User Pool object. Required if enable_cognito_auth is True.
        cognito_user_pool_client (Optional[cognito.IUserPoolClient]): The Cognito User Pool App Client object. Required if enable_cognito_auth is True.
        cognito_user_pool_domain (Optional[str]): The domain prefix for your Cognito User Pool. Required if enable_cognito_auth is True.
        cognito_auth_scope (Optional[str]): The scope for the Cognito authentication.
        cognito_auth_on_unauthenticated_request (elb.UnauthenticatedAction): Action for unauthenticated requests.
                                                                             Defaults to AUTHENTICATE (redirect to login).
        stickiness_cookie_duration: Forwarded to AuthenticateCognitoAction's
            session_timeout (presumably an aws_cdk.Duration — confirm at call site).

    Returns:
        Optional[elb.ApplicationListener]: The created ApplicationListener if successful,
                                           None if no ACM certificate ARN was provided.

    Raises:
        ValueError: If enable_cognito_auth is True but any of the Cognito
                    user pool, client, or domain arguments is missing.
    """
    https_listener = None
    if acm_certificate_arn:
        certificates_list = [elb.ListenerCertificate.from_arn(acm_certificate_arn)]
        print(
            f"Attempting to add ALB HTTPS listener on port {listener_port_https} with ACM certificate: {acm_certificate_arn}"
        )

        # Determine the default action based on whether Cognito auth is enabled
        default_action = None
        if enable_cognito_auth is True:
            if not all(
                [cognito_user_pool, cognito_user_pool_client, cognito_user_pool_domain]
            ):
                raise ValueError(
                    "Cognito User Pool, Client, and Domain must be provided if enable_cognito_auth is True."
                )
            print(
                f"Enabling Cognito authentication with User Pool: {cognito_user_pool.user_pool_id}"
            )

            default_action = elb_act.AuthenticateCognitoAction(
                next=elb.ListenerAction.forward(
                    [default_target_group]
                ),  # After successful auth, forward to TG
                user_pool=cognito_user_pool,
                user_pool_client=cognito_user_pool_client,
                user_pool_domain=cognito_user_pool_domain,
                scope=cognito_auth_scope,
                on_unauthenticated_request=cognito_auth_on_unauthenticated_request,
                session_timeout=stickiness_cookie_duration,
                # Additional options you might want to configure:
                # session_cookie_name="AWSELBCookies"
            )
        else:
            default_action = elb.ListenerAction.forward([default_target_group])
            print("Cognito authentication is NOT enabled for this listener.")

        # Add the HTTPS listener
        https_listener = alb.add_listener(
            logical_id,
            port=listener_port_https,
            open=listener_open_to_internet,
            certificates=certificates_list,
            default_action=default_action,  # Use the determined default action
        )
        print(f"ALB HTTPS listener on port {listener_port_https} defined.")
    else:
        print("ACM_CERTIFICATE_ARN is not provided. Skipping HTTPS listener creation.")

    return https_listener
1313
+
1314
+
1315
def ensure_folder_exists(output_folder: str):
    """Make sure *output_folder* exists on disk, creating it when absent."""

    was_present = os.path.exists(output_folder)
    # exist_ok makes this a no-op when the directory is already there.
    os.makedirs(output_folder, exist_ok=True)
    if was_present:
        print(f"The {output_folder} folder already exists.")
    else:
        print(f"Created the {output_folder} folder.")
1324
+
1325
+
1326
def create_basic_config_env(
    out_dir: str = "config",
    S3_LOG_CONFIG_BUCKET_NAME=S3_LOG_CONFIG_BUCKET_NAME,
    S3_OUTPUT_BUCKET_NAME=S3_OUTPUT_BUCKET_NAME,
    ACCESS_LOG_DYNAMODB_TABLE_NAME=ACCESS_LOG_DYNAMODB_TABLE_NAME,
    FEEDBACK_LOG_DYNAMODB_TABLE_NAME=FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
    USAGE_LOG_DYNAMODB_TABLE_NAME=USAGE_LOG_DYNAMODB_TABLE_NAME,
):
    """
    Create a basic config.env file for the user to use with their newly deployed redaction app.

    Writes one KEY=value line per entry into <out_dir>/config.env and
    returns the dict of variables that were written.
    """
    variables = {
        "COGNITO_AUTH": "True",
        "RUN_AWS_FUNCTIONS": "True",
        "DISPLAY_FILE_NAMES_IN_LOGS": "False",
        "SESSION_OUTPUT_FOLDER": "True",
        "SAVE_LOGS_TO_DYNAMODB": "True",
        "SHOW_COSTS": "True",
        "SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS": "True",
        "LOAD_PREVIOUS_TEXTRACT_JOBS_S3": "True",
        "DOCUMENT_REDACTION_BUCKET": S3_LOG_CONFIG_BUCKET_NAME,
        "TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET": S3_OUTPUT_BUCKET_NAME,
        "ACCESS_LOG_DYNAMODB_TABLE_NAME": ACCESS_LOG_DYNAMODB_TABLE_NAME,
        "FEEDBACK_LOG_DYNAMODB_TABLE_NAME": FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
        "USAGE_LOG_DYNAMODB_TABLE_NAME": USAGE_LOG_DYNAMODB_TABLE_NAME,
    }

    # Make sure the destination directory and an (initially empty) env file
    # exist before set_key is called in a loop.
    ensure_folder_exists(out_dir + "/")
    env_file_path = os.path.abspath(os.path.join(out_dir, "config.env"))
    if not os.path.exists(env_file_path):
        open(env_file_path, "w").close()

    for key, value in variables.items():
        set_key(env_file_path, key, str(value), quote_mode="never")

    return variables
1367
+
1368
+
1369
def start_codebuild_build(PROJECT_NAME: str, AWS_REGION: str = AWS_REGION):
    """
    Start an existing Codebuild project build
    """

    # --- Initialize CodeBuild client ---
    client = boto3.client("codebuild", region_name=AWS_REGION)

    try:
        print(f"Attempting to start build for project: {PROJECT_NAME}")

        response = client.start_build(projectName=PROJECT_NAME)
        build = response["build"]
        build_id = build["id"]

        print(f"Successfully started build with ID: {build_id}")
        print(f"Build ARN: {build['arn']}")
        print("Build URL (approximate - construct based on region and ID):")
        # The console URL uses only the suffix of the build id (after ':').
        print(
            f"https://{AWS_REGION}.console.aws.amazon.com/codesuite/codebuild/projects/{PROJECT_NAME}/build/{build_id.split(':')[-1]}/detail"
        )
    except client.exceptions.ResourceNotFoundException:
        print(f"Error: Project '{PROJECT_NAME}' not found in region '{AWS_REGION}'.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
1399
+
1400
+
1401
def upload_file_to_s3(
    local_file_paths: List[str],
    s3_key: str,
    s3_bucket: str,
    RUN_AWS_FUNCTIONS: str = "1",
):
    """
    Uploads a file from local machine to Amazon S3.

    Args:
    - local_file_paths: Local file path(s) of the file(s) to upload. A bare
      string is treated as a single-element list.
    - s3_key: Key (path) prefix under which each file is stored in the bucket.
    - s3_bucket: Name of the S3 bucket.
    - RUN_AWS_FUNCTIONS: AWS calls are only made when this equals "1".

    Returns:
    - A status message string (also printed to console).
    """
    if RUN_AWS_FUNCTIONS != "1":
        return "App not set to run AWS functions"

    result = ""
    messages = []

    try:
        if not (s3_bucket and local_file_paths):
            return "At least one essential variable is empty, could not upload to S3"

        s3_client = boto3.client("s3", region_name=AWS_REGION)

        if isinstance(local_file_paths, str):
            local_file_paths = [local_file_paths]

        for file in local_file_paths:
            if s3_client:
                try:
                    # The object key is the prefix plus the file's basename.
                    file_name = os.path.basename(file)
                    s3_key_full = s3_key + file_name
                    print("S3 key: ", s3_key_full)

                    s3_client.upload_file(file, s3_bucket, s3_key_full)
                    out_message = "File " + file_name + " uploaded successfully!"
                    print(out_message)
                except Exception as e:
                    out_message = f"Error uploading file(s): {e}"
                    print(out_message)

                messages.append(out_message)
                result = "\n".join(messages)
            else:
                result = "Could not connect to AWS."
    except Exception as e:
        result = "Could not upload files to S3 due to: " + str(e)
        print(result)

    return result
1466
+
1467
+
1468
# Initialize ECS client
def start_ecs_task(cluster_name, service_name):
    """Scale the given ECS service to one desired task; returns a Lambda-style dict."""
    ecs_client = boto3.client("ecs")

    try:
        # Update the service to set the desired count to 1
        ecs_client.update_service(
            cluster=cluster_name, service=service_name, desiredCount=1
        )
    except Exception as e:
        return {"statusCode": 500, "body": f"Error updating service: {str(e)}"}

    return {
        "statusCode": 200,
        "body": f"Service {service_name} in cluster {cluster_name} has been updated to 1 task.",
    }
cdk/cdk_stack.py ADDED
@@ -0,0 +1,1869 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json # You might still need json if loading task_definition.json
2
+ import os
3
+ from typing import Any, Dict, List
4
+
5
+ from aws_cdk import (
6
+ CfnOutput, # <-- Import CfnOutput directly
7
+ Duration,
8
+ RemovalPolicy,
9
+ SecretValue,
10
+ Stack,
11
+ )
12
+ from aws_cdk import aws_cloudfront as cloudfront
13
+ from aws_cdk import aws_cloudfront_origins as origins
14
+ from aws_cdk import aws_codebuild as codebuild
15
+ from aws_cdk import aws_cognito as cognito
16
+ from aws_cdk import aws_dynamodb as dynamodb # Import the DynamoDB module
17
+ from aws_cdk import aws_ec2 as ec2
18
+ from aws_cdk import aws_ecr as ecr
19
+ from aws_cdk import aws_ecs as ecs
20
+ from aws_cdk import aws_elasticloadbalancingv2 as elbv2
21
+ from aws_cdk import aws_iam as iam
22
+ from aws_cdk import aws_kms as kms
23
+ from aws_cdk import aws_logs as logs
24
+ from aws_cdk import aws_s3 as s3
25
+ from aws_cdk import aws_secretsmanager as secretsmanager
26
+ from aws_cdk import aws_wafv2 as wafv2
27
+ from cdk_config import (
28
+ ACCESS_LOG_DYNAMODB_TABLE_NAME,
29
+ ACM_SSL_CERTIFICATE_ARN,
30
+ ALB_NAME,
31
+ ALB_NAME_SECURITY_GROUP_NAME,
32
+ ALB_TARGET_GROUP_NAME,
33
+ AWS_ACCOUNT_ID,
34
+ AWS_MANAGED_TASK_ROLES_LIST,
35
+ AWS_REGION,
36
+ CDK_PREFIX,
37
+ CLOUDFRONT_DISTRIBUTION_NAME,
38
+ CLOUDFRONT_GEO_RESTRICTION,
39
+ CLUSTER_NAME,
40
+ CODEBUILD_PROJECT_NAME,
41
+ CODEBUILD_ROLE_NAME,
42
+ COGNITO_ACCESS_TOKEN_VALIDITY,
43
+ COGNITO_ID_TOKEN_VALIDITY,
44
+ COGNITO_REDIRECTION_URL,
45
+ COGNITO_REFRESH_TOKEN_VALIDITY,
46
+ COGNITO_USER_POOL_CLIENT_NAME,
47
+ COGNITO_USER_POOL_CLIENT_SECRET_NAME,
48
+ COGNITO_USER_POOL_DOMAIN_PREFIX,
49
+ COGNITO_USER_POOL_NAME,
50
+ CUSTOM_HEADER,
51
+ CUSTOM_HEADER_VALUE,
52
+ CUSTOM_KMS_KEY_NAME,
53
+ DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS,
54
+ ECR_CDK_REPO_NAME,
55
+ ECS_LOG_GROUP_NAME,
56
+ ECS_READ_ONLY_FILE_SYSTEM,
57
+ ECS_SECURITY_GROUP_NAME,
58
+ ECS_SERVICE_NAME,
59
+ ECS_TASK_CPU_SIZE,
60
+ ECS_TASK_EXECUTION_ROLE_NAME,
61
+ ECS_TASK_MEMORY_SIZE,
62
+ ECS_TASK_ROLE_NAME,
63
+ ECS_USE_FARGATE_SPOT,
64
+ EXISTING_IGW_ID,
65
+ FARGATE_TASK_DEFINITION_NAME,
66
+ FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
67
+ GITHUB_REPO_BRANCH,
68
+ GITHUB_REPO_NAME,
69
+ GITHUB_REPO_USERNAME,
70
+ GRADIO_SERVER_PORT,
71
+ LOAD_BALANCER_WEB_ACL_NAME,
72
+ NAT_GATEWAY_NAME,
73
+ NEW_VPC_CIDR,
74
+ NEW_VPC_DEFAULT_NAME,
75
+ PRIVATE_SUBNET_AVAILABILITY_ZONES,
76
+ PRIVATE_SUBNET_CIDR_BLOCKS,
77
+ PRIVATE_SUBNETS_TO_USE,
78
+ PUBLIC_SUBNET_AVAILABILITY_ZONES,
79
+ PUBLIC_SUBNET_CIDR_BLOCKS,
80
+ PUBLIC_SUBNETS_TO_USE,
81
+ S3_LOG_CONFIG_BUCKET_NAME,
82
+ S3_OUTPUT_BUCKET_NAME,
83
+ SAVE_LOGS_TO_DYNAMODB,
84
+ SINGLE_NAT_GATEWAY_ID,
85
+ TASK_DEFINITION_FILE_LOCATION,
86
+ USAGE_LOG_DYNAMODB_TABLE_NAME,
87
+ USE_CLOUDFRONT,
88
+ USE_CUSTOM_KMS_KEY,
89
+ VPC_NAME,
90
+ WEB_ACL_NAME,
91
+ )
92
+ from cdk_functions import ( # Only keep CDK-native functions
93
+ add_alb_https_listener_with_cert,
94
+ add_custom_policies,
95
+ create_nat_gateway,
96
+ create_subnets,
97
+ create_web_acl_with_common_rules,
98
+ )
99
+ from constructs import Construct
100
+
101
+
102
+ def _get_env_list(env_var_name: str) -> List[str]:
103
+ """Parses a comma-separated environment variable into a list of strings."""
104
+ value = env_var_name[1:-1].strip().replace('"', "").replace("'", "")
105
+ if not value:
106
+ return []
107
+ # Split by comma and filter out any empty strings that might result from extra commas
108
+ return [s.strip() for s in value.split(",") if s.strip()]
109
+
110
+
111
# 1. Try to load CIDR/AZs from environment variables
# BUG FIX: the CIDR/AZ lists below were parsed by passing the literal
# variable *name* (e.g. "PUBLIC_SUBNET_CIDR_BLOCKS") to _get_env_list, so the
# name string itself was parsed instead of the configured value. Pass the
# values, consistent with the *_SUBNETS_TO_USE handling.
if PUBLIC_SUBNETS_TO_USE:
    PUBLIC_SUBNETS_TO_USE = _get_env_list(PUBLIC_SUBNETS_TO_USE)
if PRIVATE_SUBNETS_TO_USE:
    PRIVATE_SUBNETS_TO_USE = _get_env_list(PRIVATE_SUBNETS_TO_USE)

if PUBLIC_SUBNET_CIDR_BLOCKS:
    PUBLIC_SUBNET_CIDR_BLOCKS = _get_env_list(PUBLIC_SUBNET_CIDR_BLOCKS)
if PUBLIC_SUBNET_AVAILABILITY_ZONES:
    PUBLIC_SUBNET_AVAILABILITY_ZONES = _get_env_list(PUBLIC_SUBNET_AVAILABILITY_ZONES)
if PRIVATE_SUBNET_CIDR_BLOCKS:
    PRIVATE_SUBNET_CIDR_BLOCKS = _get_env_list(PRIVATE_SUBNET_CIDR_BLOCKS)
if PRIVATE_SUBNET_AVAILABILITY_ZONES:
    PRIVATE_SUBNET_AVAILABILITY_ZONES = _get_env_list(PRIVATE_SUBNET_AVAILABILITY_ZONES)

if AWS_MANAGED_TASK_ROLES_LIST:
    AWS_MANAGED_TASK_ROLES_LIST = _get_env_list(AWS_MANAGED_TASK_ROLES_LIST)
130
+
131
+
132
+ class CdkStack(Stack):
133
+
134
+ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
135
+ super().__init__(scope, construct_id, **kwargs)
136
+
137
+ # --- Helper to get context values ---
138
+ def get_context_bool(key: str, default: bool = False) -> bool:
139
+ return self.node.try_get_context(key) or default
140
+
141
+ def get_context_str(key: str, default: str = None) -> str:
142
+ return self.node.try_get_context(key) or default
143
+
144
+ def get_context_dict(key: str, default: dict = None) -> dict:
145
+ return self.node.try_get_context(key) or default
146
+
147
+ def get_context_list_of_dicts(key: str) -> List[Dict[str, Any]]:
148
+ ctx_value = self.node.try_get_context(key)
149
+ if not isinstance(ctx_value, list):
150
+ print(
151
+ f"Warning: Context key '{key}' not found or not a list. Returning empty list."
152
+ )
153
+ return []
154
+ # Optional: Add validation that all items in the list are dicts
155
+ return ctx_value
156
+
157
+ self.template_options.description = "Deployment of the 'doc_redaction' PDF, image, and XLSX/CSV redaction app. Git repo available at: https://github.com/seanpedrick-case/doc_redaction."
158
+
159
+ # --- VPC and Subnets (Assuming VPC is always lookup, Subnets are created/returned by create_subnets) ---
160
+ new_vpc_created = False
161
+ if VPC_NAME:
162
+ print("Looking for current VPC:", VPC_NAME)
163
+ try:
164
+ vpc = ec2.Vpc.from_lookup(self, "VPC", vpc_name=VPC_NAME)
165
+ print("Successfully looked up VPC:", vpc.vpc_id)
166
+ except Exception as e:
167
+ raise Exception(
168
+ f"Could not look up VPC with name '{VPC_NAME}' due to: {e}"
169
+ )
170
+
171
+ elif NEW_VPC_DEFAULT_NAME:
172
+ new_vpc_created = True
173
+ print(
174
+ f"NEW_VPC_DEFAULT_NAME ('{NEW_VPC_DEFAULT_NAME}') is set. Creating a new VPC."
175
+ )
176
+
177
+ # Configuration for the new VPC
178
+ # You can make these configurable via context as well, e.g.,
179
+ # new_vpc_cidr = self.node.try_get_context("new_vpc_cidr") or "10.0.0.0/24"
180
+ # new_vpc_max_azs = self.node.try_get_context("new_vpc_max_azs") or 2 # Use 2 AZs by default for HA
181
+ # new_vpc_nat_gateways = self.node.try_get_context("new_vpc_nat_gateways") or new_vpc_max_azs # One NAT GW per AZ for HA
182
+ # or 1 for cost savings if acceptable
183
+ if not NEW_VPC_CIDR:
184
+ raise Exception(
185
+ "App has been instructed to create a new VPC but not VPC CDR range provided to variable NEW_VPC_CIDR"
186
+ )
187
+
188
+ print("Provided NEW_VPC_CIDR range:", NEW_VPC_CIDR)
189
+
190
+ new_vpc_cidr = NEW_VPC_CIDR
191
+ new_vpc_max_azs = 2 # Creates resources in 2 AZs. Adjust as needed.
192
+
193
+ # For "a NAT gateway", you can set nat_gateways=1.
194
+ # For resilience (NAT GW per AZ), set nat_gateways=new_vpc_max_azs.
195
+ # The Vpc construct will create NAT Gateway(s) if subnet_type PRIVATE_WITH_EGRESS is used
196
+ # and nat_gateways > 0.
197
+ new_vpc_nat_gateways = (
198
+ 1 # Creates a single NAT Gateway for cost-effectiveness.
199
+ )
200
+ # If you need one per AZ for higher availability, set this to new_vpc_max_azs.
201
+
202
+ vpc = ec2.Vpc(
203
+ self,
204
+ "MyNewLogicalVpc", # This is the CDK construct ID
205
+ vpc_name=NEW_VPC_DEFAULT_NAME,
206
+ ip_addresses=ec2.IpAddresses.cidr(new_vpc_cidr),
207
+ max_azs=new_vpc_max_azs,
208
+ nat_gateways=new_vpc_nat_gateways, # Number of NAT gateways to create
209
+ subnet_configuration=[
210
+ ec2.SubnetConfiguration(
211
+ name="Public", # Name prefix for public subnets
212
+ subnet_type=ec2.SubnetType.PUBLIC,
213
+ cidr_mask=28, # Adjust CIDR mask as needed (e.g., /24 provides ~250 IPs per subnet)
214
+ ),
215
+ ec2.SubnetConfiguration(
216
+ name="Private", # Name prefix for private subnets
217
+ subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS, # Ensures these subnets have NAT Gateway access
218
+ cidr_mask=28, # Adjust CIDR mask as needed
219
+ ),
220
+ # You could also add ec2.SubnetType.PRIVATE_ISOLATED if needed
221
+ ],
222
+ # Internet Gateway is created and configured automatically for PUBLIC subnets.
223
+ # Route tables for public subnets will point to the IGW.
224
+ # Route tables for PRIVATE_WITH_EGRESS subnets will point to the NAT Gateway(s).
225
+ )
226
+ print(
227
+ f"Successfully created new VPC: {vpc.vpc_id} with name '{NEW_VPC_DEFAULT_NAME}'"
228
+ )
229
+ # If nat_gateways > 0, vpc.nat_gateway_ips will contain EIPs if Vpc created them.
230
+ # vpc.public_subnets, vpc.private_subnets, vpc.isolated_subnets are populated.
231
+
232
+ else:
233
+ raise Exception(
234
+ "VPC_NAME for current VPC not found, and NEW_VPC_DEFAULT_NAME not found to create a new VPC"
235
+ )
236
+
237
+ # --- Subnet Handling (Check Context and Create/Import) ---
238
+ # Initialize lists to hold ISubnet objects (L2) and CfnSubnet/CfnRouteTable (L1)
239
+ # We will store ISubnet for consistency, as CfnSubnet has a .subnet_id property
240
+ self.public_subnets: List[ec2.ISubnet] = []
241
+ self.private_subnets: List[ec2.ISubnet] = []
242
+ # Store L1 CfnRouteTables explicitly if you need to reference them later
243
+ self.private_route_tables_cfn: List[ec2.CfnRouteTable] = []
244
+ self.public_route_tables_cfn: List[ec2.CfnRouteTable] = (
245
+ []
246
+ ) # New: to store public RTs
247
+
248
+ names_to_create_private = []
249
+ names_to_create_public = []
250
+
251
+ if not PUBLIC_SUBNETS_TO_USE and not PRIVATE_SUBNETS_TO_USE:
252
+ print(
253
+ "Warning: No public or private subnets specified in *_SUBNETS_TO_USE. Attempting to select from existing VPC subnets."
254
+ )
255
+
256
+ print("vpc.public_subnets:", vpc.public_subnets)
257
+ print("vpc.private_subnets:", vpc.private_subnets)
258
+
259
+ if (
260
+ vpc.public_subnets
261
+ ): # These are already one_per_az if max_azs was used and Vpc created them
262
+ self.public_subnets.extend(vpc.public_subnets)
263
+ else:
264
+ self.node.add_warning("No public subnets found in the VPC.")
265
+
266
+ # Get private subnets with egress specifically
267
+ # selected_private_subnets_with_egress = vpc.select_subnets(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS)
268
+
269
+ print(
270
+ f"Selected from VPC: {len(self.public_subnets)} public, {len(self.private_subnets)} private_with_egress subnets."
271
+ )
272
+
273
+ if (
274
+ len(self.public_subnets) < 1 or len(self.private_subnets) < 1
275
+ ): # Simplified check for new VPC
276
+ # If new_vpc_max_azs was 1, you'd have 1 of each. If 2, then 2 of each.
277
+ # The original check ' < 2' might be too strict if new_vpc_max_azs=1
278
+ pass # For new VPC, allow single AZ setups if configured that way. The VPC construct ensures one per AZ up to max_azs.
279
+
280
+ if not self.public_subnets and not self.private_subnets:
281
+ print(
282
+ "Error: No public or private subnets could be found in the VPC for automatic selection. "
283
+ "You must either specify subnets in *_SUBNETS_TO_USE or ensure the VPC has discoverable subnets."
284
+ )
285
+ raise RuntimeError("No suitable subnets found for automatic selection.")
286
+ else:
287
+ print(
288
+ f"Automatically selected {len(self.public_subnets)} public and {len(self.private_subnets)} private subnets based on VPC properties."
289
+ )
290
+
291
+ selected_public_subnets = vpc.select_subnets(
292
+ subnet_type=ec2.SubnetType.PUBLIC, one_per_az=True
293
+ )
294
+ private_subnets_egress = vpc.select_subnets(
295
+ subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS, one_per_az=True
296
+ )
297
+
298
+ if private_subnets_egress.subnets:
299
+ self.private_subnets.extend(private_subnets_egress.subnets)
300
+ else:
301
+ self.node.add_warning(
302
+ "No PRIVATE_WITH_EGRESS subnets found in the VPC."
303
+ )
304
+
305
+ try:
306
+ private_subnets_isolated = vpc.select_subnets(
307
+ subnet_type=ec2.SubnetType.PRIVATE_ISOLATED, one_per_az=True
308
+ )
309
+ except Exception as e:
310
+ private_subnets_isolated = []
311
+ print("Could not find any isolated subnets due to:", e)
312
+
313
+ ###
314
+ combined_subnet_objects = []
315
+
316
+ if private_subnets_isolated:
317
+ if private_subnets_egress.subnets:
318
+ # Add the first PRIVATE_WITH_EGRESS subnet
319
+ combined_subnet_objects.append(private_subnets_egress.subnets[0])
320
+ elif not private_subnets_isolated:
321
+ if private_subnets_egress.subnets:
322
+ # Add the first PRIVATE_WITH_EGRESS subnet
323
+ combined_subnet_objects.extend(private_subnets_egress.subnets)
324
+ else:
325
+ self.node.add_warning(
326
+ "No PRIVATE_WITH_EGRESS subnets found to select the first one."
327
+ )
328
+
329
+ # Add all PRIVATE_ISOLATED subnets *except* the first one (if they exist)
330
+ try:
331
+ if len(private_subnets_isolated.subnets) > 1:
332
+ combined_subnet_objects.extend(private_subnets_isolated.subnets[1:])
333
+ elif (
334
+ private_subnets_isolated.subnets
335
+ ): # Only 1 isolated subnet, add a warning if [1:] was desired
336
+ self.node.add_warning(
337
+ "Only one PRIVATE_ISOLATED subnet found, private_subnets_isolated.subnets[1:] will be empty."
338
+ )
339
+ else:
340
+ self.node.add_warning("No PRIVATE_ISOLATED subnets found.")
341
+ except Exception as e:
342
+ print("Could not identify private isolated subnets due to:", e)
343
+
344
+ # Create an ec2.SelectedSubnets object from the combined private subnet list.
345
+ selected_private_subnets = vpc.select_subnets(
346
+ subnets=combined_subnet_objects
347
+ )
348
+
349
+ print("selected_public_subnets:", selected_public_subnets)
350
+ print("selected_private_subnets:", selected_private_subnets)
351
+
352
+ if (
353
+ len(selected_public_subnets.subnet_ids) < 2
354
+ or len(selected_private_subnets.subnet_ids) < 2
355
+ ):
356
+ raise Exception(
357
+ "Need at least two public or private subnets in different availability zones"
358
+ )
359
+
360
+ if not selected_public_subnets and not selected_private_subnets:
361
+ # If no subnets could be found even with automatic selection, raise an error.
362
+ # This ensures the stack doesn't proceed if it absolutely needs subnets.
363
+ print(
364
+ "Error: No existing public or private subnets could be found in the VPC for automatic selection. "
365
+ "You must either specify subnets in *_SUBNETS_TO_USE or ensure the VPC has discoverable subnets."
366
+ )
367
+ raise RuntimeError("No suitable subnets found for automatic selection.")
368
+ else:
369
+ self.public_subnets = selected_public_subnets.subnets
370
+ self.private_subnets = selected_private_subnets.subnets
371
+ print(
372
+ f"Automatically selected {len(self.public_subnets)} public and {len(self.private_subnets)} private subnets based on VPC discovery."
373
+ )
374
+
375
+ print("self.public_subnets:", self.public_subnets)
376
+ print("self.private_subnets:", self.private_subnets)
377
+ # Since subnets are now assigned, we can exit this processing block.
378
+ # The rest of the original code (which iterates *_SUBNETS_TO_USE) will be skipped.
379
+
380
+ checked_public_subnets_ctx = get_context_dict("checked_public_subnets")
381
+ get_context_dict("checked_private_subnets")
382
+
383
+ public_subnets_data_for_creation_ctx = get_context_list_of_dicts(
384
+ "public_subnets_to_create"
385
+ )
386
+ private_subnets_data_for_creation_ctx = get_context_list_of_dicts(
387
+ "private_subnets_to_create"
388
+ )
389
+
390
+ # --- 3. Process Public Subnets ---
391
+ print("\n--- Processing Public Subnets ---")
392
+ # Import existing public subnets
393
+ if checked_public_subnets_ctx:
394
+ for i, subnet_name in enumerate(PUBLIC_SUBNETS_TO_USE):
395
+ subnet_info = checked_public_subnets_ctx.get(subnet_name)
396
+ if subnet_info and subnet_info.get("exists"):
397
+ subnet_id = subnet_info.get("id")
398
+ if not subnet_id:
399
+ raise RuntimeError(
400
+ f"Context for existing public subnet '{subnet_name}' is missing 'id'."
401
+ )
402
+ try:
403
+ ec2.Subnet.from_subnet_id(
404
+ self,
405
+ f"ImportedPublicSubnet{subnet_name.replace('-', '')}{i}",
406
+ subnet_id,
407
+ )
408
+ # self.public_subnets.append(imported_subnet)
409
+ print(
410
+ f"Imported existing public subnet: {subnet_name} (ID: {subnet_id})"
411
+ )
412
+ except Exception as e:
413
+ raise RuntimeError(
414
+ f"Failed to import public subnet '{subnet_name}' with ID '{subnet_id}'. Error: {e}"
415
+ )
416
+
417
+ # Create new public subnets based on public_subnets_data_for_creation_ctx
418
+ if public_subnets_data_for_creation_ctx:
419
+ names_to_create_public = [
420
+ s["name"] for s in public_subnets_data_for_creation_ctx
421
+ ]
422
+ cidrs_to_create_public = [
423
+ s["cidr"] for s in public_subnets_data_for_creation_ctx
424
+ ]
425
+ azs_to_create_public = [
426
+ s["az"] for s in public_subnets_data_for_creation_ctx
427
+ ]
428
+
429
+ if names_to_create_public:
430
+ print(
431
+ f"Attempting to create {len(names_to_create_public)} new public subnets: {names_to_create_public}"
432
+ )
433
+ newly_created_public_subnets, newly_created_public_rts_cfn = (
434
+ create_subnets(
435
+ self,
436
+ vpc,
437
+ CDK_PREFIX,
438
+ names_to_create_public,
439
+ cidrs_to_create_public,
440
+ azs_to_create_public,
441
+ is_public=True,
442
+ internet_gateway_id=EXISTING_IGW_ID,
443
+ )
444
+ )
445
+ self.public_subnets.extend(newly_created_public_subnets)
446
+ self.public_route_tables_cfn.extend(newly_created_public_rts_cfn)
447
+
448
+ if (
449
+ not self.public_subnets
450
+ and not names_to_create_public
451
+ and not PUBLIC_SUBNETS_TO_USE
452
+ ):
453
+ raise Exception("No public subnets found or created, exiting.")
454
+
455
+ # --- NAT Gateway Creation/Lookup ---
456
+ print("Creating NAT gateway/located existing")
457
+ self.single_nat_gateway_id = None
458
+
459
+ nat_gw_id_from_context = SINGLE_NAT_GATEWAY_ID
460
+
461
+ if nat_gw_id_from_context:
462
+ print(
463
+ f"Using existing NAT Gateway ID from context: {nat_gw_id_from_context}"
464
+ )
465
+ self.single_nat_gateway_id = nat_gw_id_from_context
466
+
467
+ elif (
468
+ new_vpc_created
469
+ and new_vpc_nat_gateways > 0
470
+ and hasattr(vpc, "nat_gateways")
471
+ and vpc.nat_gateways
472
+ ):
473
+ self.single_nat_gateway_id = vpc.nat_gateways[0].gateway_id
474
+ print(
475
+ f"Using NAT Gateway {self.single_nat_gateway_id} created by the new VPC construct."
476
+ )
477
+
478
+ if not self.single_nat_gateway_id:
479
+ print("Creating a new NAT gateway")
480
+
481
+ if hasattr(vpc, "nat_gateways") and vpc.nat_gateways:
482
+ print("Existing NAT gateway found in vpc")
483
+ pass
484
+
485
+ # If not in context, create a new one, but only if we have a public subnet.
486
+ elif self.public_subnets:
487
+ print("NAT Gateway ID not found in context. Creating a new one.")
488
+ # Place the NAT GW in the first available public subnet
489
+ first_public_subnet = self.public_subnets[0]
490
+
491
+ self.single_nat_gateway_id = create_nat_gateway(
492
+ self,
493
+ first_public_subnet,
494
+ nat_gateway_name=NAT_GATEWAY_NAME,
495
+ nat_gateway_id_context_key=SINGLE_NAT_GATEWAY_ID,
496
+ )
497
+ else:
498
+ print(
499
+ "WARNING: No public subnets available and NAT gateway not found in existing VPC. Cannot create a NAT Gateway."
500
+ )
501
+
502
+ # --- 4. Process Private Subnets ---
503
+ print("\n--- Processing Private Subnets ---")
504
+ # ... (rest of your existing subnet processing logic for checked_private_subnets_ctx) ...
505
+ # (This part for importing existing subnets remains the same)
506
+
507
+ # Create new private subnets
508
+ if private_subnets_data_for_creation_ctx:
509
+ names_to_create_private = [
510
+ s["name"] for s in private_subnets_data_for_creation_ctx
511
+ ]
512
+ cidrs_to_create_private = [
513
+ s["cidr"] for s in private_subnets_data_for_creation_ctx
514
+ ]
515
+ azs_to_create_private = [
516
+ s["az"] for s in private_subnets_data_for_creation_ctx
517
+ ]
518
+
519
+ if names_to_create_private:
520
+ print(
521
+ f"Attempting to create {len(names_to_create_private)} new private subnets: {names_to_create_private}"
522
+ )
523
+ # --- CALL THE NEW CREATE_SUBNETS FUNCTION FOR PRIVATE ---
524
+ # Ensure self.single_nat_gateway_id is available before this call
525
+ if not self.single_nat_gateway_id:
526
+ raise ValueError(
527
+ "A single NAT Gateway ID is required for private subnets but was not resolved."
528
+ )
529
+
530
+ newly_created_private_subnets_cfn, newly_created_private_rts_cfn = (
531
+ create_subnets(
532
+ self,
533
+ vpc,
534
+ CDK_PREFIX,
535
+ names_to_create_private,
536
+ cidrs_to_create_private,
537
+ azs_to_create_private,
538
+ is_public=False,
539
+ single_nat_gateway_id=self.single_nat_gateway_id, # Pass the single NAT Gateway ID
540
+ )
541
+ )
542
+ self.private_subnets.extend(newly_created_private_subnets_cfn)
543
+ self.private_route_tables_cfn.extend(newly_created_private_rts_cfn)
544
+ print(
545
+ f"Successfully defined {len(newly_created_private_subnets_cfn)} new private subnets and their route tables for creation."
546
+ )
547
+ else:
548
+ print(
549
+ "No private subnets specified for creation in context ('private_subnets_to_create')."
550
+ )
551
+
552
+ # if not self.private_subnets:
553
+ # raise Exception("No private subnets found or created, exiting.")
554
+
555
+ if (
556
+ not self.private_subnets
557
+ and not names_to_create_private
558
+ and not PRIVATE_SUBNETS_TO_USE
559
+ ):
560
+ # This condition might need adjustment for new VPCs.
561
+ raise Exception("No private subnets found or created, exiting.")
562
+
563
+ # --- 5. Sanity Check and Output ---
564
+ # Output the single NAT Gateway ID for verification
565
+ if self.single_nat_gateway_id:
566
+ CfnOutput(
567
+ self,
568
+ "SingleNatGatewayId",
569
+ value=self.single_nat_gateway_id,
570
+ description="ID of the single NAT Gateway resolved or created.",
571
+ )
572
+ elif (
573
+ NEW_VPC_DEFAULT_NAME
574
+ and (self.node.try_get_context("new_vpc_nat_gateways") or 1) > 0
575
+ ):
576
+ print(
577
+ "INFO: A new VPC was created with NAT Gateway(s). Their routing is handled by the VPC construct. No single_nat_gateway_id was explicitly set for separate output."
578
+ )
579
+ else:
580
+ out_message = "WARNING: No single NAT Gateway was resolved or created explicitly by the script's logic after VPC setup."
581
+ print(out_message)
582
+ raise Exception(out_message)
583
+
584
+ # --- Outputs for other stacks/regions ---
585
+ # These are crucial for cross-stack, cross-region referencing
586
+
587
+ self.params = dict()
588
+ self.params["vpc_id"] = vpc.vpc_id
589
+ self.params["private_subnets"] = self.private_subnets
590
+ self.params["private_route_tables"] = self.private_route_tables_cfn
591
+ self.params["public_subnets"] = self.public_subnets
592
+ self.params["public_route_tables"] = self.public_route_tables_cfn
593
+
594
+ private_subnet_selection = ec2.SubnetSelection(subnets=self.private_subnets)
595
+ public_subnet_selection = ec2.SubnetSelection(subnets=self.public_subnets)
596
+
597
+ for sub in private_subnet_selection.subnets:
598
+ print(
599
+ "private subnet:",
600
+ sub.subnet_id,
601
+ "is in availability zone:",
602
+ sub.availability_zone,
603
+ )
604
+
605
+ for sub in public_subnet_selection.subnets:
606
+ print(
607
+ "public subnet:",
608
+ sub.subnet_id,
609
+ "is in availability zone:",
610
+ sub.availability_zone,
611
+ )
612
+
613
+ print("Private subnet route tables:", self.private_route_tables_cfn)
614
+
615
+ # Add the S3 Gateway Endpoint to the VPC
616
+ if names_to_create_private:
617
+ try:
618
+ s3_gateway_endpoint = vpc.add_gateway_endpoint(
619
+ "S3GatewayEndpoint",
620
+ service=ec2.GatewayVpcEndpointAwsService.S3,
621
+ subnets=[private_subnet_selection],
622
+ )
623
+ except Exception as e:
624
+ print("Could not add S3 gateway endpoint to subnets due to:", e)
625
+
626
+ # Output some useful information
627
+ CfnOutput(
628
+ self,
629
+ "VpcIdOutput",
630
+ value=vpc.vpc_id,
631
+ description="The ID of the VPC where the S3 Gateway Endpoint is deployed.",
632
+ )
633
+ CfnOutput(
634
+ self,
635
+ "S3GatewayEndpointService",
636
+ value=s3_gateway_endpoint.vpc_endpoint_id,
637
+ description="The id for the S3 Gateway Endpoint.",
638
+ ) # Specify the S3 service
639
+
640
+ # --- IAM Roles ---
641
+ if USE_CUSTOM_KMS_KEY == "1":
642
+ kms_key = kms.Key(
643
+ self,
644
+ "RedactionSharedKmsKey",
645
+ alias=CUSTOM_KMS_KEY_NAME,
646
+ removal_policy=RemovalPolicy.DESTROY,
647
+ )
648
+
649
+ custom_sts_kms_policy_dict = {
650
+ "Version": "2012-10-17",
651
+ "Statement": [
652
+ {
653
+ "Sid": "STSCallerIdentity",
654
+ "Effect": "Allow",
655
+ "Action": ["sts:GetCallerIdentity"],
656
+ "Resource": "*",
657
+ },
658
+ {
659
+ "Sid": "KMSAccess",
660
+ "Effect": "Allow",
661
+ "Action": ["kms:Encrypt", "kms:Decrypt", "kms:GenerateDataKey"],
662
+ "Resource": kms_key.key_arn, # Use key_arn, as it's the full ARN, safer than key_id
663
+ },
664
+ ],
665
+ }
666
+ else:
667
+ kms_key = None
668
+
669
+ custom_sts_kms_policy_dict = {
670
+ "Version": "2012-10-17",
671
+ "Statement": [
672
+ {
673
+ "Sid": "STSCallerIdentity",
674
+ "Effect": "Allow",
675
+ "Action": ["sts:GetCallerIdentity"],
676
+ "Resource": "*",
677
+ },
678
+ {
679
+ "Sid": "KMSSecretsManagerDecrypt", # Explicitly add decrypt for default key
680
+ "Effect": "Allow",
681
+ "Action": ["kms:Decrypt"],
682
+ "Resource": f"arn:aws:kms:{AWS_REGION}:{AWS_ACCOUNT_ID}:key/aws/secretsmanager",
683
+ },
684
+ ],
685
+ }
686
+ custom_sts_kms_policy = json.dumps(custom_sts_kms_policy_dict, indent=4)
687
+
688
+ try:
689
+ codebuild_role_name = CODEBUILD_ROLE_NAME
690
+
691
+ if get_context_bool(f"exists:{codebuild_role_name}"):
692
+ # If exists, lookup/import the role using ARN from context
693
+ role_arn = get_context_str(f"arn:{codebuild_role_name}")
694
+ if not role_arn:
695
+ raise ValueError(
696
+ f"Context value 'arn:{codebuild_role_name}' is required if role exists."
697
+ )
698
+ codebuild_role = iam.Role.from_role_arn(
699
+ self, "CodeBuildRole", role_arn=role_arn
700
+ )
701
+ print("Using existing CodeBuild role")
702
+ else:
703
+ # If not exists, create the role
704
+ codebuild_role = iam.Role(
705
+ self,
706
+ "CodeBuildRole", # Logical ID
707
+ role_name=codebuild_role_name, # Explicit resource name
708
+ assumed_by=iam.ServicePrincipal("codebuild.amazonaws.com"),
709
+ )
710
+ codebuild_role.add_managed_policy(
711
+ iam.ManagedPolicy.from_aws_managed_policy_name(
712
+ "EC2InstanceProfileForImageBuilderECRContainerBuilds"
713
+ )
714
+ )
715
+ print("Successfully created new CodeBuild role")
716
+
717
+ task_role_name = ECS_TASK_ROLE_NAME
718
+ if get_context_bool(f"exists:{task_role_name}"):
719
+ role_arn = get_context_str(f"arn:{task_role_name}")
720
+ if not role_arn:
721
+ raise ValueError(
722
+ f"Context value 'arn:{task_role_name}' is required if role exists."
723
+ )
724
+ task_role = iam.Role.from_role_arn(self, "TaskRole", role_arn=role_arn)
725
+ print("Using existing ECS task role")
726
+ else:
727
+ task_role = iam.Role(
728
+ self,
729
+ "TaskRole", # Logical ID
730
+ role_name=task_role_name, # Explicit resource name
731
+ assumed_by=iam.ServicePrincipal("ecs-tasks.amazonaws.com"),
732
+ )
733
+ for role in AWS_MANAGED_TASK_ROLES_LIST:
734
+ print(f"Adding {role} to policy")
735
+ task_role.add_managed_policy(
736
+ iam.ManagedPolicy.from_aws_managed_policy_name(f"{role}")
737
+ )
738
+ task_role = add_custom_policies(
739
+ self, task_role, custom_policy_text=custom_sts_kms_policy
740
+ )
741
+ print("Successfully created new ECS task role")
742
+
743
+ execution_role_name = ECS_TASK_EXECUTION_ROLE_NAME
744
+ if get_context_bool(f"exists:{execution_role_name}"):
745
+ role_arn = get_context_str(f"arn:{execution_role_name}")
746
+ if not role_arn:
747
+ raise ValueError(
748
+ f"Context value 'arn:{execution_role_name}' is required if role exists."
749
+ )
750
+ execution_role = iam.Role.from_role_arn(
751
+ self, "ExecutionRole", role_arn=role_arn
752
+ )
753
+ print("Using existing ECS execution role")
754
+ else:
755
+ execution_role = iam.Role(
756
+ self,
757
+ "ExecutionRole", # Logical ID
758
+ role_name=execution_role_name, # Explicit resource name
759
+ assumed_by=iam.ServicePrincipal("ecs-tasks.amazonaws.com"),
760
+ )
761
+ for role in AWS_MANAGED_TASK_ROLES_LIST:
762
+ execution_role.add_managed_policy(
763
+ iam.ManagedPolicy.from_aws_managed_policy_name(f"{role}")
764
+ )
765
+ execution_role = add_custom_policies(
766
+ self, execution_role, custom_policy_text=custom_sts_kms_policy
767
+ )
768
+ print("Successfully created new ECS execution role")
769
+
770
+ except Exception as e:
771
+ raise Exception("Failed at IAM role step due to:", e)
772
+
773
+ # --- S3 Buckets ---
774
+ try:
775
+ log_bucket_name = S3_LOG_CONFIG_BUCKET_NAME
776
+ if get_context_bool(f"exists:{log_bucket_name}"):
777
+ bucket = s3.Bucket.from_bucket_name(
778
+ self, "LogConfigBucket", bucket_name=log_bucket_name
779
+ )
780
+ print("Using existing S3 bucket", log_bucket_name)
781
+ else:
782
+ if USE_CUSTOM_KMS_KEY == "1" and isinstance(kms_key, kms.Key):
783
+ bucket = s3.Bucket(
784
+ self,
785
+ "LogConfigBucket",
786
+ bucket_name=log_bucket_name,
787
+ versioned=False,
788
+ removal_policy=RemovalPolicy.DESTROY,
789
+ auto_delete_objects=True,
790
+ encryption=s3.BucketEncryption.KMS,
791
+ encryption_key=kms_key,
792
+ )
793
+ else:
794
+ bucket = s3.Bucket(
795
+ self,
796
+ "LogConfigBucket",
797
+ bucket_name=log_bucket_name,
798
+ versioned=False,
799
+ removal_policy=RemovalPolicy.DESTROY,
800
+ auto_delete_objects=True,
801
+ )
802
+
803
+ print("Created S3 bucket", log_bucket_name)
804
+
805
+ # Add policies - this will apply to both created and imported buckets
806
+ # CDK handles idempotent policy additions
807
+ bucket.add_to_resource_policy(
808
+ iam.PolicyStatement(
809
+ effect=iam.Effect.ALLOW,
810
+ principals=[task_role], # Pass the role object directly
811
+ actions=["s3:GetObject", "s3:PutObject"],
812
+ resources=[f"{bucket.bucket_arn}/*"],
813
+ )
814
+ )
815
+ bucket.add_to_resource_policy(
816
+ iam.PolicyStatement(
817
+ effect=iam.Effect.ALLOW,
818
+ principals=[task_role],
819
+ actions=["s3:ListBucket"],
820
+ resources=[bucket.bucket_arn],
821
+ )
822
+ )
823
+
824
+ output_bucket_name = S3_OUTPUT_BUCKET_NAME
825
+ if get_context_bool(f"exists:{output_bucket_name}"):
826
+ output_bucket = s3.Bucket.from_bucket_name(
827
+ self, "OutputBucket", bucket_name=output_bucket_name
828
+ )
829
+ print("Using existing Output bucket", output_bucket_name)
830
+ else:
831
+ if USE_CUSTOM_KMS_KEY == "1" and isinstance(kms_key, kms.Key):
832
+ output_bucket = s3.Bucket(
833
+ self,
834
+ "OutputBucket",
835
+ bucket_name=output_bucket_name,
836
+ lifecycle_rules=[
837
+ s3.LifecycleRule(
838
+ expiration=Duration.days(
839
+ int(DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS)
840
+ )
841
+ )
842
+ ],
843
+ versioned=False,
844
+ removal_policy=RemovalPolicy.DESTROY,
845
+ auto_delete_objects=True,
846
+ encryption=s3.BucketEncryption.KMS,
847
+ encryption_key=kms_key,
848
+ )
849
+ else:
850
+ output_bucket = s3.Bucket(
851
+ self,
852
+ "OutputBucket",
853
+ bucket_name=output_bucket_name,
854
+ lifecycle_rules=[
855
+ s3.LifecycleRule(
856
+ expiration=Duration.days(
857
+ int(DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS)
858
+ )
859
+ )
860
+ ],
861
+ versioned=False,
862
+ removal_policy=RemovalPolicy.DESTROY,
863
+ auto_delete_objects=True,
864
+ )
865
+
866
+ print("Created Output bucket:", output_bucket_name)
867
+
868
+ # Add policies to output bucket
869
+ output_bucket.add_to_resource_policy(
870
+ iam.PolicyStatement(
871
+ effect=iam.Effect.ALLOW,
872
+ principals=[task_role],
873
+ actions=["s3:GetObject", "s3:PutObject"],
874
+ resources=[f"{output_bucket.bucket_arn}/*"],
875
+ )
876
+ )
877
+ output_bucket.add_to_resource_policy(
878
+ iam.PolicyStatement(
879
+ effect=iam.Effect.ALLOW,
880
+ principals=[task_role],
881
+ actions=["s3:ListBucket"],
882
+ resources=[output_bucket.bucket_arn],
883
+ )
884
+ )
885
+
886
+ except Exception as e:
887
+ raise Exception("Could not handle S3 buckets due to:", e)
888
+
889
+ # --- Elastic Container Registry ---
890
+ try:
891
+ full_ecr_repo_name = ECR_CDK_REPO_NAME
892
+ if get_context_bool(f"exists:{full_ecr_repo_name}"):
893
+ ecr_repo = ecr.Repository.from_repository_name(
894
+ self, "ECRRepo", repository_name=full_ecr_repo_name
895
+ )
896
+ print("Using existing ECR repository")
897
+ else:
898
+ ecr_repo = ecr.Repository(
899
+ self, "ECRRepo", repository_name=full_ecr_repo_name
900
+ ) # Explicitly set repository_name
901
+ print("Created ECR repository", full_ecr_repo_name)
902
+
903
+ ecr_image_loc = ecr_repo.repository_uri
904
+ except Exception as e:
905
+ raise Exception("Could not handle ECR repo due to:", e)
906
+
907
+ # --- CODEBUILD ---
908
+ try:
909
+ codebuild_project_name = CODEBUILD_PROJECT_NAME
910
+ if get_context_bool(f"exists:{codebuild_project_name}"):
911
+ # Lookup CodeBuild project by ARN from context
912
+ project_arn = get_context_str(f"arn:{codebuild_project_name}")
913
+ if not project_arn:
914
+ raise ValueError(
915
+ f"Context value 'arn:{codebuild_project_name}' is required if project exists."
916
+ )
917
+ codebuild_project = codebuild.Project.from_project_arn(
918
+ self, "CodeBuildProject", project_arn=project_arn
919
+ )
920
+ print("Using existing CodeBuild project")
921
+ else:
922
+ codebuild_project = codebuild.Project(
923
+ self,
924
+ "CodeBuildProject", # Logical ID
925
+ project_name=codebuild_project_name, # Explicit resource name
926
+ source=codebuild.Source.git_hub(
927
+ owner=GITHUB_REPO_USERNAME,
928
+ repo=GITHUB_REPO_NAME,
929
+ branch_or_ref=GITHUB_REPO_BRANCH,
930
+ ),
931
+ environment=codebuild.BuildEnvironment(
932
+ build_image=codebuild.LinuxBuildImage.STANDARD_7_0,
933
+ privileged=True,
934
+ environment_variables={
935
+ "ECR_REPO_NAME": codebuild.BuildEnvironmentVariable(
936
+ value=full_ecr_repo_name
937
+ ),
938
+ "AWS_DEFAULT_REGION": codebuild.BuildEnvironmentVariable(
939
+ value=AWS_REGION
940
+ ),
941
+ "AWS_ACCOUNT_ID": codebuild.BuildEnvironmentVariable(
942
+ value=AWS_ACCOUNT_ID
943
+ ),
944
+ "APP_MODE": codebuild.BuildEnvironmentVariable(
945
+ value="gradio"
946
+ ),
947
+ },
948
+ ),
949
+ build_spec=codebuild.BuildSpec.from_object(
950
+ {
951
+ "version": "0.2",
952
+ "phases": {
953
+ "pre_build": {
954
+ "commands": [
955
+ "echo Logging in to Amazon ECR",
956
+ "aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com",
957
+ ]
958
+ },
959
+ "build": {
960
+ "commands": [
961
+ "echo Building the Docker image",
962
+ "docker build --build-args APP_MODE=$APP_MODE --target $APP_MODE -t $ECR_REPO_NAME:latest .",
963
+ "docker tag $ECR_REPO_NAME:latest $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$ECR_REPO_NAME:latest",
964
+ ]
965
+ },
966
+ "post_build": {
967
+ "commands": [
968
+ "echo Pushing the Docker image",
969
+ "docker push $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$ECR_REPO_NAME:latest",
970
+ ]
971
+ },
972
+ },
973
+ }
974
+ ),
975
+ )
976
+ print("Successfully created CodeBuild project", codebuild_project_name)
977
+
978
+ # Grant permissions - applies to both created and imported project role
979
+ ecr_repo.grant_pull_push(codebuild_project.role)
980
+
981
+ except Exception as e:
982
+ raise Exception("Could not handle Codebuild project due to:", e)
983
+
984
+ # --- Security Groups ---
985
+ try:
986
+ ecs_security_group_name = ECS_SECURITY_GROUP_NAME
987
+
988
+ try:
989
+ ecs_security_group = ec2.SecurityGroup(
990
+ self,
991
+ "ECSSecurityGroup", # Logical ID
992
+ security_group_name=ecs_security_group_name, # Explicit resource name
993
+ vpc=vpc,
994
+ )
995
+ print(f"Created Security Group: {ecs_security_group_name}")
996
+ except Exception as e: # If lookup fails, create
997
+ print("Failed to create ECS security group due to:", e)
998
+
999
+ alb_security_group_name = ALB_NAME_SECURITY_GROUP_NAME
1000
+
1001
+ try:
1002
+ alb_security_group = ec2.SecurityGroup(
1003
+ self,
1004
+ "ALBSecurityGroup", # Logical ID
1005
+ security_group_name=alb_security_group_name, # Explicit resource name
1006
+ vpc=vpc,
1007
+ )
1008
+ print(f"Created Security Group: {alb_security_group_name}")
1009
+ except Exception as e: # If lookup fails, create
1010
+ print("Failed to create ALB security group due to:", e)
1011
+
1012
+ # Define Ingress Rules - CDK will manage adding/removing these as needed
1013
+ ec2_port_gradio_server_port = ec2.Port.tcp(
1014
+ int(GRADIO_SERVER_PORT)
1015
+ ) # Ensure port is int
1016
+ ecs_security_group.add_ingress_rule(
1017
+ peer=alb_security_group,
1018
+ connection=ec2_port_gradio_server_port,
1019
+ description="ALB traffic",
1020
+ )
1021
+
1022
+ alb_security_group.add_ingress_rule(
1023
+ peer=ec2.Peer.prefix_list("pl-93a247fa"),
1024
+ connection=ec2.Port.all_traffic(),
1025
+ description="CloudFront traffic",
1026
+ )
1027
+
1028
+ except Exception as e:
1029
+ raise Exception("Could not handle security groups due to:", e)
1030
+
1031
+ # --- DynamoDB tables for logs (optional) ---
1032
+
1033
+ if SAVE_LOGS_TO_DYNAMODB == "True":
1034
+ try:
1035
+ print("Creating DynamoDB tables for logs")
1036
+
1037
+ dynamodb.Table(
1038
+ self,
1039
+ "RedactionAccessDataTable",
1040
+ table_name=ACCESS_LOG_DYNAMODB_TABLE_NAME,
1041
+ partition_key=dynamodb.Attribute(
1042
+ name="id", type=dynamodb.AttributeType.STRING
1043
+ ),
1044
+ billing_mode=dynamodb.BillingMode.PAY_PER_REQUEST,
1045
+ removal_policy=RemovalPolicy.DESTROY,
1046
+ )
1047
+
1048
+ dynamodb.Table(
1049
+ self,
1050
+ "RedactionFeedbackDataTable",
1051
+ table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
1052
+ partition_key=dynamodb.Attribute(
1053
+ name="id", type=dynamodb.AttributeType.STRING
1054
+ ),
1055
+ billing_mode=dynamodb.BillingMode.PAY_PER_REQUEST,
1056
+ removal_policy=RemovalPolicy.DESTROY,
1057
+ )
1058
+
1059
+ dynamodb.Table(
1060
+ self,
1061
+ "RedactionUsageDataTable",
1062
+ table_name=USAGE_LOG_DYNAMODB_TABLE_NAME,
1063
+ partition_key=dynamodb.Attribute(
1064
+ name="id", type=dynamodb.AttributeType.STRING
1065
+ ),
1066
+ billing_mode=dynamodb.BillingMode.PAY_PER_REQUEST,
1067
+ removal_policy=RemovalPolicy.DESTROY,
1068
+ )
1069
+
1070
+ except Exception as e:
1071
+ raise Exception("Could not create DynamoDB tables due to:", e)
1072
+
1073
+ # --- ALB ---
1074
+ try:
1075
+ load_balancer_name = ALB_NAME
1076
+ if len(load_balancer_name) > 32:
1077
+ load_balancer_name = load_balancer_name[-32:]
1078
+ if get_context_bool(f"exists:{load_balancer_name}"):
1079
+ # Lookup ALB by ARN from context
1080
+ alb_arn = get_context_str(f"arn:{load_balancer_name}")
1081
+ if not alb_arn:
1082
+ raise ValueError(
1083
+ f"Context value 'arn:{load_balancer_name}' is required if ALB exists."
1084
+ )
1085
+ alb = elbv2.ApplicationLoadBalancer.from_lookup(
1086
+ self, "ALB", load_balancer_arn=alb_arn # Logical ID
1087
+ )
1088
+ print(f"Using existing Application Load Balancer {load_balancer_name}.")
1089
+ else:
1090
+ alb = elbv2.ApplicationLoadBalancer(
1091
+ self,
1092
+ "ALB", # Logical ID
1093
+ load_balancer_name=load_balancer_name, # Explicit resource name
1094
+ vpc=vpc,
1095
+ internet_facing=True,
1096
+ security_group=alb_security_group, # Link to SG
1097
+ vpc_subnets=public_subnet_selection, # Link to subnets
1098
+ )
1099
+ print("Successfully created new Application Load Balancer")
1100
+ except Exception as e:
1101
+ raise Exception("Could not handle application load balancer due to:", e)
1102
+
1103
+ # --- Cognito User Pool ---
1104
+ try:
1105
+ if get_context_bool(f"exists:{COGNITO_USER_POOL_NAME}"):
1106
+ # Lookup by ID from context
1107
+ user_pool_id = get_context_str(f"id:{COGNITO_USER_POOL_NAME}")
1108
+ if not user_pool_id:
1109
+ raise ValueError(
1110
+ f"Context value 'id:{COGNITO_USER_POOL_NAME}' is required if User Pool exists."
1111
+ )
1112
+ user_pool = cognito.UserPool.from_user_pool_id(
1113
+ self, "UserPool", user_pool_id=user_pool_id
1114
+ )
1115
+ print(f"Using existing user pool {user_pool_id}.")
1116
+ else:
1117
+ user_pool = cognito.UserPool(
1118
+ self,
1119
+ "UserPool",
1120
+ user_pool_name=COGNITO_USER_POOL_NAME,
1121
+ mfa=cognito.Mfa.OFF, # Adjust as needed
1122
+ sign_in_aliases=cognito.SignInAliases(email=True),
1123
+ removal_policy=RemovalPolicy.DESTROY,
1124
+ ) # Adjust as needed
1125
+ print(f"Created new user pool {user_pool.user_pool_id}.")
1126
+
1127
+ # If you're using a certificate, assume that you will be using the ALB Cognito login features. You need different redirect URLs to accept the token that comes from Cognito authentication.
1128
+ if ACM_SSL_CERTIFICATE_ARN:
1129
+ redirect_uris = [
1130
+ COGNITO_REDIRECTION_URL,
1131
+ COGNITO_REDIRECTION_URL + "/oauth2/idpresponse",
1132
+ ]
1133
+ else:
1134
+ redirect_uris = [COGNITO_REDIRECTION_URL]
1135
+
1136
+ user_pool_client_name = COGNITO_USER_POOL_CLIENT_NAME
1137
+ if get_context_bool(f"exists:{user_pool_client_name}"):
1138
+ # Lookup by ID from context (requires User Pool object)
1139
+ user_pool_client_id = get_context_str(f"id:{user_pool_client_name}")
1140
+ if not user_pool_client_id:
1141
+ raise ValueError(
1142
+ f"Context value 'id:{user_pool_client_name}' is required if User Pool Client exists."
1143
+ )
1144
+ user_pool_client = cognito.UserPoolClient.from_user_pool_client_id(
1145
+ self, "UserPoolClient", user_pool_client_id=user_pool_client_id
1146
+ )
1147
+ print(f"Using existing user pool client {user_pool_client_id}.")
1148
+ else:
1149
+ user_pool_client = cognito.UserPoolClient(
1150
+ self,
1151
+ "UserPoolClient",
1152
+ auth_flows=cognito.AuthFlow(
1153
+ user_srp=True, user_password=True
1154
+ ), # Example: enable SRP for secure sign-in
1155
+ user_pool=user_pool,
1156
+ generate_secret=True,
1157
+ user_pool_client_name=user_pool_client_name,
1158
+ supported_identity_providers=[
1159
+ cognito.UserPoolClientIdentityProvider.COGNITO
1160
+ ],
1161
+ o_auth=cognito.OAuthSettings(
1162
+ flows=cognito.OAuthFlows(authorization_code_grant=True),
1163
+ scopes=[
1164
+ cognito.OAuthScope.OPENID,
1165
+ cognito.OAuthScope.EMAIL,
1166
+ cognito.OAuthScope.PROFILE,
1167
+ ],
1168
+ callback_urls=redirect_uris,
1169
+ ),
1170
+ refresh_token_validity=Duration.minutes(
1171
+ COGNITO_REFRESH_TOKEN_VALIDITY
1172
+ ),
1173
+ id_token_validity=Duration.minutes(COGNITO_ID_TOKEN_VALIDITY),
1174
+ access_token_validity=Duration.minutes(
1175
+ COGNITO_ACCESS_TOKEN_VALIDITY
1176
+ ),
1177
+ )
1178
+
1179
+ CfnOutput(
1180
+ self, "CognitoAppClientId", value=user_pool_client.user_pool_client_id
1181
+ )
1182
+
1183
+ print(
1184
+ f"Created new user pool client {user_pool_client.user_pool_client_id}."
1185
+ )
1186
+
1187
+ # Add a domain to the User Pool (crucial for ALB integration)
1188
+ user_pool_domain = user_pool.add_domain(
1189
+ "UserPoolDomain",
1190
+ cognito_domain=cognito.CognitoDomainOptions(
1191
+ domain_prefix=COGNITO_USER_POOL_DOMAIN_PREFIX
1192
+ ),
1193
+ )
1194
+
1195
+ # Apply removal_policy to the created UserPoolDomain construct
1196
+ user_pool_domain.apply_removal_policy(policy=RemovalPolicy.DESTROY)
1197
+
1198
+ CfnOutput(
1199
+ self, "CognitoUserPoolLoginUrl", value=user_pool_domain.base_url()
1200
+ )
1201
+
1202
+ except Exception as e:
1203
+ raise Exception("Could not handle Cognito resources due to:", e)
1204
+
1205
+ # --- Secrets Manager Secret ---
1206
+ try:
1207
+ secret_name = COGNITO_USER_POOL_CLIENT_SECRET_NAME
1208
+ if get_context_bool(f"exists:{secret_name}"):
1209
+ # Lookup by name
1210
+ secret = secretsmanager.Secret.from_secret_name_v2(
1211
+ self, "CognitoSecret", secret_name=secret_name
1212
+ )
1213
+ print("Using existing Secret.")
1214
+ else:
1215
+ if USE_CUSTOM_KMS_KEY == "1" and isinstance(kms_key, kms.Key):
1216
+ secret = secretsmanager.Secret(
1217
+ self,
1218
+ "CognitoSecret", # Logical ID
1219
+ secret_name=secret_name, # Explicit resource name
1220
+ secret_object_value={
1221
+ "REDACTION_USER_POOL_ID": SecretValue.unsafe_plain_text(
1222
+ user_pool.user_pool_id
1223
+ ), # Use the CDK attribute
1224
+ "REDACTION_CLIENT_ID": SecretValue.unsafe_plain_text(
1225
+ user_pool_client.user_pool_client_id
1226
+ ), # Use the CDK attribute
1227
+ "REDACTION_CLIENT_SECRET": user_pool_client.user_pool_client_secret, # Use the CDK attribute
1228
+ },
1229
+ encryption_key=kms_key,
1230
+ )
1231
+ else:
1232
+ secret = secretsmanager.Secret(
1233
+ self,
1234
+ "CognitoSecret", # Logical ID
1235
+ secret_name=secret_name, # Explicit resource name
1236
+ secret_object_value={
1237
+ "REDACTION_USER_POOL_ID": SecretValue.unsafe_plain_text(
1238
+ user_pool.user_pool_id
1239
+ ), # Use the CDK attribute
1240
+ "REDACTION_CLIENT_ID": SecretValue.unsafe_plain_text(
1241
+ user_pool_client.user_pool_client_id
1242
+ ), # Use the CDK attribute
1243
+ "REDACTION_CLIENT_SECRET": user_pool_client.user_pool_client_secret, # Use the CDK attribute
1244
+ },
1245
+ )
1246
+
1247
+ print(
1248
+ "Created new secret in Secrets Manager for Cognito user pool and related details."
1249
+ )
1250
+
1251
+ except Exception as e:
1252
+ raise Exception("Could not handle Secrets Manager secret due to:", e)
1253
+
1254
+ # --- Fargate Task Definition ---
1255
+ try:
1256
+ fargate_task_definition_name = FARGATE_TASK_DEFINITION_NAME
1257
+
1258
+ read_only_file_system = ECS_READ_ONLY_FILE_SYSTEM == "True"
1259
+
1260
+ if os.path.exists(TASK_DEFINITION_FILE_LOCATION):
1261
+ with open(TASK_DEFINITION_FILE_LOCATION) as f: # Use correct path
1262
+ task_def_params = json.load(f)
1263
+ # Need to ensure taskRoleArn and executionRoleArn in JSON are correct ARN strings
1264
+ else:
1265
+ epheremal_storage_volume_name = "appEphemeralVolume"
1266
+
1267
+ task_def_params = {}
1268
+ task_def_params["taskRoleArn"] = (
1269
+ task_role.role_arn
1270
+ ) # Use CDK role object ARN
1271
+ task_def_params["executionRoleArn"] = (
1272
+ execution_role.role_arn
1273
+ ) # Use CDK role object ARN
1274
+ task_def_params["memory"] = ECS_TASK_MEMORY_SIZE
1275
+ task_def_params["cpu"] = ECS_TASK_CPU_SIZE
1276
+ container_def = {
1277
+ "name": full_ecr_repo_name,
1278
+ "image": ecr_image_loc + ":latest",
1279
+ "essential": True,
1280
+ "portMappings": [
1281
+ {
1282
+ "containerPort": int(GRADIO_SERVER_PORT),
1283
+ "hostPort": int(GRADIO_SERVER_PORT),
1284
+ "protocol": "tcp",
1285
+ "appProtocol": "http",
1286
+ }
1287
+ ],
1288
+ "logConfiguration": {
1289
+ "logDriver": "awslogs",
1290
+ "options": {
1291
+ "awslogs-group": ECS_LOG_GROUP_NAME,
1292
+ "awslogs-region": AWS_REGION,
1293
+ "awslogs-stream-prefix": "ecs",
1294
+ },
1295
+ },
1296
+ "environmentFiles": [
1297
+ {"value": bucket.bucket_arn + "/config.env", "type": "s3"}
1298
+ ],
1299
+ "memoryReservation": int(task_def_params["memory"])
1300
+ - 512, # Reserve some memory for the container
1301
+ "mountPoints": [
1302
+ {
1303
+ "sourceVolume": epheremal_storage_volume_name,
1304
+ "containerPath": "/home/user/app/logs",
1305
+ "readOnly": False,
1306
+ },
1307
+ {
1308
+ "sourceVolume": epheremal_storage_volume_name,
1309
+ "containerPath": "/home/user/app/feedback",
1310
+ "readOnly": False,
1311
+ },
1312
+ {
1313
+ "sourceVolume": epheremal_storage_volume_name,
1314
+ "containerPath": "/home/user/app/usage",
1315
+ "readOnly": False,
1316
+ },
1317
+ {
1318
+ "sourceVolume": epheremal_storage_volume_name,
1319
+ "containerPath": "/home/user/app/input",
1320
+ "readOnly": False,
1321
+ },
1322
+ {
1323
+ "sourceVolume": epheremal_storage_volume_name,
1324
+ "containerPath": "/home/user/app/output",
1325
+ "readOnly": False,
1326
+ },
1327
+ {
1328
+ "sourceVolume": epheremal_storage_volume_name,
1329
+ "containerPath": "/home/user/app/tmp",
1330
+ "readOnly": False,
1331
+ },
1332
+ {
1333
+ "sourceVolume": epheremal_storage_volume_name,
1334
+ "containerPath": "/home/user/app/config",
1335
+ "readOnly": False,
1336
+ },
1337
+ {
1338
+ "sourceVolume": epheremal_storage_volume_name,
1339
+ "containerPath": "/tmp/matplotlib_cache",
1340
+ "readOnly": False,
1341
+ },
1342
+ {
1343
+ "sourceVolume": epheremal_storage_volume_name,
1344
+ "containerPath": "/tmp",
1345
+ "readOnly": False,
1346
+ },
1347
+ {
1348
+ "sourceVolume": epheremal_storage_volume_name,
1349
+ "containerPath": "/var/tmp",
1350
+ "readOnly": False,
1351
+ },
1352
+ {
1353
+ "sourceVolume": epheremal_storage_volume_name,
1354
+ "containerPath": "/tmp/tld",
1355
+ "readOnly": False,
1356
+ },
1357
+ {
1358
+ "sourceVolume": epheremal_storage_volume_name,
1359
+ "containerPath": "/tmp/gradio_tmp",
1360
+ "readOnly": False,
1361
+ },
1362
+ {
1363
+ "sourceVolume": epheremal_storage_volume_name,
1364
+ "containerPath": "/home/user/.paddlex",
1365
+ "readOnly": False,
1366
+ },
1367
+ {
1368
+ "sourceVolume": epheremal_storage_volume_name,
1369
+ "containerPath": "/home/user/.local/share/spacy/data",
1370
+ "readOnly": False,
1371
+ },
1372
+ {
1373
+ "sourceVolume": epheremal_storage_volume_name,
1374
+ "containerPath": "/usr/share/tessdata",
1375
+ "readOnly": False,
1376
+ },
1377
+ ],
1378
+ "readonlyRootFilesystem": read_only_file_system,
1379
+ }
1380
+ task_def_params["containerDefinitions"] = [container_def]
1381
+
1382
+ log_group_name_from_config = task_def_params["containerDefinitions"][0][
1383
+ "logConfiguration"
1384
+ ]["options"]["awslogs-group"]
1385
+
1386
+ cdk_managed_log_group = logs.LogGroup(
1387
+ self,
1388
+ "MyTaskLogGroup", # CDK Logical ID
1389
+ log_group_name=log_group_name_from_config,
1390
+ retention=logs.RetentionDays.ONE_MONTH,
1391
+ removal_policy=RemovalPolicy.DESTROY,
1392
+ )
1393
+
1394
+ epheremal_storage_volume_cdk_obj = ecs.Volume(
1395
+ name=epheremal_storage_volume_name
1396
+ )
1397
+
1398
+ fargate_task_definition = ecs.FargateTaskDefinition(
1399
+ self,
1400
+ "FargateTaskDefinition", # Logical ID
1401
+ family=fargate_task_definition_name,
1402
+ cpu=int(task_def_params["cpu"]),
1403
+ memory_limit_mib=int(task_def_params["memory"]),
1404
+ task_role=task_role,
1405
+ execution_role=execution_role,
1406
+ runtime_platform=ecs.RuntimePlatform(
1407
+ cpu_architecture=ecs.CpuArchitecture.X86_64,
1408
+ operating_system_family=ecs.OperatingSystemFamily.LINUX,
1409
+ ),
1410
+ ephemeral_storage_gib=21, # Minimum is 21 GiB
1411
+ volumes=[epheremal_storage_volume_cdk_obj],
1412
+ )
1413
+ print("Fargate task definition defined.")
1414
+
1415
+ # Add container definitions to the task definition object
1416
+ if task_def_params["containerDefinitions"]:
1417
+ container_def_params = task_def_params["containerDefinitions"][0]
1418
+
1419
+ if container_def_params.get("environmentFiles"):
1420
+ env_files = []
1421
+ for env_file_param in container_def_params["environmentFiles"]:
1422
+ # Need to parse the ARN to get the bucket object and key
1423
+ env_file_arn_parts = env_file_param["value"].split(":::")
1424
+ bucket_name_and_key = env_file_arn_parts[-1]
1425
+ env_bucket_name, env_key = bucket_name_and_key.split("/", 1)
1426
+
1427
+ env_file = ecs.EnvironmentFile.from_bucket(bucket, env_key)
1428
+
1429
+ env_files.append(env_file)
1430
+
1431
+ container = fargate_task_definition.add_container(
1432
+ container_def_params["name"],
1433
+ image=ecs.ContainerImage.from_registry(
1434
+ container_def_params["image"]
1435
+ ),
1436
+ logging=ecs.LogDriver.aws_logs(
1437
+ stream_prefix=container_def_params["logConfiguration"][
1438
+ "options"
1439
+ ]["awslogs-stream-prefix"],
1440
+ log_group=cdk_managed_log_group,
1441
+ ),
1442
+ secrets={
1443
+ "AWS_USER_POOL_ID": ecs.Secret.from_secrets_manager(
1444
+ secret, "REDACTION_USER_POOL_ID"
1445
+ ),
1446
+ "AWS_CLIENT_ID": ecs.Secret.from_secrets_manager(
1447
+ secret, "REDACTION_CLIENT_ID"
1448
+ ),
1449
+ "AWS_CLIENT_SECRET": ecs.Secret.from_secrets_manager(
1450
+ secret, "REDACTION_CLIENT_SECRET"
1451
+ ),
1452
+ },
1453
+ environment_files=env_files,
1454
+ readonly_root_filesystem=read_only_file_system,
1455
+ )
1456
+
1457
+ for port_mapping in container_def_params["portMappings"]:
1458
+ container.add_port_mappings(
1459
+ ecs.PortMapping(
1460
+ container_port=int(port_mapping["containerPort"]),
1461
+ host_port=int(port_mapping["hostPort"]),
1462
+ name="port-" + str(port_mapping["containerPort"]),
1463
+ app_protocol=ecs.AppProtocol.http,
1464
+ protocol=ecs.Protocol.TCP,
1465
+ )
1466
+ )
1467
+
1468
+ container.add_port_mappings(
1469
+ ecs.PortMapping(
1470
+ container_port=80,
1471
+ host_port=80,
1472
+ name="port-80",
1473
+ app_protocol=ecs.AppProtocol.http,
1474
+ protocol=ecs.Protocol.TCP,
1475
+ )
1476
+ )
1477
+
1478
+ if container_def_params.get("mountPoints"):
1479
+ mount_points = []
1480
+ for mount_point in container_def_params["mountPoints"]:
1481
+ mount_points.append(
1482
+ ecs.MountPoint(
1483
+ container_path=mount_point["containerPath"],
1484
+ read_only=mount_point["readOnly"],
1485
+ source_volume=epheremal_storage_volume_name,
1486
+ )
1487
+ )
1488
+ container.add_mount_points(*mount_points)
1489
+
1490
+ except Exception as e:
1491
+ raise Exception("Could not handle Fargate task definition due to:", e)
1492
+
1493
+ # --- ECS Cluster ---
1494
+ try:
1495
+ cluster = ecs.Cluster(
1496
+ self,
1497
+ "ECSCluster", # Logical ID
1498
+ cluster_name=CLUSTER_NAME, # Explicit resource name
1499
+ enable_fargate_capacity_providers=True,
1500
+ vpc=vpc,
1501
+ )
1502
+ print("Successfully created new ECS cluster")
1503
+ except Exception as e:
1504
+ raise Exception("Could not handle ECS cluster due to:", e)
1505
+
1506
+ # --- ECS Service ---
1507
+ try:
1508
+ ecs_service_name = ECS_SERVICE_NAME
1509
+
1510
+ if ECS_USE_FARGATE_SPOT == "True":
1511
+ use_fargate_spot = "FARGATE_SPOT"
1512
+ if ECS_USE_FARGATE_SPOT == "False":
1513
+ use_fargate_spot = "FARGATE"
1514
+
1515
+ # Check if service exists - from_service_arn or from_service_name (needs cluster)
1516
+ try:
1517
+ # from_service_name is useful if you have the cluster object
1518
+ ecs_service = ecs.FargateService.from_service_attributes(
1519
+ self,
1520
+ "ECSService", # Logical ID
1521
+ cluster=cluster, # Requires the cluster object
1522
+ service_name=ecs_service_name,
1523
+ )
1524
+ print(f"Using existing ECS service {ecs_service_name}.")
1525
+ except Exception:
1526
+ # Service will be created with a count of 0, because you haven't yet actually built the initial Docker container with CodeBuild
1527
+ ecs_service = ecs.FargateService(
1528
+ self,
1529
+ "ECSService", # Logical ID
1530
+ service_name=ecs_service_name, # Explicit resource name
1531
+ platform_version=ecs.FargatePlatformVersion.LATEST,
1532
+ capacity_provider_strategies=[
1533
+ ecs.CapacityProviderStrategy(
1534
+ capacity_provider=use_fargate_spot, base=0, weight=1
1535
+ )
1536
+ ],
1537
+ cluster=cluster,
1538
+ task_definition=fargate_task_definition, # Link to TD
1539
+ security_groups=[ecs_security_group], # Link to SG
1540
+ vpc_subnets=ec2.SubnetSelection(
1541
+ subnets=self.private_subnets
1542
+ ), # Link to subnets
1543
+ min_healthy_percent=0,
1544
+ max_healthy_percent=100,
1545
+ desired_count=0,
1546
+ )
1547
+ print("Successfully created new ECS service")
1548
+
1549
+ # Note: Auto-scaling setup would typically go here if needed for the service
1550
+
1551
+ except Exception as e:
1552
+ raise Exception("Could not handle ECS service due to:", e)
1553
+
1554
+ # --- Grant Secret Read Access (Applies to both created and imported roles) ---
1555
+ try:
1556
+ secret.grant_read(task_role)
1557
+ secret.grant_read(execution_role)
1558
+ except Exception as e:
1559
+ raise Exception("Could not grant access to Secrets Manager due to:", e)
1560
+
1561
+ # --- ALB TARGET GROUPS AND LISTENERS ---
1562
+ # This section should primarily define the resources if they are managed by this stack.
1563
+ # CDK handles adding/removing targets and actions on updates.
1564
+ # If they might pre-exist outside the stack, you need lookups.
1565
+ cookie_duration = Duration.hours(12)
1566
+ target_group_name = ALB_TARGET_GROUP_NAME # Explicit resource name
1567
+ cloudfront_distribution_url = "cloudfront_placeholder.net" # Need to replace this afterwards with the actual cloudfront_distribution.domain_name
1568
+
1569
+ try:
1570
+ # --- CREATING TARGET GROUPS AND ADDING THE CLOUDFRONT LISTENER RULE ---
1571
+
1572
+ target_group = elbv2.ApplicationTargetGroup(
1573
+ self,
1574
+ "AppTargetGroup", # Logical ID
1575
+ target_group_name=target_group_name, # Explicit resource name
1576
+ port=int(GRADIO_SERVER_PORT), # Ensure port is int
1577
+ protocol=elbv2.ApplicationProtocol.HTTP,
1578
+ targets=[ecs_service], # Link to ECS Service
1579
+ stickiness_cookie_duration=cookie_duration,
1580
+ vpc=vpc, # Target Groups need VPC
1581
+ )
1582
+ print(f"ALB target group {target_group_name} defined.")
1583
+
1584
+ # First HTTP
1585
+ listener_port = 80
1586
+ # Check if Listener exists - from_listener_arn or lookup by port/ALB
1587
+
1588
+ http_listener = alb.add_listener(
1589
+ "HttpListener", # Logical ID
1590
+ port=listener_port,
1591
+ open=False, # Be cautious with open=True, usually restrict source SG
1592
+ )
1593
+ print(f"ALB listener on port {listener_port} defined.")
1594
+
1595
+ if ACM_SSL_CERTIFICATE_ARN:
1596
+ http_listener.add_action(
1597
+ "DefaultAction", # Logical ID for the default action
1598
+ action=elbv2.ListenerAction.redirect(
1599
+ protocol="HTTPS",
1600
+ host="#{host}",
1601
+ port="443",
1602
+ path="/#{path}",
1603
+ query="#{query}",
1604
+ ),
1605
+ )
1606
+ else:
1607
+ if USE_CLOUDFRONT == "True":
1608
+
1609
+ # The following default action can be added for the listener after a host header rule is added to the listener manually in the Console as suggested in the above comments.
1610
+ http_listener.add_action(
1611
+ "DefaultAction", # Logical ID for the default action
1612
+ action=elbv2.ListenerAction.fixed_response(
1613
+ status_code=403,
1614
+ content_type="text/plain",
1615
+ message_body="Access denied",
1616
+ ),
1617
+ )
1618
+
1619
+ # Add the Listener Rule for the specific CloudFront Host Header
1620
+ http_listener.add_action(
1621
+ "CloudFrontHostHeaderRule",
1622
+ action=elbv2.ListenerAction.forward(
1623
+ target_groups=[target_group],
1624
+ stickiness_duration=cookie_duration,
1625
+ ),
1626
+ priority=1, # Example priority. Adjust as needed. Lower is evaluated first.
1627
+ conditions=[
1628
+ elbv2.ListenerCondition.host_headers(
1629
+ [cloudfront_distribution_url]
1630
+ ) # May have to redefine url in console afterwards if not specified in config file
1631
+ ],
1632
+ )
1633
+
1634
+ else:
1635
+ # Add the Listener Rule for the specific CloudFront Host Header
1636
+ http_listener.add_action(
1637
+ "CloudFrontHostHeaderRule",
1638
+ action=elbv2.ListenerAction.forward(
1639
+ target_groups=[target_group],
1640
+ stickiness_duration=cookie_duration,
1641
+ ),
1642
+ )
1643
+
1644
+ print("Added targets and actions to ALB HTTP listener.")
1645
+
1646
+ # Now the same for HTTPS if you have an ACM certificate
1647
+ if ACM_SSL_CERTIFICATE_ARN:
1648
+ listener_port_https = 443
1649
+ # Check if Listener exists - from_listener_arn or lookup by port/ALB
1650
+
1651
+ https_listener = add_alb_https_listener_with_cert(
1652
+ self,
1653
+ "MyHttpsListener", # Logical ID for the HTTPS listener
1654
+ alb,
1655
+ acm_certificate_arn=ACM_SSL_CERTIFICATE_ARN,
1656
+ default_target_group=target_group,
1657
+ enable_cognito_auth=True,
1658
+ cognito_user_pool=user_pool,
1659
+ cognito_user_pool_client=user_pool_client,
1660
+ cognito_user_pool_domain=user_pool_domain,
1661
+ listener_open_to_internet=True,
1662
+ stickiness_cookie_duration=cookie_duration,
1663
+ )
1664
+
1665
+ if https_listener:
1666
+ CfnOutput(
1667
+ self, "HttpsListenerArn", value=https_listener.listener_arn
1668
+ )
1669
+
1670
+ print(f"ALB listener on port {listener_port_https} defined.")
1671
+
1672
+ # if USE_CLOUDFRONT == 'True':
1673
+ # # Add default action to the listener
1674
+ # https_listener.add_action(
1675
+ # "DefaultAction", # Logical ID for the default action
1676
+ # action=elbv2.ListenerAction.fixed_response(
1677
+ # status_code=403,
1678
+ # content_type="text/plain",
1679
+ # message_body="Access denied",
1680
+ # ),
1681
+ # )
1682
+
1683
+ # # Add the Listener Rule for the specific CloudFront Host Header
1684
+ # https_listener.add_action(
1685
+ # "CloudFrontHostHeaderRuleHTTPS",
1686
+ # action=elbv2.ListenerAction.forward(target_groups=[target_group],stickiness_duration=cookie_duration),
1687
+ # priority=1, # Example priority. Adjust as needed. Lower is evaluated first.
1688
+ # conditions=[
1689
+ # elbv2.ListenerCondition.host_headers([cloudfront_distribution_url])
1690
+ # ]
1691
+ # )
1692
+ # else:
1693
+ # https_listener.add_action(
1694
+ # "CloudFrontHostHeaderRuleHTTPS",
1695
+ # action=elbv2.ListenerAction.forward(target_groups=[target_group],stickiness_duration=cookie_duration))
1696
+
1697
+ print("Added targets and actions to ALB HTTPS listener.")
1698
+
1699
+ except Exception as e:
1700
+ raise Exception(
1701
+ "Could not handle ALB target groups and listeners due to:", e
1702
+ )
1703
+
1704
+ # Create WAF to attach to load balancer
1705
+ try:
1706
+ web_acl_name = LOAD_BALANCER_WEB_ACL_NAME
1707
+ if get_context_bool(f"exists:{web_acl_name}"):
1708
+ # Lookup WAF ACL by ARN from context
1709
+ web_acl_arn = get_context_str(f"arn:{web_acl_name}")
1710
+ if not web_acl_arn:
1711
+ raise ValueError(
1712
+ f"Context value 'arn:{web_acl_name}' is required if Web ACL exists."
1713
+ )
1714
+
1715
+ web_acl = create_web_acl_with_common_rules(
1716
+ self, web_acl_name, waf_scope="REGIONAL"
1717
+ ) # Assuming it takes scope and name
1718
+ print(f"Handled ALB WAF web ACL {web_acl_name}.")
1719
+ else:
1720
+ web_acl = create_web_acl_with_common_rules(
1721
+ self, web_acl_name, waf_scope="REGIONAL"
1722
+ ) # Assuming it takes scope and name
1723
+ print(f"Created ALB WAF web ACL {web_acl_name}.")
1724
+
1725
+ wafv2.CfnWebACLAssociation(
1726
+ self,
1727
+ id="alb_waf_association",
1728
+ resource_arn=alb.load_balancer_arn,
1729
+ web_acl_arn=web_acl.attr_arn,
1730
+ )
1731
+
1732
+ except Exception as e:
1733
+ raise Exception("Could not handle create ALB WAF web ACL due to:", e)
1734
+
1735
+ # --- Outputs for other stacks/regions ---
1736
+
1737
+ self.params = dict()
1738
+ self.params["alb_arn_output"] = alb.load_balancer_arn
1739
+ self.params["alb_security_group_id"] = alb_security_group.security_group_id
1740
+ self.params["alb_dns_name"] = alb.load_balancer_dns_name
1741
+
1742
+ CfnOutput(
1743
+ self,
1744
+ "AlbArnOutput",
1745
+ value=alb.load_balancer_arn,
1746
+ description="ARN of the Application Load Balancer",
1747
+ export_name=f"{self.stack_name}-AlbArn",
1748
+ ) # Export name must be unique within the account/region
1749
+
1750
+ CfnOutput(
1751
+ self,
1752
+ "AlbSecurityGroupIdOutput",
1753
+ value=alb_security_group.security_group_id,
1754
+ description="ID of the ALB's Security Group",
1755
+ export_name=f"{self.stack_name}-AlbSgId",
1756
+ )
1757
+ CfnOutput(self, "ALBName", value=alb.load_balancer_name)
1758
+
1759
+ CfnOutput(self, "RegionalAlbDnsName", value=alb.load_balancer_dns_name)
1760
+
1761
+ CfnOutput(self, "CognitoPoolId", value=user_pool.user_pool_id)
1762
+ # Add other outputs if needed
1763
+
1764
+ CfnOutput(self, "ECRRepoUri", value=ecr_repo.repository_uri)
1765
+
1766
+
1767
# --- CLOUDFRONT DISTRIBUTION in separate stack (us-east-1 required) ---
class CdkStackCloudfront(Stack):
    """Deploys the CloudFront distribution (and its WAF web ACL) that fronts
    the regional Application Load Balancer.

    Kept in a separate stack because CloudFront-scoped WAF resources must be
    created in us-east-1, regardless of the region hosting the ALB. The ALB
    is imported by attributes (ARN, security-group id, DNS name) passed in
    from the regional stack's outputs.
    """

    def __init__(
        self,
        scope: Construct,
        construct_id: str,
        alb_arn: str,
        alb_sec_group_id: str,
        alb_dns_name: str,
        **kwargs,
    ) -> None:
        """Build the CloudFront distribution.

        Args:
            scope: Parent CDK construct.
            construct_id: Logical ID of this stack.
            alb_arn: ARN of the ALB exported by the regional stack.
            alb_sec_group_id: Security group ID of that ALB.
            alb_dns_name: DNS name of that ALB (used as the origin host).
            **kwargs: Forwarded to ``Stack.__init__`` (e.g. ``env``).

        Raises:
            ValueError: if ``alb_arn`` or ``alb_sec_group_id`` is empty.
            Exception: wrapping any failure while defining the distribution.
        """
        super().__init__(scope, construct_id, **kwargs)

        # --- Helpers to read values from CDK context (cdk.context.json) ---
        # NOTE(review): these use ``or default``, so a context value that is
        # falsy (False, "", {}) is indistinguishable from "not set" — confirm
        # that is acceptable for every key looked up below.
        def get_context_bool(key: str, default: bool = False) -> bool:
            return self.node.try_get_context(key) or default

        def get_context_str(key: str, default: str = None) -> str:
            return self.node.try_get_context(key) or default

        # Currently unused in this stack; presumably kept for parity with the
        # regional stack's helpers.
        def get_context_dict(scope: Construct, key: str, default: dict = None) -> dict:
            return scope.node.try_get_context(key) or default

        print(f"CloudFront Stack: Received ALB ARN: {alb_arn}")
        print(f"CloudFront Stack: Received ALB Security Group ID: {alb_sec_group_id}")

        # Fail fast: both values are required to import the ALB below.
        if not alb_arn:
            raise ValueError("ALB ARN must be provided to CloudFront stack")
        if not alb_sec_group_id:
            raise ValueError(
                "ALB Security Group ID must be provided to CloudFront stack"
            )

        # 2. Import the ALB using its ARN
        # This imports an existing ALB as a construct in the CloudFront stack's context.
        # CloudFormation will understand this reference at deploy time.
        alb = elbv2.ApplicationLoadBalancer.from_application_load_balancer_attributes(
            self,
            "ImportedAlb",
            load_balancer_arn=alb_arn,
            security_group_id=alb_sec_group_id,
            load_balancer_dns_name=alb_dns_name,
        )

        try:
            web_acl_name = WEB_ACL_NAME
            if get_context_bool(f"exists:{web_acl_name}"):
                # Lookup WAF ACL by ARN from context
                web_acl_arn = get_context_str(f"arn:{web_acl_name}")
                if not web_acl_arn:
                    raise ValueError(
                        f"Context value 'arn:{web_acl_name}' is required if Web ACL exists."
                    )

                # NOTE(review): the looked-up ARN is validated but never used —
                # both branches define the ACL identically via the helper.
                web_acl = create_web_acl_with_common_rules(
                    self, web_acl_name
                )  # Assuming it takes scope and name
                print(f"Handled Cloudfront WAF web ACL {web_acl_name}.")
            else:
                web_acl = create_web_acl_with_common_rules(
                    self, web_acl_name
                )  # Assuming it takes scope and name
                print(f"Created Cloudfront WAF web ACL {web_acl_name}.")

            # Add ALB as CloudFront Origin. The custom header lets the ALB
            # recognise (and restrict to) traffic arriving via CloudFront.
            origin = origins.LoadBalancerV2Origin(
                alb,  # Use the created or looked-up ALB object
                custom_headers={CUSTOM_HEADER: CUSTOM_HEADER_VALUE},
                origin_shield_enabled=False,
                protocol_policy=cloudfront.OriginProtocolPolicy.HTTP_ONLY,
            )

            # Optional geo allow-list; None disables geo restriction entirely.
            if CLOUDFRONT_GEO_RESTRICTION:
                geo_restrict = cloudfront.GeoRestriction.allowlist(
                    CLOUDFRONT_GEO_RESTRICTION
                )
            else:
                geo_restrict = None

            cloudfront_distribution = cloudfront.Distribution(
                self,
                "CloudFrontDistribution",  # Logical ID
                comment=CLOUDFRONT_DISTRIBUTION_NAME,  # Use name as comment for easier identification
                geo_restriction=geo_restrict,
                default_behavior=cloudfront.BehaviorOptions(
                    origin=origin,
                    viewer_protocol_policy=cloudfront.ViewerProtocolPolicy.REDIRECT_TO_HTTPS,
                    allowed_methods=cloudfront.AllowedMethods.ALLOW_ALL,
                    cache_policy=cloudfront.CachePolicy.CACHING_DISABLED,
                    origin_request_policy=cloudfront.OriginRequestPolicy.ALL_VIEWER,
                ),
                web_acl_id=web_acl.attr_arn,
            )
            print(f"Cloudfront distribution {CLOUDFRONT_DISTRIBUTION_NAME} defined.")

        except Exception as e:
            raise Exception("Could not handle Cloudfront distribution due to:", e)

        # --- Outputs ---
        CfnOutput(
            self, "CloudFrontDistributionURL", value=cloudfront_distribution.domain_name
        )
cdk/check_resources.py ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from typing import Any, Dict, List
4
+
5
+ from cdk_config import ( # Import necessary config
6
+ ALB_NAME,
7
+ AWS_REGION,
8
+ CDK_CONFIG_PATH,
9
+ CDK_FOLDER,
10
+ CODEBUILD_PROJECT_NAME,
11
+ CODEBUILD_ROLE_NAME,
12
+ COGNITO_USER_POOL_CLIENT_NAME,
13
+ COGNITO_USER_POOL_CLIENT_SECRET_NAME,
14
+ COGNITO_USER_POOL_NAME,
15
+ CONTEXT_FILE,
16
+ ECR_CDK_REPO_NAME,
17
+ ECS_TASK_EXECUTION_ROLE_NAME,
18
+ ECS_TASK_ROLE_NAME,
19
+ PRIVATE_SUBNET_AVAILABILITY_ZONES,
20
+ PRIVATE_SUBNET_CIDR_BLOCKS,
21
+ PRIVATE_SUBNETS_TO_USE,
22
+ PUBLIC_SUBNET_AVAILABILITY_ZONES,
23
+ PUBLIC_SUBNET_CIDR_BLOCKS,
24
+ PUBLIC_SUBNETS_TO_USE,
25
+ S3_LOG_CONFIG_BUCKET_NAME,
26
+ S3_OUTPUT_BUCKET_NAME,
27
+ VPC_NAME,
28
+ WEB_ACL_NAME,
29
+ )
30
+ from cdk_functions import ( # Import your check functions (assuming they use Boto3)
31
+ _get_existing_subnets_in_vpc,
32
+ check_alb_exists,
33
+ check_codebuild_project_exists,
34
+ check_ecr_repo_exists,
35
+ check_for_existing_role,
36
+ check_for_existing_user_pool,
37
+ check_for_existing_user_pool_client,
38
+ check_for_secret,
39
+ check_s3_bucket_exists,
40
+ check_subnet_exists_by_name,
41
+ check_web_acl_exists,
42
+ get_vpc_id_by_name,
43
+ validate_subnet_creation_parameters,
44
+ # Add other check functions as needed
45
+ )
46
+
47
# Resolve the CDK working folder from config; CDK_CONFIG_PATH is relative to it.
cdk_folder = CDK_FOLDER  # <FULL_PATH_TO_CDK_FOLDER_HERE>

# Full path needed to find config file. Exported so that modules imported
# below (which read CDK_CONFIG_PATH from the environment) pick it up.
os.environ["CDK_CONFIG_PATH"] = cdk_folder + CDK_CONFIG_PATH
51
+
52
+
53
# --- Helper to parse environment variables into lists ---
def _get_env_list(env_var_name: str) -> List[str]:
    """Parse a comma-separated environment-variable value into a list of strings.

    Accepts either a bracket-wrapped, optionally quoted list literal
    (``'["a", "b"]'``) or a bare comma-separated string (``"a,b"``).
    Quotes are removed and each element is whitespace-stripped; empty
    elements (from stray commas) are dropped.

    Args:
        env_var_name: The raw string value read from the environment.

    Returns:
        A list of non-empty, stripped string elements (possibly empty).
    """
    value = env_var_name.strip()
    # Strip surrounding brackets only when they are actually present.
    # The previous unconditional value[1:-1] slice assumed bracket-wrapped
    # input and silently chopped the first/last characters of bare "a,b".
    if value.startswith("[") and value.endswith("]"):
        value = value[1:-1]
    value = value.strip().replace('"', "").replace("'", "")
    if not value:
        return []
    # Split by comma and filter out any empty strings that might result from extra commas
    return [s.strip() for s in value.split(",") if s.strip()]
61
+
62
+
63
# The subnet name / CIDR / AZ settings may arrive from the environment as
# single bracket-wrapped strings rather than Python lists; normalise each
# one to list[str] so the downstream length comparisons and indexed loops
# can rely on list semantics. Values that are already lists pass through.
if PUBLIC_SUBNETS_TO_USE and not isinstance(PUBLIC_SUBNETS_TO_USE, list):
    PUBLIC_SUBNETS_TO_USE = _get_env_list(PUBLIC_SUBNETS_TO_USE)
if PRIVATE_SUBNETS_TO_USE and not isinstance(PRIVATE_SUBNETS_TO_USE, list):
    PRIVATE_SUBNETS_TO_USE = _get_env_list(PRIVATE_SUBNETS_TO_USE)
if PUBLIC_SUBNET_CIDR_BLOCKS and not isinstance(PUBLIC_SUBNET_CIDR_BLOCKS, list):
    PUBLIC_SUBNET_CIDR_BLOCKS = _get_env_list(PUBLIC_SUBNET_CIDR_BLOCKS)
if PUBLIC_SUBNET_AVAILABILITY_ZONES and not isinstance(
    PUBLIC_SUBNET_AVAILABILITY_ZONES, list
):
    PUBLIC_SUBNET_AVAILABILITY_ZONES = _get_env_list(PUBLIC_SUBNET_AVAILABILITY_ZONES)
if PRIVATE_SUBNET_CIDR_BLOCKS and not isinstance(PRIVATE_SUBNET_CIDR_BLOCKS, list):
    PRIVATE_SUBNET_CIDR_BLOCKS = _get_env_list(PRIVATE_SUBNET_CIDR_BLOCKS)
if PRIVATE_SUBNET_AVAILABILITY_ZONES and not isinstance(
    PRIVATE_SUBNET_AVAILABILITY_ZONES, list
):
    PRIVATE_SUBNET_AVAILABILITY_ZONES = _get_env_list(PRIVATE_SUBNET_AVAILABILITY_ZONES)
79
+
80
+ # Check for the existence of elements in your AWS environment to see if it's necessary to create new versions of the same
81
+
82
+
83
def check_and_set_context():
    """Probe the AWS account for resources this CDK app may reuse, and write
    the findings to CONTEXT_FILE as CDK context.

    For each resource (VPC/NAT gateway, subnets, IAM roles, S3 buckets, ECR
    repo, CodeBuild project, ALB, Cognito user pool/client, secret, WAF web
    ACL) a boolean ``exists:<name>`` key is recorded, plus ``arn:<name>`` or
    ``id:<name>`` where an identifier is needed for a CDK import lookup.

    Raises:
        RuntimeError: if VPC_NAME is configured but no matching VPC is found.
        SystemExit: if existing subnets cannot be listed, or proposed subnet
            parameters fail validation.
    """
    context_data: Dict[str, Any] = {}
    vpc_id = None  # stays None when VPC_NAME is not configured

    # --- Find the VPC (and any NAT gateways) first ---
    if VPC_NAME:
        print("VPC_NAME:", VPC_NAME)
        vpc_id, nat_gateways = get_vpc_id_by_name(VPC_NAME)

        # For simplicity, just record whether *any* NAT gateway exists in the
        # VPC and the ID of the first one found. A more robust check would
        # match by subnet, AZ, or a specific tag.
        if nat_gateways:
            context_data["exists:NatGateway"] = True
            context_data["id:NatGateway"] = nat_gateways[0]["NatGatewayId"]
        else:
            context_data["exists:NatGateway"] = False
            context_data["id:NatGateway"] = None

        if not vpc_id:
            raise RuntimeError(
                f"Required VPC '{VPC_NAME}' not found. Cannot proceed with subnet checks."
            )

        context_data["vpc_id"] = vpc_id  # Store VPC ID in context

    # --- SUBNET CHECKS ---
    # NOTE: context_data is intentionally NOT re-initialised here. A previous
    # version rebound it to {} at this point, silently discarding the
    # vpc_id / NatGateway entries recorded above.
    all_proposed_subnets_data: List[Dict[str, str]] = []

    # Full validation mode (CIDR/AZ aware) is possible for a subnet group when
    # CIDR blocks and AZs are supplied and their lengths match the name list.
    public_ready_for_full_validation = (
        len(PUBLIC_SUBNETS_TO_USE) > 0
        and len(PUBLIC_SUBNET_CIDR_BLOCKS) == len(PUBLIC_SUBNETS_TO_USE)
        and len(PUBLIC_SUBNET_AVAILABILITY_ZONES) == len(PUBLIC_SUBNETS_TO_USE)
    )
    private_ready_for_full_validation = (
        len(PRIVATE_SUBNETS_TO_USE) > 0
        and len(PRIVATE_SUBNET_CIDR_BLOCKS) == len(PRIVATE_SUBNETS_TO_USE)
        and len(PRIVATE_SUBNET_AVAILABILITY_ZONES) == len(PRIVATE_SUBNETS_TO_USE)
    )

    # Activate full validation if *either* subnet group has complete details.
    full_validation_mode = (
        public_ready_for_full_validation or private_ready_for_full_validation
    )

    if full_validation_mode:
        # Warn when only one of the two groups carries full CIDR/AZ detail.
        if (
            public_ready_for_full_validation
            and not private_ready_for_full_validation
            and PRIVATE_SUBNETS_TO_USE
        ):
            print(
                "Warning: Public subnets have CIDRs/AZs, but private subnets do not. Only public will be fully validated/created with CIDRs."
            )
        if (
            private_ready_for_full_validation
            and not public_ready_for_full_validation
            and PUBLIC_SUBNETS_TO_USE
        ):
            print(
                "Warning: Private subnets have CIDRs/AZs, but public subnets do not. Only private will be fully validated/created with CIDRs."
            )

    # Collect proposed subnet definitions for every subnet with full details.
    if public_ready_for_full_validation:
        for i, name in enumerate(PUBLIC_SUBNETS_TO_USE):
            all_proposed_subnets_data.append(
                {
                    "name": name,
                    "cidr": PUBLIC_SUBNET_CIDR_BLOCKS[i],
                    "az": PUBLIC_SUBNET_AVAILABILITY_ZONES[i],
                }
            )
    if private_ready_for_full_validation:
        for i, name in enumerate(PRIVATE_SUBNETS_TO_USE):
            all_proposed_subnets_data.append(
                {
                    "name": name,
                    "cidr": PRIVATE_SUBNET_CIDR_BLOCKS[i],
                    "az": PRIVATE_SUBNET_AVAILABILITY_ZONES[i],
                }
            )

    print(f"Target VPC ID for Boto3 lookup: {vpc_id}")

    # Fetch all existing subnets in the target VPC once to avoid repeated API calls.
    try:
        existing_aws_subnets = _get_existing_subnets_in_vpc(vpc_id)
    except Exception as e:
        print(f"Failed to fetch existing VPC subnets. Aborting. Error: {e}")
        raise SystemExit(1)  # Exit immediately if we can't get baseline data

    print("\n--- Running Name-Only Subnet Existence Check Mode ---")
    # Check only by name using the pre-fetched subnet data.
    checked_public_subnets = {}
    if PUBLIC_SUBNETS_TO_USE:
        for subnet_name in PUBLIC_SUBNETS_TO_USE:
            print("subnet_name:", subnet_name)
            exists, subnet_id = check_subnet_exists_by_name(
                subnet_name, existing_aws_subnets
            )
            checked_public_subnets[subnet_name] = {
                "exists": exists,
                "id": subnet_id,
            }
            # An already-existing subnet must not be proposed for creation.
            if exists:
                all_proposed_subnets_data = [
                    subnet
                    for subnet in all_proposed_subnets_data
                    if subnet["name"] != subnet_name
                ]

    context_data["checked_public_subnets"] = checked_public_subnets

    checked_private_subnets = {}
    if PRIVATE_SUBNETS_TO_USE:
        for subnet_name in PRIVATE_SUBNETS_TO_USE:
            print("subnet_name:", subnet_name)
            exists, subnet_id = check_subnet_exists_by_name(
                subnet_name, existing_aws_subnets
            )
            checked_private_subnets[subnet_name] = {
                "exists": exists,
                "id": subnet_id,
            }
            if exists:
                all_proposed_subnets_data = [
                    subnet
                    for subnet in all_proposed_subnets_data
                    if subnet["name"] != subnet_name
                ]

    context_data["checked_private_subnets"] = checked_private_subnets

    print("\nName-only existence subnet check complete.\n")

    if full_validation_mode:
        print(
            "\n--- Running in Full Subnet Validation Mode (CIDR/AZs provided) ---"
        )
        try:
            validate_subnet_creation_parameters(
                vpc_id, all_proposed_subnets_data, existing_aws_subnets
            )
            print("\nPre-synth validation successful. Proceeding with CDK synth.\n")

            # Populate context_data for downstream CDK construct creation.
            context_data["public_subnets_to_create"] = []
            if public_ready_for_full_validation:
                for i, name in enumerate(PUBLIC_SUBNETS_TO_USE):
                    context_data["public_subnets_to_create"].append(
                        {
                            "name": name,
                            "cidr": PUBLIC_SUBNET_CIDR_BLOCKS[i],
                            "az": PUBLIC_SUBNET_AVAILABILITY_ZONES[i],
                            "is_public": True,
                        }
                    )
            context_data["private_subnets_to_create"] = []
            if private_ready_for_full_validation:
                for i, name in enumerate(PRIVATE_SUBNETS_TO_USE):
                    context_data["private_subnets_to_create"].append(
                        {
                            "name": name,
                            "cidr": PRIVATE_SUBNET_CIDR_BLOCKS[i],
                            "az": PRIVATE_SUBNET_AVAILABILITY_ZONES[i],
                            "is_public": False,
                        }
                    )

        except Exception as e:  # ValueError is already covered by Exception
            print(f"\nFATAL ERROR: Subnet parameter validation failed: {e}\n")
            raise SystemExit(1)  # Exit if validation fails

    # --- IAM roles ---
    # Each role is checked exactly once; the previous version called the
    # AWS-backed check twice per role just to retrieve the ARN.
    for role_name in (
        CODEBUILD_ROLE_NAME,
        ECS_TASK_ROLE_NAME,
        ECS_TASK_EXECUTION_ROLE_NAME,
    ):
        exists, role_arn, _ = check_for_existing_role(role_name)
        context_data[f"exists:{role_name}"] = exists
        if exists:
            context_data[f"arn:{role_name}"] = role_arn

    # --- S3 buckets (existence only; CDK can import them by name) ---
    for bucket_name in (S3_LOG_CONFIG_BUCKET_NAME, S3_OUTPUT_BUCKET_NAME):
        exists, _ = check_s3_bucket_exists(bucket_name)
        context_data[f"exists:{bucket_name}"] = exists

    # --- ECR repository (existence only; from_repository_name suffices) ---
    exists, _ = check_ecr_repo_exists(ECR_CDK_REPO_NAME)
    context_data[f"exists:{ECR_CDK_REPO_NAME}"] = exists

    # --- CodeBuild project ---
    project_name = CODEBUILD_PROJECT_NAME
    exists, project_arn = check_codebuild_project_exists(project_name)
    context_data[f"exists:{project_name}"] = exists
    if exists:
        context_data[f"arn:{project_name}"] = project_arn

    # --- ALB (looked up by name) ---
    alb_name = ALB_NAME
    exists, alb_object = check_alb_exists(alb_name, region_name=AWS_REGION)
    context_data[f"exists:{alb_name}"] = exists
    if exists:
        print("alb_object:", alb_object)
        context_data[f"arn:{alb_name}"] = alb_object["LoadBalancerArn"]

    # --- Cognito user pool (by name) ---
    user_pool_name = COGNITO_USER_POOL_NAME
    exists, user_pool_id, _ = check_for_existing_user_pool(user_pool_name)
    context_data[f"exists:{user_pool_name}"] = exists
    if exists:
        context_data[f"id:{user_pool_name}"] = user_pool_id

    # --- Cognito user pool client (requires the pool ID found above) ---
    if user_pool_id:
        user_pool_client_name = COGNITO_USER_POOL_CLIENT_NAME
        exists, client_id, _ = check_for_existing_user_pool_client(
            user_pool_client_name, user_pool_id
        )
        context_data[f"exists:{user_pool_client_name}"] = exists
        if exists:
            context_data[f"id:{user_pool_client_name}"] = client_id

    # --- Secrets Manager secret (existence only; from_secret_name_v2 suffices) ---
    secret_name = COGNITO_USER_POOL_CLIENT_SECRET_NAME
    exists, _ = check_for_secret(secret_name)
    context_data[f"exists:{secret_name}"] = exists

    # --- CloudFront-scoped WAF web ACL (by name and scope) ---
    web_acl_name = WEB_ACL_NAME
    exists, existing_web_acl = check_web_acl_exists(web_acl_name, scope="CLOUDFRONT")
    context_data[f"exists:{web_acl_name}"] = exists
    if exists:
        context_data[f"arn:{web_acl_name}"] = existing_web_acl.attr_arn

    # Persist the context for cdk synth/deploy to consume.
    with open(CONTEXT_FILE, "w") as f:
        json.dump(context_data, f, indent=2)

    print(f"Context data written to {CONTEXT_FILE}")
cdk/post_cdk_build_quickstart.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Post-CDK-deploy quickstart: build the app container and launch the service.

Run after `cdk deploy` succeeds. The script:
1. writes a starter config.env the user can edit later,
2. kicks off the CodeBuild project that builds the container image,
3. uploads config.env to the S3 config/log bucket,
4. waits a fixed period for the build, then starts the ECS task.
"""

import time

from cdk_config import (
    CLUSTER_NAME,
    CODEBUILD_PROJECT_NAME,
    ECS_SERVICE_NAME,
    S3_LOG_CONFIG_BUCKET_NAME,
)
from cdk_functions import (
    create_basic_config_env,
    start_codebuild_build,
    start_ecs_task,
    upload_file_to_s3,
)
from tqdm import tqdm

# Create basic config.env file that user can use to run the app later.
# Argument is the folder it is saved into.
create_basic_config_env("config")

# Start the container build on CodeBuild.
print("Starting CodeBuild project.")
start_codebuild_build(PROJECT_NAME=CODEBUILD_PROJECT_NAME)

# Upload config.env file to the S3 bucket so the running app can read it.
upload_file_to_s3(
    local_file_paths="config/config.env", s3_key="", s3_bucket=S3_LOG_CONFIG_BUCKET_NAME
)

# Fixed wait for the CodeBuild container build; there is no polling here,
# so this is a best-effort delay before starting the ECS task.
total_seconds = 660  # 11 minutes
update_interval = 1  # Update every second

print("Waiting 11 minutes for the CodeBuild container to build.")

# tqdm renders a progress bar while we sleep in one-second increments.
# The loop variable is unused, so name it "_".
for _ in tqdm(range(total_seconds), desc="Building container"):
    time.sleep(update_interval)

# Start task on ECS now that the image should be available.
print("Starting ECS task")
start_ecs_task(cluster_name=CLUSTER_NAME, service_name=ECS_SERVICE_NAME)
cdk/requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ aws-cdk-lib==2.220.0
2
+ boto3==1.40.57
3
+ pandas==2.3.3
4
+ nodejs==0.1.1
5
+ python-dotenv==1.0.1
cli_redact.py ADDED
@@ -0,0 +1,1431 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import time
4
+ import uuid
5
+
6
+ import pandas as pd
7
+
8
+ from tools.config import (
9
+ ACCESS_LOGS_FOLDER,
10
+ ALLOW_LIST_PATH,
11
+ AWS_ACCESS_KEY,
12
+ AWS_PII_OPTION,
13
+ AWS_REGION,
14
+ AWS_SECRET_KEY,
15
+ CHOSEN_COMPREHEND_ENTITIES,
16
+ CHOSEN_LOCAL_OCR_MODEL,
17
+ CHOSEN_REDACT_ENTITIES,
18
+ COMPRESS_REDACTED_PDF,
19
+ CUSTOM_ENTITIES,
20
+ DEFAULT_COMBINE_PAGES,
21
+ DEFAULT_COST_CODE,
22
+ DEFAULT_DUPLICATE_DETECTION_THRESHOLD,
23
+ DEFAULT_FUZZY_SPELLING_MISTAKES_NUM,
24
+ DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX,
25
+ DEFAULT_LANGUAGE,
26
+ DEFAULT_MIN_CONSECUTIVE_PAGES,
27
+ DEFAULT_MIN_WORD_COUNT,
28
+ DEFAULT_TABULAR_ANONYMISATION_STRATEGY,
29
+ DENY_LIST_PATH,
30
+ DIRECT_MODE_DEFAULT_USER,
31
+ DISPLAY_FILE_NAMES_IN_LOGS,
32
+ DO_INITIAL_TABULAR_DATA_CLEAN,
33
+ DOCUMENT_REDACTION_BUCKET,
34
+ FEEDBACK_LOGS_FOLDER,
35
+ FULL_COMPREHEND_ENTITY_LIST,
36
+ FULL_ENTITY_LIST,
37
+ IMAGES_DPI,
38
+ INPUT_FOLDER,
39
+ LOCAL_OCR_MODEL_OPTIONS,
40
+ LOCAL_PII_OPTION,
41
+ OUTPUT_FOLDER,
42
+ PADDLE_MODEL_PATH,
43
+ PREPROCESS_LOCAL_OCR_IMAGES,
44
+ REMOVE_DUPLICATE_ROWS,
45
+ RETURN_REDACTED_PDF,
46
+ RUN_AWS_FUNCTIONS,
47
+ S3_USAGE_LOGS_FOLDER,
48
+ SAVE_LOGS_TO_CSV,
49
+ SAVE_LOGS_TO_DYNAMODB,
50
+ SESSION_OUTPUT_FOLDER,
51
+ SPACY_MODEL_PATH,
52
+ TEXTRACT_JOBS_LOCAL_LOC,
53
+ TEXTRACT_JOBS_S3_LOC,
54
+ TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET,
55
+ TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER,
56
+ TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER,
57
+ USAGE_LOGS_FOLDER,
58
+ USE_GREEDY_DUPLICATE_DETECTION,
59
+ WHOLE_PAGE_REDACTION_LIST_PATH,
60
+ convert_string_to_boolean,
61
+ )
62
+
63
+
64
+ def _generate_session_hash() -> str:
65
+ """Generate a unique session hash for logging purposes."""
66
+ return str(uuid.uuid4())[:8]
67
+
68
+
69
def get_username_and_folders(
    username: str = "",
    output_folder_textbox: str = OUTPUT_FOLDER,
    input_folder_textbox: str = INPUT_FOLDER,
    session_output_folder: bool = SESSION_OUTPUT_FOLDER,
    textract_document_upload_input_folder: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER,
    textract_document_upload_output_folder: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER,
    s3_textract_document_logs_subfolder: str = TEXTRACT_JOBS_S3_LOC,
    local_textract_document_logs_subfolder: str = TEXTRACT_JOBS_LOCAL_LOC,
):
    """Resolve the session identifier and per-session input/output folders.

    Args:
        username: Explicit user name; when empty a random 8-char hash is used.
        output_folder_textbox: Base output folder (expected to end with "/").
        input_folder_textbox: Base input folder (expected to end with "/").
        session_output_folder: When True, nest all folders/prefixes under the
            session id; otherwise the base folders are used unchanged.
        textract_document_upload_input_folder: S3 input prefix for whole-document
            Textract analysis.
        textract_document_upload_output_folder: S3 output prefix for whole-document
            Textract analysis.
        s3_textract_document_logs_subfolder: S3 prefix for Textract job logs.
        local_textract_document_logs_subfolder: Local folder for Textract job logs.

    Returns:
        Tuple of (session_hash, output_folder, session_hash, input_folder,
        textract_input_prefix, textract_output_prefix, s3_textract_logs_prefix,
        local_textract_logs_folder). The session hash appears twice to keep
        the original call signature.
    """
    # Session id: either the supplied username or a generated hash.
    if username:
        out_session_hash = username
    else:
        out_session_hash = _generate_session_hash()

    if session_output_folder:
        output_folder = output_folder_textbox + out_session_hash + "/"
        input_folder = input_folder_textbox + out_session_hash + "/"

        textract_document_upload_input_folder = (
            textract_document_upload_input_folder + "/" + out_session_hash
        )
        textract_document_upload_output_folder = (
            textract_document_upload_output_folder + "/" + out_session_hash
        )

        s3_textract_document_logs_subfolder = (
            s3_textract_document_logs_subfolder + "/" + out_session_hash
        )
        local_textract_document_logs_subfolder = (
            local_textract_document_logs_subfolder + "/" + out_session_hash + "/"
        )

    else:
        output_folder = output_folder_textbox
        input_folder = input_folder_textbox

    # os.makedirs(..., exist_ok=True) creates any missing parent directories
    # and is safe when the folder already exists; the previous os.mkdir call
    # failed on nested paths and was race-prone between the exists check and
    # the create.
    os.makedirs(output_folder, exist_ok=True)
    os.makedirs(input_folder, exist_ok=True)

    return (
        out_session_hash,
        output_folder,
        out_session_hash,
        input_folder,
        textract_document_upload_input_folder,
        textract_document_upload_output_folder,
        s3_textract_document_logs_subfolder,
        local_textract_document_logs_subfolder,
    )
123
+
124
+
125
+ def _get_env_list(env_var_name: str) -> list[str]:
126
+ """Parses a comma-separated environment variable into a list of strings."""
127
+ value = env_var_name[1:-1].strip().replace('"', "").replace("'", "")
128
+ if not value:
129
+ return []
130
+ # Split by comma and filter out any empty strings that might result from extra commas
131
+ return [s.strip() for s in value.split(",") if s.strip()]
132
+
133
+
134
# Extend the Comprehend entity lists with the custom spaCy recognisers, so the
# local spaCy model can pick up e.g. titles, street names and UK postcodes that
# are sometimes missed by AWS Comprehend.
for _entity_list in (CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST):
    _entity_list.extend(CUSTOM_ENTITIES)

# Lower-case aliases for the config constants, used as argparse defaults below.
chosen_redact_entities = CHOSEN_REDACT_ENTITIES
full_entity_list = FULL_ENTITY_LIST
chosen_comprehend_entities = CHOSEN_COMPREHEND_ENTITIES
full_comprehend_entity_list = FULL_COMPREHEND_ENTITY_LIST
default_handwrite_signature_checkbox = DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX
143
+
144
+
145
+ # --- Main CLI Function ---
146
+ def main(direct_mode_args={}):
147
+ """
148
+ A unified command-line interface to prepare, redact, and anonymise various document types.
149
+
150
+ Args:
151
+ direct_mode_args (dict, optional): Dictionary of arguments for direct mode execution.
152
+ If provided, uses these instead of parsing command line arguments.
153
+ """
154
+ parser = argparse.ArgumentParser(
155
+ description="A versatile CLI for redacting PII from PDF/image files and anonymising Word/tabular data.",
156
+ formatter_class=argparse.RawTextHelpFormatter,
157
+ epilog="""
158
+ Examples:
159
+
160
+ To run these, you need to do the following:
161
+
162
+ - Open a terminal window
163
+
164
+ - CD to the app folder that contains this file (cli_redact.py)
165
+
166
+ - Load the virtual environment using either conda or venv depending on your setup
167
+
168
+ - Run one of the example commands below
169
+
170
+ - Look in the output/ folder to see output files:
171
+
172
+ # Redaction
173
+
174
+ ## Redact a PDF with default settings (local OCR):
175
+ python cli_redact.py --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf
176
+
177
+ ## Extract text from a PDF only (i.e. no redaction), using local OCR:
178
+ python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --redact_whole_page_file example_data/partnership_toolkit_redact_some_pages.csv --pii_detector None
179
+
180
+ ## Extract text from a PDF only (i.e. no redaction), using local OCR, with a whole page redaction list:
181
+ python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --redact_whole_page_file example_data/partnership_toolkit_redact_some_pages.csv --pii_detector Local --local_redact_entities CUSTOM
182
+
183
+ ## Redact a PDF with allow list (local OCR) and custom list of redaction entities:
184
+ python cli_redact.py --input_file example_data/graduate-job-example-cover-letter.pdf --allow_list_file example_data/test_allow_list_graduate.csv --local_redact_entities TITLES PERSON DATE_TIME
185
+
186
+ ## Redact a PDF with limited pages and text extraction method (local text) with custom fuzzy matching:
187
+ python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --deny_list_file example_data/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv --local_redact_entities CUSTOM_FUZZY --page_min 1 --page_max 3 --ocr_method "Local text" --fuzzy_mistakes 3
188
+
189
+ ## Redaction with custom deny list, allow list, and whole page redaction list:
190
+ python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --deny_list_file example_data/partnership_toolkit_redact_custom_deny_list.csv --redact_whole_page_file example_data/partnership_toolkit_redact_some_pages.csv --allow_list_file example_data/test_allow_list_partnership.csv
191
+
192
+ ## Redact an image:
193
+ python cli_redact.py --input_file example_data/example_complaint_letter.jpg
194
+
195
+ ## Anonymise csv file with specific columns:
196
+ python cli_redact.py --input_file example_data/combined_case_notes.csv --text_columns "Case Note" "Client" --anon_strategy replace_redacted
197
+
198
+ ## Anonymise csv file with a different strategy (remove text completely):
199
+ python cli_redact.py --input_file example_data/combined_case_notes.csv --text_columns "Case Note" "Client" --anon_strategy redact
200
+
201
+ ## Anonymise Excel file, remove text completely:
202
+ python cli_redact.py --input_file example_data/combined_case_notes.xlsx --text_columns "Case Note" "Client" --excel_sheets combined_case_notes --anon_strategy redact
203
+
204
+ ## Anonymise a word document:
205
+ python cli_redact.py --input_file "example_data/Bold minimalist professional cover letter.docx" --anon_strategy replace_redacted
206
+
207
+ # Redaction with AWS services:
208
+
209
+ ## Use Textract and Comprehend::
210
+ python cli_redact.py --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf --ocr_method "AWS Textract" --pii_detector "AWS Comprehend"
211
+
212
+ ## Redact specific pages with AWS OCR and signature extraction:
213
+ python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --page_min 6 --page_max 7 --ocr_method "AWS Textract" --handwrite_signature_extraction "Extract handwriting" "Extract signatures"
214
+
215
+ ## Redact with AWS OCR and additional layout extraction options:
216
+ python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --ocr_method "AWS Textract" --extract_layout
217
+
218
+ # Duplicate page detection
219
+
220
+ ## Find duplicate pages in OCR files:
221
+ python cli_redact.py --task deduplicate --input_file example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv --duplicate_type pages --similarity_threshold 0.95
222
+
223
+ ## Find duplicate in OCR files at the line level:
224
+ python cli_redact.py --task deduplicate --input_file example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv --duplicate_type pages --similarity_threshold 0.95 --combine_pages False --min_word_count 3
225
+
226
+ ## Find duplicate rows in tabular data:
227
+ python cli_redact.py --task deduplicate --input_file example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv --duplicate_type tabular --text_columns "text" --similarity_threshold 0.95
228
+
229
+ # AWS Textract whole document analysis
230
+
231
+ ## Submit document to Textract for basic text analysis:
232
+ python cli_redact.py --task textract --textract_action submit --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf
233
+
234
+ ## Submit document to Textract for analysis with signature extraction (Job ID will be printed to the console, you need this to retrieve the results):
235
+ python cli_redact.py --task textract --textract_action submit --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --extract_signatures
236
+
237
+ ## Retrieve Textract results by job ID (returns a .json file output):
238
+ python cli_redact.py --task textract --textract_action retrieve --job_id 12345678-1234-1234-1234-123456789012
239
+
240
+ ## List recent Textract jobs:
241
+ python cli_redact.py --task textract --textract_action list
242
+
243
+ """,
244
+ )
245
+
246
+ # --- Task Selection ---
247
+ task_group = parser.add_argument_group("Task Selection")
248
+ task_group.add_argument(
249
+ "--task",
250
+ choices=["redact", "deduplicate", "textract"],
251
+ default="redact",
252
+ help="Task to perform: redact (PII redaction/anonymisation), deduplicate (find duplicate content), or textract (AWS Textract batch operations).",
253
+ )
254
+
255
+ # --- General Arguments (apply to all file types) ---
256
+ general_group = parser.add_argument_group("General Options")
257
+ general_group.add_argument(
258
+ "--input_file",
259
+ nargs="+",
260
+ help="Path to the input file(s) to process. Separate multiple files with a space, and use quotes if there are spaces in the file name.",
261
+ )
262
+ general_group.add_argument(
263
+ "--output_dir", default=OUTPUT_FOLDER, help="Directory for all output files."
264
+ )
265
+ general_group.add_argument(
266
+ "--input_dir", default=INPUT_FOLDER, help="Directory for all input files."
267
+ )
268
+ general_group.add_argument(
269
+ "--language", default=DEFAULT_LANGUAGE, help="Language of the document content."
270
+ )
271
+ general_group.add_argument(
272
+ "--allow_list",
273
+ default=ALLOW_LIST_PATH,
274
+ help="Path to a CSV file with words to exclude from redaction.",
275
+ )
276
+ general_group.add_argument(
277
+ "--pii_detector",
278
+ choices=[LOCAL_PII_OPTION, AWS_PII_OPTION, "None"],
279
+ default=LOCAL_PII_OPTION,
280
+ help="Core PII detection method (Local or AWS Comprehend, or None).",
281
+ )
282
+ general_group.add_argument(
283
+ "--username", default=DIRECT_MODE_DEFAULT_USER, help="Username for the session."
284
+ )
285
+ general_group.add_argument(
286
+ "--save_to_user_folders",
287
+ default=SESSION_OUTPUT_FOLDER,
288
+ help="Whether to save to user folders or not.",
289
+ )
290
+
291
+ general_group.add_argument(
292
+ "--local_redact_entities",
293
+ nargs="+",
294
+ choices=full_entity_list,
295
+ default=chosen_redact_entities,
296
+ help=f"Local redaction entities to use. Default: {chosen_redact_entities}. Full list: {full_entity_list}.",
297
+ )
298
+
299
+ general_group.add_argument(
300
+ "--aws_redact_entities",
301
+ nargs="+",
302
+ choices=full_comprehend_entity_list,
303
+ default=chosen_comprehend_entities,
304
+ help=f"AWS redaction entities to use. Default: {chosen_comprehend_entities}. Full list: {full_comprehend_entity_list}.",
305
+ )
306
+
307
+ general_group.add_argument(
308
+ "--aws_access_key", default=AWS_ACCESS_KEY, help="Your AWS Access Key ID."
309
+ )
310
+ general_group.add_argument(
311
+ "--aws_secret_key", default=AWS_SECRET_KEY, help="Your AWS Secret Access Key."
312
+ )
313
+ general_group.add_argument(
314
+ "--cost_code", default=DEFAULT_COST_CODE, help="Cost code for tracking usage."
315
+ )
316
+ general_group.add_argument(
317
+ "--aws_region", default=AWS_REGION, help="AWS region for cloud services."
318
+ )
319
+ general_group.add_argument(
320
+ "--s3_bucket",
321
+ default=DOCUMENT_REDACTION_BUCKET,
322
+ help="S3 bucket name for cloud operations.",
323
+ )
324
+ general_group.add_argument(
325
+ "--do_initial_clean",
326
+ default=DO_INITIAL_TABULAR_DATA_CLEAN,
327
+ help="Perform initial text cleaning for tabular data.",
328
+ )
329
+ general_group.add_argument(
330
+ "--save_logs_to_csv",
331
+ default=SAVE_LOGS_TO_CSV,
332
+ help="Save processing logs to CSV files.",
333
+ )
334
+ general_group.add_argument(
335
+ "--save_logs_to_dynamodb",
336
+ default=SAVE_LOGS_TO_DYNAMODB,
337
+ help="Save processing logs to DynamoDB.",
338
+ )
339
+ general_group.add_argument(
340
+ "--display_file_names_in_logs",
341
+ default=DISPLAY_FILE_NAMES_IN_LOGS,
342
+ help="Include file names in log outputs.",
343
+ )
344
+ general_group.add_argument(
345
+ "--upload_logs_to_s3",
346
+ default=RUN_AWS_FUNCTIONS,
347
+ help="Upload log files to S3 after processing.",
348
+ )
349
+ general_group.add_argument(
350
+ "--s3_logs_prefix",
351
+ default=S3_USAGE_LOGS_FOLDER,
352
+ help="S3 prefix for usage log files.",
353
+ )
354
+ general_group.add_argument(
355
+ "--feedback_logs_folder",
356
+ default=FEEDBACK_LOGS_FOLDER,
357
+ help="Directory for feedback log files.",
358
+ )
359
+ general_group.add_argument(
360
+ "--access_logs_folder",
361
+ default=ACCESS_LOGS_FOLDER,
362
+ help="Directory for access log files.",
363
+ )
364
+ general_group.add_argument(
365
+ "--usage_logs_folder",
366
+ default=USAGE_LOGS_FOLDER,
367
+ help="Directory for usage log files.",
368
+ )
369
+ general_group.add_argument(
370
+ "--paddle_model_path",
371
+ default=PADDLE_MODEL_PATH,
372
+ help="Directory for PaddleOCR model storage.",
373
+ )
374
+ general_group.add_argument(
375
+ "--spacy_model_path",
376
+ default=SPACY_MODEL_PATH,
377
+ help="Directory for spaCy model storage.",
378
+ )
379
+
380
+ # --- PDF/Image Redaction Arguments ---
381
+ pdf_group = parser.add_argument_group(
382
+ "PDF/Image Redaction Options (.pdf, .png, .jpg)"
383
+ )
384
+ pdf_group.add_argument(
385
+ "--ocr_method",
386
+ choices=["AWS Textract", "Local OCR", "Local text"],
387
+ default="Local OCR",
388
+ help="OCR method for text extraction from images.",
389
+ )
390
+ pdf_group.add_argument(
391
+ "--page_min", type=int, default=0, help="First page to redact."
392
+ )
393
+ pdf_group.add_argument(
394
+ "--page_max", type=int, default=0, help="Last page to redact."
395
+ )
396
+ pdf_group.add_argument(
397
+ "--images_dpi",
398
+ type=float,
399
+ default=float(IMAGES_DPI),
400
+ help="DPI for image processing.",
401
+ )
402
+ pdf_group.add_argument(
403
+ "--chosen_local_ocr_model",
404
+ choices=LOCAL_OCR_MODEL_OPTIONS,
405
+ default=CHOSEN_LOCAL_OCR_MODEL,
406
+ help="Local OCR model to use.",
407
+ )
408
+ pdf_group.add_argument(
409
+ "--preprocess_local_ocr_images",
410
+ default=PREPROCESS_LOCAL_OCR_IMAGES,
411
+ help="Preprocess images before OCR.",
412
+ )
413
+ pdf_group.add_argument(
414
+ "--compress_redacted_pdf",
415
+ default=COMPRESS_REDACTED_PDF,
416
+ help="Compress the final redacted PDF.",
417
+ )
418
+ pdf_group.add_argument(
419
+ "--return_pdf_end_of_redaction",
420
+ default=RETURN_REDACTED_PDF,
421
+ help="Return PDF at end of redaction process.",
422
+ )
423
+ pdf_group.add_argument(
424
+ "--deny_list_file",
425
+ default=DENY_LIST_PATH,
426
+ help="Custom words file to recognize for redaction.",
427
+ )
428
+ pdf_group.add_argument(
429
+ "--allow_list_file",
430
+ default=ALLOW_LIST_PATH,
431
+ help="Custom words file to recognize for redaction.",
432
+ )
433
+ pdf_group.add_argument(
434
+ "--redact_whole_page_file",
435
+ default=WHOLE_PAGE_REDACTION_LIST_PATH,
436
+ help="File for pages to redact completely.",
437
+ )
438
+ pdf_group.add_argument(
439
+ "--handwrite_signature_extraction",
440
+ nargs="+",
441
+ default=default_handwrite_signature_checkbox,
442
+ help='Handwriting and signature extraction options. Choose from "Extract handwriting", "Extract signatures".',
443
+ )
444
+ pdf_group.add_argument(
445
+ "--extract_forms",
446
+ action="store_true",
447
+ help="Extract forms during Textract analysis.",
448
+ )
449
+ pdf_group.add_argument(
450
+ "--extract_tables",
451
+ action="store_true",
452
+ help="Extract tables during Textract analysis.",
453
+ )
454
+ pdf_group.add_argument(
455
+ "--extract_layout",
456
+ action="store_true",
457
+ help="Extract layout during Textract analysis.",
458
+ )
459
+
460
+ # --- Word/Tabular Anonymisation Arguments ---
461
+ tabular_group = parser.add_argument_group(
462
+ "Word/Tabular Anonymisation Options (.docx, .csv, .xlsx)"
463
+ )
464
+ tabular_group.add_argument(
465
+ "--anon_strategy",
466
+ choices=[
467
+ "redact",
468
+ "redact completely",
469
+ "replace_redacted",
470
+ "entity_type",
471
+ "encrypt",
472
+ "hash",
473
+ "replace with 'REDACTED'",
474
+ "replace with <ENTITY_NAME>",
475
+ "mask",
476
+ "fake_first_name",
477
+ ],
478
+ default=DEFAULT_TABULAR_ANONYMISATION_STRATEGY,
479
+ help="The anonymisation strategy to apply.",
480
+ )
481
+ tabular_group.add_argument(
482
+ "--text_columns",
483
+ nargs="+",
484
+ default=list(),
485
+ help="A list of column names to anonymise or deduplicate in tabular data.",
486
+ )
487
+ tabular_group.add_argument(
488
+ "--excel_sheets",
489
+ nargs="+",
490
+ default=list(),
491
+ help="Specific Excel sheet names to process.",
492
+ )
493
+ tabular_group.add_argument(
494
+ "--fuzzy_mistakes",
495
+ type=int,
496
+ default=DEFAULT_FUZZY_SPELLING_MISTAKES_NUM,
497
+ help="Number of allowed spelling mistakes for fuzzy matching.",
498
+ )
499
+ tabular_group.add_argument(
500
+ "--match_fuzzy_whole_phrase_bool",
501
+ default=True,
502
+ help="Match fuzzy whole phrase boolean.",
503
+ )
504
+ # --- Duplicate Detection Arguments ---
505
+ duplicate_group = parser.add_argument_group("Duplicate Detection Options")
506
+ duplicate_group.add_argument(
507
+ "--duplicate_type",
508
+ choices=["pages", "tabular"],
509
+ default="pages",
510
+ help="Type of duplicate detection: pages (for OCR files) or tabular (for CSV/Excel files).",
511
+ )
512
+ duplicate_group.add_argument(
513
+ "--similarity_threshold",
514
+ type=float,
515
+ default=DEFAULT_DUPLICATE_DETECTION_THRESHOLD,
516
+ help="Similarity threshold (0-1) to consider content as duplicates.",
517
+ )
518
+ duplicate_group.add_argument(
519
+ "--min_word_count",
520
+ type=int,
521
+ default=DEFAULT_MIN_WORD_COUNT,
522
+ help="Minimum word count for text to be considered in duplicate analysis.",
523
+ )
524
+ duplicate_group.add_argument(
525
+ "--min_consecutive_pages",
526
+ type=int,
527
+ default=DEFAULT_MIN_CONSECUTIVE_PAGES,
528
+ help="Minimum number of consecutive pages to consider as a match.",
529
+ )
530
+ duplicate_group.add_argument(
531
+ "--greedy_match",
532
+ default=USE_GREEDY_DUPLICATE_DETECTION,
533
+ help="Use greedy matching strategy for consecutive pages.",
534
+ )
535
+ duplicate_group.add_argument(
536
+ "--combine_pages",
537
+ default=DEFAULT_COMBINE_PAGES,
538
+ help="Combine text from the same page number within a file. Alternative will enable line-level duplicate detection.",
539
+ )
540
+ duplicate_group.add_argument(
541
+ "--remove_duplicate_rows",
542
+ default=REMOVE_DUPLICATE_ROWS,
543
+ help="Remove duplicate rows from the output.",
544
+ )
545
+
546
+ # --- Textract Batch Operations Arguments ---
547
+ textract_group = parser.add_argument_group("Textract Batch Operations Options")
548
+ textract_group.add_argument(
549
+ "--textract_action",
550
+ choices=["submit", "retrieve", "list"],
551
+ help="Textract action to perform: submit (submit document for analysis), retrieve (get results by job ID), or list (show recent jobs).",
552
+ )
553
+ textract_group.add_argument("--job_id", help="Textract job ID for retrieve action.")
554
+ textract_group.add_argument(
555
+ "--extract_signatures",
556
+ action="store_true",
557
+ help="Extract signatures during Textract analysis (for submit action).",
558
+ )
559
+ textract_group.add_argument(
560
+ "--textract_bucket",
561
+ default=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET,
562
+ help="S3 bucket name for Textract operations (overrides default).",
563
+ )
564
+ textract_group.add_argument(
565
+ "--textract_input_prefix",
566
+ default=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER,
567
+ help="S3 prefix for input files in Textract operations.",
568
+ )
569
+ textract_group.add_argument(
570
+ "--textract_output_prefix",
571
+ default=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER,
572
+ help="S3 prefix for output files in Textract operations.",
573
+ )
574
+ textract_group.add_argument(
575
+ "--s3_textract_document_logs_subfolder",
576
+ default=TEXTRACT_JOBS_S3_LOC,
577
+ help="S3 prefix for logs in Textract operations.",
578
+ )
579
+ textract_group.add_argument(
580
+ "--local_textract_document_logs_subfolder",
581
+ default=TEXTRACT_JOBS_LOCAL_LOC,
582
+ help="Local prefix for logs in Textract operations.",
583
+ )
584
+ textract_group.add_argument(
585
+ "--poll_interval",
586
+ type=int,
587
+ default=30,
588
+ help="Polling interval in seconds for Textract job status.",
589
+ )
590
+ textract_group.add_argument(
591
+ "--max_poll_attempts",
592
+ type=int,
593
+ default=120,
594
+ help="Maximum number of polling attempts for Textract job completion.",
595
+ )
596
+ # Parse arguments - either from command line or direct mode
597
+ if direct_mode_args:
598
+ # Use direct mode arguments
599
+ args = argparse.Namespace(**direct_mode_args)
600
+ else:
601
+ # Parse command line arguments
602
+ args = parser.parse_args()
603
+
604
+ # --- Initial Setup ---
605
+ # Convert string boolean variables to boolean
606
+ if args.preprocess_local_ocr_images == "True":
607
+ args.preprocess_local_ocr_images = True
608
+ else:
609
+ args.preprocess_local_ocr_images = False
610
+ if args.greedy_match == "True":
611
+ args.greedy_match = True
612
+ else:
613
+ args.greedy_match = False
614
+ if args.combine_pages == "True":
615
+ args.combine_pages = True
616
+ else:
617
+ args.combine_pages = False
618
+ if args.remove_duplicate_rows == "True":
619
+ args.remove_duplicate_rows = True
620
+ else:
621
+ args.remove_duplicate_rows = False
622
+ if args.return_pdf_end_of_redaction == "True":
623
+ args.return_pdf_end_of_redaction = True
624
+ else:
625
+ args.return_pdf_end_of_redaction = False
626
+ if args.compress_redacted_pdf == "True":
627
+ args.compress_redacted_pdf = True
628
+ else:
629
+ args.compress_redacted_pdf = False
630
+ if args.do_initial_clean == "True":
631
+ args.do_initial_clean = True
632
+ else:
633
+ args.do_initial_clean = False
634
+ if args.save_logs_to_csv == "True":
635
+ args.save_logs_to_csv = True
636
+ else:
637
+ args.save_logs_to_csv = False
638
+ if args.save_logs_to_dynamodb == "True":
639
+ args.save_logs_to_dynamodb = True
640
+ else:
641
+ args.save_logs_to_dynamodb = False
642
+ if args.display_file_names_in_logs == "True":
643
+ args.display_file_names_in_logs = True
644
+ else:
645
+ args.display_file_names_in_logs = False
646
+ if args.match_fuzzy_whole_phrase_bool == "True":
647
+ args.match_fuzzy_whole_phrase_bool = True
648
+ else:
649
+ args.match_fuzzy_whole_phrase_bool = False
650
+ # Convert save_to_user_folders to boolean (handles both string and boolean values)
651
+ args.save_to_user_folders = convert_string_to_boolean(args.save_to_user_folders)
652
+
653
+ # Combine extraction options
654
+ extraction_options = (
655
+ list(args.handwrite_signature_extraction)
656
+ if args.handwrite_signature_extraction
657
+ else []
658
+ )
659
+ if args.extract_forms:
660
+ extraction_options.append("Extract forms")
661
+ if args.extract_tables:
662
+ extraction_options.append("Extract tables")
663
+ if args.extract_layout:
664
+ extraction_options.append("Extract layout")
665
+ args.handwrite_signature_extraction = extraction_options
666
+
667
+ if args.task in ["redact", "deduplicate"]:
668
+ if args.input_file:
669
+ if isinstance(args.input_file, str):
670
+ args.input_file = [args.input_file]
671
+
672
+ _, file_extension = os.path.splitext(args.input_file[0])
673
+ file_extension = file_extension.lower()
674
+ else:
675
+ raise ValueError("Error: --input_file is required for 'redact' task.")
676
+
677
+ # Initialise usage logger if logging is enabled
678
+ usage_logger = None
679
+ if args.save_logs_to_csv or args.save_logs_to_dynamodb:
680
+ from tools.cli_usage_logger import create_cli_usage_logger
681
+
682
+ try:
683
+ usage_logger = create_cli_usage_logger(logs_folder=args.usage_logs_folder)
684
+ except Exception as e:
685
+ print(f"Warning: Could not initialise usage logger: {e}")
686
+
687
+ # Get username and folders
688
+ (
689
+ session_hash,
690
+ args.output_dir,
691
+ _,
692
+ args.input_dir,
693
+ args.textract_input_prefix,
694
+ args.textract_output_prefix,
695
+ args.s3_textract_document_logs_subfolder,
696
+ args.local_textract_document_logs_subfolder,
697
+ ) = get_username_and_folders(
698
+ username=args.username,
699
+ output_folder_textbox=args.output_dir,
700
+ input_folder_textbox=args.input_dir,
701
+ session_output_folder=args.save_to_user_folders,
702
+ textract_document_upload_input_folder=args.textract_input_prefix,
703
+ textract_document_upload_output_folder=args.textract_output_prefix,
704
+ s3_textract_document_logs_subfolder=args.s3_textract_document_logs_subfolder,
705
+ local_textract_document_logs_subfolder=args.local_textract_document_logs_subfolder,
706
+ )
707
+
708
+ print(
709
+ f"Conducting analyses with user {args.username}. Outputs will be saved to {args.output_dir}."
710
+ )
711
+
712
+ # --- Route to the Correct Workflow Based on Task and File Type ---
713
+
714
+ # Validate input_file requirement for tasks that need it
715
+ if args.task in ["redact", "deduplicate"] and not args.input_file:
716
+ print(f"Error: --input_file is required for '{args.task}' task.")
717
+ return
718
+
719
+ if args.ocr_method in ["Local OCR", "AWS Textract"]:
720
+ args.prepare_images = True
721
+ else:
722
+ args.prepare_images = False
723
+
724
+ from tools.cli_usage_logger import create_cli_usage_logger, log_redaction_usage
725
+
726
+ # Task 1: Redaction/Anonymisation
727
+ if args.task == "redact":
728
+
729
+ # Workflow 1: PDF/Image Redaction
730
+ if file_extension in [".pdf", ".png", ".jpg", ".jpeg"]:
731
+ print("--- Detected PDF/Image file. Starting Redaction Workflow... ---")
732
+ start_time = time.time()
733
+ try:
734
+ from tools.file_conversion import prepare_image_or_pdf
735
+ from tools.file_redaction import choose_and_run_redactor
736
+
737
+ # Step 1: Prepare the document
738
+ print("\nStep 1: Preparing document...")
739
+ (
740
+ prep_summary,
741
+ prepared_pdf_paths,
742
+ image_file_paths,
743
+ _,
744
+ _,
745
+ pdf_doc,
746
+ image_annotations,
747
+ _,
748
+ original_cropboxes,
749
+ page_sizes,
750
+ _,
751
+ _,
752
+ _,
753
+ _,
754
+ _,
755
+ ) = prepare_image_or_pdf(
756
+ file_paths=args.input_file,
757
+ text_extract_method=args.ocr_method,
758
+ all_line_level_ocr_results_df=pd.DataFrame(),
759
+ all_page_line_level_ocr_results_with_words_df=pd.DataFrame(),
760
+ first_loop_state=True,
761
+ prepare_for_review=False,
762
+ output_folder=args.output_dir,
763
+ input_folder=args.input_dir,
764
+ prepare_images=args.prepare_images,
765
+ page_min=args.page_min,
766
+ page_max=args.page_max,
767
+ )
768
+ print(f"Preparation complete. {prep_summary}")
769
+
770
+ # Step 2: Redact the prepared document
771
+ print("\nStep 2: Running redaction...")
772
+ (
773
+ output_summary,
774
+ output_files,
775
+ _,
776
+ _,
777
+ log_files,
778
+ _,
779
+ _,
780
+ _,
781
+ _,
782
+ _,
783
+ _,
784
+ _,
785
+ _,
786
+ _,
787
+ comprehend_query_number,
788
+ _,
789
+ _,
790
+ _,
791
+ _,
792
+ _,
793
+ _,
794
+ page_sizes,
795
+ _,
796
+ _,
797
+ _,
798
+ total_textract_query_number,
799
+ _,
800
+ _,
801
+ _,
802
+ _,
803
+ _,
804
+ _,
805
+ _,
806
+ ) = choose_and_run_redactor(
807
+ file_paths=args.input_file,
808
+ prepared_pdf_file_paths=prepared_pdf_paths,
809
+ pdf_image_file_paths=image_file_paths,
810
+ chosen_redact_entities=args.local_redact_entities,
811
+ chosen_redact_comprehend_entities=args.aws_redact_entities,
812
+ text_extraction_method=args.ocr_method,
813
+ in_allow_list=args.allow_list_file,
814
+ in_deny_list=args.deny_list_file,
815
+ redact_whole_page_list=args.redact_whole_page_file,
816
+ first_loop_state=True,
817
+ page_min=args.page_min,
818
+ page_max=args.page_max,
819
+ handwrite_signature_checkbox=args.handwrite_signature_extraction,
820
+ max_fuzzy_spelling_mistakes_num=args.fuzzy_mistakes,
821
+ match_fuzzy_whole_phrase_bool=args.match_fuzzy_whole_phrase_bool,
822
+ pymupdf_doc=pdf_doc,
823
+ annotations_all_pages=image_annotations,
824
+ page_sizes=page_sizes,
825
+ document_cropboxes=original_cropboxes,
826
+ pii_identification_method=args.pii_detector,
827
+ aws_access_key_textbox=args.aws_access_key,
828
+ aws_secret_key_textbox=args.aws_secret_key,
829
+ language=args.language,
830
+ output_folder=args.output_dir,
831
+ input_folder=args.input_dir,
832
+ )
833
+
834
+ # Calculate processing time
835
+ end_time = time.time()
836
+ processing_time = end_time - start_time
837
+
838
+ # Log usage data if logger is available
839
+ if usage_logger:
840
+ try:
841
+ # Extract file name for logging
842
+ print("Saving logs to CSV")
843
+ doc_file_name = (
844
+ os.path.basename(args.input_file[0])
845
+ if args.display_file_names_in_logs
846
+ else "document"
847
+ )
848
+ data_file_name = "" # Not applicable for PDF/image redaction
849
+
850
+ # Determine if this was a Textract API call
851
+ is_textract_call = args.ocr_method == "AWS Textract"
852
+
853
+ # Count pages (approximate from page_sizes if available)
854
+ total_pages = len(page_sizes) if page_sizes else 1
855
+
856
+ # Count API calls (approximate - would need to be tracked in the redaction function)
857
+ textract_queries = (
858
+ int(total_textract_query_number) if is_textract_call else 0
859
+ )
860
+ comprehend_queries = (
861
+ int(comprehend_query_number)
862
+ if args.pii_detector == "AWS Comprehend"
863
+ else 0
864
+ )
865
+
866
+ # Format handwriting/signature options
867
+ handwriting_signature = (
868
+ ", ".join(args.handwrite_signature_extraction)
869
+ if args.handwrite_signature_extraction
870
+ else ""
871
+ )
872
+
873
+ log_redaction_usage(
874
+ logger=usage_logger,
875
+ session_hash=session_hash,
876
+ doc_file_name=doc_file_name,
877
+ data_file_name=data_file_name,
878
+ time_taken=processing_time,
879
+ total_pages=total_pages,
880
+ textract_queries=textract_queries,
881
+ pii_method=args.pii_detector,
882
+ comprehend_queries=comprehend_queries,
883
+ cost_code=args.cost_code,
884
+ handwriting_signature=handwriting_signature,
885
+ text_extraction_method=args.ocr_method,
886
+ is_textract_call=is_textract_call,
887
+ task=args.task,
888
+ save_to_dynamodb=args.save_logs_to_dynamodb,
889
+ save_to_s3=args.upload_logs_to_s3,
890
+ s3_bucket=args.s3_bucket,
891
+ s3_key_prefix=args.s3_logs_prefix,
892
+ )
893
+ except Exception as e:
894
+ print(f"Warning: Could not log usage data: {e}")
895
+
896
+ print("\n--- Redaction Process Complete ---")
897
+ print(f"Summary: {output_summary}")
898
+ print(f"Processing time: {processing_time:.2f} seconds")
899
+ print(f"\nOutput files saved to: {args.output_dir}")
900
+ print("Generated Files:", sorted(output_files))
901
+ if log_files:
902
+ print("Log Files:", sorted(log_files))
903
+
904
+ except Exception as e:
905
+ print(
906
+ f"\nAn error occurred during the PDF/Image redaction workflow: {e}"
907
+ )
908
+
909
+ # Workflow 2: Word/Tabular Data Anonymisation
910
+ elif file_extension in [".docx", ".xlsx", ".xls", ".csv", ".parquet"]:
911
+ print(
912
+ "--- Detected Word/Tabular file. Starting Anonymisation Workflow... ---"
913
+ )
914
+ start_time = time.time()
915
+ try:
916
+ from tools.data_anonymise import anonymise_files_with_open_text
917
+
918
+ # Run the anonymisation function directly
919
+
920
+ (
921
+ output_summary,
922
+ output_files,
923
+ _,
924
+ _,
925
+ log_files,
926
+ _,
927
+ processing_time,
928
+ comprehend_query_number,
929
+ ) = anonymise_files_with_open_text(
930
+ file_paths=args.input_file,
931
+ in_text="", # Not used for file-based operations
932
+ anon_strategy=args.anon_strategy,
933
+ chosen_cols=args.text_columns,
934
+ chosen_redact_entities=args.local_redact_entities,
935
+ in_allow_list=args.allow_list_file,
936
+ in_excel_sheets=args.excel_sheets,
937
+ first_loop_state=True,
938
+ output_folder=args.output_dir,
939
+ in_deny_list=args.deny_list_file,
940
+ max_fuzzy_spelling_mistakes_num=args.fuzzy_mistakes,
941
+ pii_identification_method=args.pii_detector,
942
+ chosen_redact_comprehend_entities=args.aws_redact_entities,
943
+ aws_access_key_textbox=args.aws_access_key,
944
+ aws_secret_key_textbox=args.aws_secret_key,
945
+ language=args.language,
946
+ do_initial_clean=args.do_initial_clean,
947
+ )
948
+
949
+ # Calculate processing time
950
+ end_time = time.time()
951
+ processing_time = end_time - start_time
952
+
953
+ # Log usage data if logger is available
954
+ if usage_logger:
955
+ try:
956
+ print("Saving logs to CSV")
957
+ # Extract file name for logging
958
+ doc_file_name = "" # Not applicable for tabular data
959
+ data_file_name = (
960
+ os.path.basename(args.input_file[0])
961
+ if args.display_file_names_in_logs
962
+ else "data_file"
963
+ )
964
+
965
+ # Determine if this was a Textract API call (not applicable for tabular)
966
+ is_textract_call = False
967
+
968
+ # Count pages (not applicable for tabular data)
969
+ total_pages = 0
970
+
971
+ # Count API calls (approximate - would need to be tracked in the anonymisation function)
972
+ textract_queries = 0 # Not applicable for tabular data
973
+ comprehend_queries = (
974
+ comprehend_query_number
975
+ if args.pii_detector == "AWS Comprehend"
976
+ else 0
977
+ )
978
+
979
+ # Format handwriting/signature options (not applicable for tabular)
980
+ handwriting_signature = ""
981
+
982
+ log_redaction_usage(
983
+ logger=usage_logger,
984
+ session_hash=session_hash,
985
+ doc_file_name=doc_file_name,
986
+ data_file_name=data_file_name,
987
+ time_taken=processing_time,
988
+ total_pages=total_pages,
989
+ textract_queries=textract_queries,
990
+ pii_method=args.pii_detector,
991
+ comprehend_queries=comprehend_queries,
992
+ cost_code=args.cost_code,
993
+ handwriting_signature=handwriting_signature,
994
+ text_extraction_method="tabular", # Indicate this is tabular processing
995
+ is_textract_call=is_textract_call,
996
+ task=args.task,
997
+ save_to_dynamodb=args.save_logs_to_dynamodb,
998
+ save_to_s3=args.upload_logs_to_s3,
999
+ s3_bucket=args.s3_bucket,
1000
+ s3_key_prefix=args.s3_logs_prefix,
1001
+ )
1002
+ except Exception as e:
1003
+ print(f"Warning: Could not log usage data: {e}")
1004
+
1005
+ print("\n--- Anonymisation Process Complete ---")
1006
+ print(f"Summary: {output_summary}")
1007
+ print(f"Processing time: {processing_time:.2f} seconds")
1008
+ print(f"\nOutput files saved to: {args.output_dir}")
1009
+ print("Generated Files:", sorted(output_files))
1010
+ if log_files:
1011
+ print("Log Files:", sorted(log_files))
1012
+
1013
+ except Exception as e:
1014
+ print(
1015
+ f"\nAn error occurred during the Word/Tabular anonymisation workflow: {e}"
1016
+ )
1017
+
1018
+ else:
1019
+ print(f"Error: Unsupported file type '{file_extension}' for redaction.")
1020
+ print("Supported types for redaction: .pdf, .png, .jpg, .jpeg")
1021
+ print(
1022
+ "Supported types for anonymisation: .docx, .xlsx, .xls, .csv, .parquet"
1023
+ )
1024
+
1025
+ # Task 2: Duplicate Detection
1026
+ elif args.task == "deduplicate":
1027
+ print("--- Starting Duplicate Detection Workflow... ---")
1028
+ try:
1029
+ from tools.find_duplicate_pages import run_duplicate_analysis
1030
+
1031
+ if args.duplicate_type == "pages":
1032
+ # Page duplicate detection
1033
+ if file_extension == ".csv":
1034
+ print(
1035
+ "--- Detected OCR CSV file. Starting Page Duplicate Detection... ---"
1036
+ )
1037
+
1038
+ start_time = time.time()
1039
+
1040
+ if args.combine_pages is True:
1041
+ print("Combining pages...")
1042
+ else:
1043
+ print("Using line-level duplicate detection...")
1044
+
1045
+ # Load the CSV file as a list for the duplicate analysis function
1046
+ (
1047
+ results_df,
1048
+ output_paths,
1049
+ full_data_by_file,
1050
+ processing_time,
1051
+ task_textbox,
1052
+ ) = run_duplicate_analysis(
1053
+ files=args.input_file,
1054
+ threshold=args.similarity_threshold,
1055
+ min_words=args.min_word_count,
1056
+ min_consecutive=args.min_consecutive_pages,
1057
+ greedy_match=args.greedy_match,
1058
+ combine_pages=args.combine_pages,
1059
+ output_folder=args.output_dir,
1060
+ )
1061
+
1062
+ end_time = time.time()
1063
+ processing_time = end_time - start_time
1064
+
1065
+ print("\n--- Page Duplicate Detection Complete ---")
1066
+ print(f"Found {len(results_df)} duplicate matches")
1067
+ print(f"\nOutput files saved to: {args.output_dir}")
1068
+ if output_paths:
1069
+ print("Generated Files:", sorted(output_paths))
1070
+
1071
+ else:
1072
+ print(
1073
+ "Error: Page duplicate detection requires CSV files with OCR data."
1074
+ )
1075
+ print("Please provide a CSV file containing OCR output data.")
1076
+
1077
+ # Log usage data if logger is available
1078
+ if usage_logger:
1079
+ try:
1080
+ # Extract file name for logging
1081
+ print("Saving logs to CSV")
1082
+ doc_file_name = (
1083
+ os.path.basename(args.input_file[0])
1084
+ if args.display_file_names_in_logs
1085
+ else "document"
1086
+ )
1087
+ data_file_name = (
1088
+ "" # Not applicable for PDF/image redaction
1089
+ )
1090
+
1091
+ # Determine if this was a Textract API call
1092
+ is_textract_call = False
1093
+
1094
+ # Count pages (approximate from page_sizes if available)
1095
+ total_pages = len(page_sizes) if page_sizes else 1
1096
+
1097
+ # Count API calls (approximate - would need to be tracked in the redaction function)
1098
+ textract_queries = 0
1099
+ comprehend_queries = 0
1100
+
1101
+ # Format handwriting/signature options
1102
+ handwriting_signature = ""
1103
+
1104
+ log_redaction_usage(
1105
+ logger=usage_logger,
1106
+ session_hash=session_hash,
1107
+ doc_file_name=doc_file_name,
1108
+ data_file_name=data_file_name,
1109
+ time_taken=processing_time,
1110
+ total_pages=total_pages,
1111
+ textract_queries=textract_queries,
1112
+ pii_method=args.pii_detector,
1113
+ comprehend_queries=comprehend_queries,
1114
+ cost_code=args.cost_code,
1115
+ handwriting_signature=handwriting_signature,
1116
+ text_extraction_method=args.ocr_method,
1117
+ is_textract_call=is_textract_call,
1118
+ task=args.task,
1119
+ save_to_dynamodb=args.save_logs_to_dynamodb,
1120
+ save_to_s3=args.upload_logs_to_s3,
1121
+ s3_bucket=args.s3_bucket,
1122
+ s3_key_prefix=args.s3_logs_prefix,
1123
+ )
1124
+ except Exception as e:
1125
+ print(f"Warning: Could not log usage data: {e}")
1126
+
1127
+ elif args.duplicate_type == "tabular":
1128
+ # Tabular duplicate detection
1129
+ from tools.find_duplicate_tabular import run_tabular_duplicate_detection
1130
+
1131
+ if file_extension in [".csv", ".xlsx", ".xls", ".parquet"]:
1132
+ print(
1133
+ "--- Detected tabular file. Starting Tabular Duplicate Detection... ---"
1134
+ )
1135
+
1136
+ start_time = time.time()
1137
+
1138
+ (
1139
+ results_df,
1140
+ output_paths,
1141
+ full_data_by_file,
1142
+ processing_time,
1143
+ task_textbox,
1144
+ ) = run_tabular_duplicate_detection(
1145
+ files=args.input_file,
1146
+ threshold=args.similarity_threshold,
1147
+ min_words=args.min_word_count,
1148
+ text_columns=args.text_columns,
1149
+ output_folder=args.output_dir,
1150
+ do_initial_clean_dup=args.do_initial_clean,
1151
+ in_excel_tabular_sheets=args.excel_sheets,
1152
+ remove_duplicate_rows=args.remove_duplicate_rows,
1153
+ )
1154
+
1155
+ end_time = time.time()
1156
+ processing_time = end_time - start_time
1157
+
1158
+ # Log usage data if logger is available
1159
+ if usage_logger:
1160
+ try:
1161
+ # Extract file name for logging
1162
+ print("Saving logs to CSV")
1163
+ doc_file_name = ""
1164
+ data_file_name = (
1165
+ os.path.basename(args.input_file[0])
1166
+ if args.display_file_names_in_logs
1167
+ else "data_file"
1168
+ )
1169
+
1170
+ # Determine if this was a Textract API call
1171
+ is_textract_call = False
1172
+
1173
+ # Count pages (approximate from page_sizes if available)
1174
+ total_pages = len(page_sizes) if page_sizes else 1
1175
+
1176
+ # Count API calls (approximate - would need to be tracked in the redaction function)
1177
+ textract_queries = 0
1178
+ comprehend_queries = 0
1179
+
1180
+ # Format handwriting/signature options
1181
+ handwriting_signature = ""
1182
+
1183
+ log_redaction_usage(
1184
+ logger=usage_logger,
1185
+ session_hash=session_hash,
1186
+ doc_file_name=doc_file_name,
1187
+ data_file_name=data_file_name,
1188
+ time_taken=processing_time,
1189
+ total_pages=total_pages,
1190
+ textract_queries=textract_queries,
1191
+ pii_method=args.pii_detector,
1192
+ comprehend_queries=comprehend_queries,
1193
+ cost_code=args.cost_code,
1194
+ handwriting_signature=handwriting_signature,
1195
+ text_extraction_method=args.ocr_method,
1196
+ is_textract_call=is_textract_call,
1197
+ task=args.task,
1198
+ save_to_dynamodb=args.save_logs_to_dynamodb,
1199
+ save_to_s3=args.upload_logs_to_s3,
1200
+ s3_bucket=args.s3_bucket,
1201
+ s3_key_prefix=args.s3_logs_prefix,
1202
+ )
1203
+ except Exception as e:
1204
+ print(f"Warning: Could not log usage data: {e}")
1205
+
1206
+ print("\n--- Tabular Duplicate Detection Complete ---")
1207
+ print(f"Found {len(results_df)} duplicate matches")
1208
+ print(f"\nOutput files saved to: {args.output_dir}")
1209
+ if output_paths:
1210
+ print("Generated Files:", sorted(output_paths))
1211
+
1212
+ else:
1213
+ print(
1214
+ "Error: Tabular duplicate detection requires CSV, Excel, or Parquet files."
1215
+ )
1216
+ print("Supported types: .csv, .xlsx, .xls, .parquet")
1217
+ else:
1218
+ print(f"Error: Invalid duplicate type '{args.duplicate_type}'.")
1219
+ print("Valid options: 'pages' or 'tabular'")
1220
+
1221
+ except Exception as e:
1222
+ print(f"\nAn error occurred during the duplicate detection workflow: {e}")
1223
+
1224
+ # Task 3: Textract Batch Operations
1225
+ elif args.task == "textract":
1226
+ print("--- Starting Textract Batch Operations Workflow... ---")
1227
+
1228
+ if not args.textract_action:
1229
+ print("Error: --textract_action is required for textract task.")
1230
+ print("Valid options: 'submit', 'retrieve', or 'list'")
1231
+ return
1232
+
1233
+ try:
1234
+ if args.textract_action == "submit":
1235
+ from tools.textract_batch_call import (
1236
+ analyse_document_with_textract_api,
1237
+ load_in_textract_job_details,
1238
+ )
1239
+
1240
+ # Submit document to Textract for analysis
1241
+ if not args.input_file:
1242
+ print("Error: --input_file is required for submit action.")
1243
+ return
1244
+
1245
+ print(f"--- Submitting document to Textract: {args.input_file} ---")
1246
+
1247
+ start_time = time.time()
1248
+
1249
+ # Load existing job details
1250
+ job_df = load_in_textract_job_details(
1251
+ load_s3_jobs_loc=args.s3_textract_document_logs_subfolder,
1252
+ load_local_jobs_loc=args.local_textract_document_logs_subfolder,
1253
+ )
1254
+
1255
+ # Determine signature extraction options
1256
+ signature_options = (
1257
+ ["Extract handwriting", "Extract signatures"]
1258
+ if args.extract_signatures
1259
+ else ["Extract handwriting"]
1260
+ )
1261
+
1262
+ # Use configured bucket or override
1263
+ textract_bucket = args.textract_bucket if args.textract_bucket else ""
1264
+
1265
+ # Submit the job
1266
+ (
1267
+ result_message,
1268
+ job_id,
1269
+ job_type,
1270
+ successful_job_number,
1271
+ is_textract_call,
1272
+ total_pages,
1273
+ task_textbox,
1274
+ ) = analyse_document_with_textract_api(
1275
+ local_pdf_path=args.input_file,
1276
+ s3_input_prefix=args.textract_input_prefix,
1277
+ s3_output_prefix=args.textract_output_prefix,
1278
+ job_df=job_df,
1279
+ s3_bucket_name=textract_bucket,
1280
+ general_s3_bucket_name=args.s3_bucket,
1281
+ local_output_dir=args.output_dir,
1282
+ handwrite_signature_checkbox=signature_options,
1283
+ aws_region=args.aws_region,
1284
+ )
1285
+
1286
+ end_time = time.time()
1287
+ processing_time = end_time - start_time
1288
+
1289
+ print("\n--- Textract Job Submitted Successfully ---")
1290
+ print(f"Job ID: {job_id}")
1291
+ print(f"Job Type: {job_type}")
1292
+ print(f"Message: {result_message}")
1293
+ print(f"Results will be available in: {args.output_dir}")
1294
+
1295
+ # Log usage data if logger is available
1296
+ if usage_logger:
1297
+ try:
1298
+ # Extract file name for logging
1299
+ print("Saving logs to CSV")
1300
+ doc_file_name = (
1301
+ os.path.basename(args.input_file[0])
1302
+ if args.display_file_names_in_logs
1303
+ else "document"
1304
+ )
1305
+ data_file_name = ""
1306
+
1307
+ # Determine if this was a Textract API call
1308
+ is_textract_call = True
1309
+ args.ocr_method == "AWS Textract"
1310
+
1311
+ # Count API calls (approximate - would need to be tracked in the redaction function)
1312
+ textract_queries = total_pages
1313
+ comprehend_queries = 0
1314
+
1315
+ # Format handwriting/signature options
1316
+ handwriting_signature = ""
1317
+
1318
+ log_redaction_usage(
1319
+ logger=usage_logger,
1320
+ session_hash=session_hash,
1321
+ doc_file_name=doc_file_name,
1322
+ data_file_name=data_file_name,
1323
+ time_taken=processing_time,
1324
+ total_pages=total_pages,
1325
+ textract_queries=textract_queries,
1326
+ pii_method=args.pii_detector,
1327
+ comprehend_queries=comprehend_queries,
1328
+ cost_code=args.cost_code,
1329
+ handwriting_signature=handwriting_signature,
1330
+ text_extraction_method=args.ocr_method,
1331
+ is_textract_call=is_textract_call,
1332
+ task=args.task,
1333
+ save_to_dynamodb=args.save_logs_to_dynamodb,
1334
+ save_to_s3=args.upload_logs_to_s3,
1335
+ s3_bucket=args.s3_bucket,
1336
+ s3_key_prefix=args.s3_logs_prefix,
1337
+ )
1338
+ except Exception as e:
1339
+ print(f"Warning: Could not log usage data: {e}")
1340
+
1341
+ elif args.textract_action == "retrieve":
1342
+ print(f"--- Retrieving Textract results for Job ID: {args.job_id} ---")
1343
+
1344
+ from tools.textract_batch_call import (
1345
+ load_in_textract_job_details,
1346
+ poll_whole_document_textract_analysis_progress_and_download,
1347
+ )
1348
+
1349
+ # Retrieve results by job ID
1350
+ if not args.job_id:
1351
+ print("Error: --job_id is required for retrieve action.")
1352
+ return
1353
+
1354
+ # Load existing job details to get job type
1355
+ print("Loading existing job details...")
1356
+ job_df = load_in_textract_job_details(
1357
+ load_s3_jobs_loc=args.s3_textract_document_logs_subfolder,
1358
+ load_local_jobs_loc=args.local_textract_document_logs_subfolder,
1359
+ )
1360
+
1361
+ # Find job type from the dataframe
1362
+ job_type = "document_text_detection" # default
1363
+ if not job_df.empty and "job_id" in job_df.columns:
1364
+ matching_jobs = job_df.loc[job_df["job_id"] == args.job_id]
1365
+ if not matching_jobs.empty and "job_type" in matching_jobs.columns:
1366
+ job_type = matching_jobs.iloc[0]["job_type"]
1367
+
1368
+ # Use configured bucket or override
1369
+ textract_bucket = args.textract_bucket if args.textract_bucket else ""
1370
+
1371
+ # Poll for completion and download results
1372
+ print("Polling for completion and downloading results...")
1373
+ downloaded_file_path, job_status, updated_job_df, output_filename = (
1374
+ poll_whole_document_textract_analysis_progress_and_download(
1375
+ job_id=args.job_id,
1376
+ job_type_dropdown=job_type,
1377
+ s3_output_prefix=args.textract_output_prefix,
1378
+ pdf_filename="", # Will be determined from job details
1379
+ job_df=job_df,
1380
+ s3_bucket_name=textract_bucket,
1381
+ load_s3_jobs_loc=args.s3_textract_document_logs_subfolder,
1382
+ load_local_jobs_loc=args.local_textract_document_logs_subfolder,
1383
+ local_output_dir=args.output_dir,
1384
+ poll_interval_seconds=args.poll_interval,
1385
+ max_polling_attempts=args.max_poll_attempts,
1386
+ )
1387
+ )
1388
+
1389
+ print("\n--- Textract Results Retrieved Successfully ---")
1390
+ print(f"Job Status: {job_status}")
1391
+ print(f"Downloaded File: {downloaded_file_path}")
1392
+ # print(f"Output Filename: {output_filename}")
1393
+
1394
+ elif args.textract_action == "list":
1395
+ from tools.textract_batch_call import load_in_textract_job_details
1396
+
1397
+ # List recent Textract jobs
1398
+ print("--- Listing Recent Textract Jobs ---")
1399
+
1400
+ job_df = load_in_textract_job_details(
1401
+ load_s3_jobs_loc=args.s3_textract_document_logs_subfolder,
1402
+ load_local_jobs_loc=args.local_textract_document_logs_subfolder,
1403
+ )
1404
+
1405
+ if job_df.empty:
1406
+ print("No recent Textract jobs found.")
1407
+ else:
1408
+ print(f"\nFound {len(job_df)} recent Textract jobs:")
1409
+ print("-" * 80)
1410
+ for _, job in job_df.iterrows():
1411
+ print(f"Job ID: {job.get('job_id', 'N/A')}")
1412
+ print(f"File: {job.get('file_name', 'N/A')}")
1413
+ print(f"Type: {job.get('job_type', 'N/A')}")
1414
+ print(f"Signatures: {job.get('signature_extraction', 'N/A')}")
1415
+ print(f"Date: {job.get('job_date_time', 'N/A')}")
1416
+ print("-" * 80)
1417
+
1418
+ else:
1419
+ print(f"Error: Invalid textract_action '{args.textract_action}'.")
1420
+ print("Valid options: 'submit', 'retrieve', or 'list'")
1421
+
1422
+ except Exception as e:
1423
+ print(f"\nAn error occurred during the Textract workflow: {e}")
1424
+
1425
+ else:
1426
+ print(f"Error: Invalid task '{args.task}'.")
1427
+ print("Valid options: 'redact', 'deduplicate', or 'textract'")
1428
+
1429
+
1430
# Script entry point: run the CLI redaction/deduplication/Textract workflow
# only when executed directly (not when imported as a module).
if __name__ == "__main__":
    main()
entrypoint.sh ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/sh

# Container entrypoint: selects the runtime mode based on environment
# variables. APP_MODE=lambda runs the AWS Lambda runtime interface client;
# otherwise RUN_FASTAPI=True serves the app via uvicorn, and anything else
# falls back to launching the plain Gradio app.

# Exit immediately if a command exits with a non-zero status.
set -e

echo "Starting in APP_MODE: $APP_MODE"

# --- Start the app based on mode ---

if [ "$APP_MODE" = "lambda" ]; then
    echo "Starting in Lambda mode..."
    # The CMD from Dockerfile will be passed as "$@"
    exec python -m awslambdaric "$@"
else
    echo "Starting in Gradio/FastAPI mode..."

    if [ "$RUN_FASTAPI" = "True" ]; then
        echo "Starting in FastAPI mode..."

        # Default host/port when not supplied by the environment.
        GRADIO_SERVER_NAME=${GRADIO_SERVER_NAME:-0.0.0.0}
        GRADIO_SERVER_PORT=${GRADIO_SERVER_PORT:-7860}

        # Start uvicorn server.
        echo "Starting with Uvicorn on $GRADIO_SERVER_NAME:$GRADIO_SERVER_PORT"
        # Quote the expansions: unquoted variables undergo field splitting
        # in POSIX sh, which would break the command if a value ever
        # contained whitespace.
        exec uvicorn app:app \
            --host "$GRADIO_SERVER_NAME" \
            --port "$GRADIO_SERVER_PORT" \
            --proxy-headers
    else
        echo "Starting in Gradio mode..."
        exec python app.py
    fi
fi
example_config.env ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Rename this file to app_config.env and place it in the folder config/ (i.e. it will be located at app_base_folder/config/app_config.env). The app will then automatically load in these variables at startup. See tools/config.py for all the possible config variables you can set, or src/app_settings.qmd for descriptions. Below are some suggested config variables to start with.
2
+
3
+ TESSERACT_FOLDER=tesseract/ # If in a custom folder, not needed if in PATH
4
+ POPPLER_FOLDER=poppler/poppler-24.02.0/Library/bin/ # If in a custom folder, not needed if in PATH
5
+ SHOW_LANGUAGE_SELECTION=True
6
+ SHOW_PADDLE_MODEL_OPTIONS=False
7
+ SHOW_VLM_MODEL_OPTIONS=False
8
+ SHOW_LOCAL_OCR_MODEL_OPTIONS=True
9
+ CHOSEN_LOCAL_OCR_MODEL=tesseract
10
+
11
+ SAVE_EXAMPLE_HYBRID_IMAGES=True
12
+ SAVE_PAGE_OCR_VISUALISATIONS=True
13
+ OVERWRITE_EXISTING_OCR_RESULTS=False
14
+ CONVERT_LINE_TO_WORD_LEVEL=False
15
+ LOAD_PADDLE_AT_STARTUP=False
16
+ SAVE_VLM_INPUT_IMAGES=True
17
+ SAVE_WORD_SEGMENTER_OUTPUT_IMAGES=True
18
+ PREPROCESS_LOCAL_OCR_IMAGES=False
19
+ SAVE_PREPROCESS_IMAGES=True
20
+
21
+ SESSION_OUTPUT_FOLDER=False # Save outputs into user session folders
22
+ DISPLAY_FILE_NAMES_IN_LOGS=False
23
+
24
+ RUN_AWS_FUNCTIONS=True # Set to False if you don't want to run AWS functions. You can remove all the environment variables in the following section if you don't want to use them
25
+ SAVE_LOGS_TO_DYNAMODB=True
26
+ S3_COST_CODES_PATH=cost_codes.csv
27
+ SHOW_AWS_TEXT_EXTRACTION_OPTIONS=True
28
+ SHOW_AWS_PII_DETECTION_OPTIONS=True
29
+ AWS_REGION=example-region
30
+ DOCUMENT_REDACTION_BUCKET=example-bucket
31
+ SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS=True
32
+ TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET=example-bucket-output
33
+ LOAD_PREVIOUS_TEXTRACT_JOBS_S3=True
34
+ ACCESS_LOG_DYNAMODB_TABLE_NAME=example-dynamodb-access-log
35
+ USAGE_LOG_DYNAMODB_TABLE_NAME=example-dynamodb-usage
36
+ FEEDBACK_LOG_DYNAMODB_TABLE_NAME=example-dynamodb-feedback
37
+ SHOW_COSTS=True
38
+ GET_COST_CODES=True
39
+ COST_CODES_PATH=config/cost_codes.csv
40
+ ENFORCE_COST_CODES=True
41
+ DEFAULT_COST_CODE=example_cost_code
42
+
43
+ CUSTOM_BOX_COLOUR=(128, 128, 128)
44
+ USE_GUI_BOX_COLOURS_FOR_OUTPUTS=False
45
+
46
+ GRADIO_SERVER_NAME=127.0.0.1
47
+ GRADIO_SERVER_PORT=7860
48
+
49
+
example_data/Bold minimalist professional cover letter.docx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c8551ac157f350b2093e5d8c89f68474f613350074201cff6d52d5ed5ec28ff
3
+ size 23992
example_data/Difficult handwritten note.jpg ADDED

Git LFS Details

  • SHA256: 28896bfa4c4d6ef48222a285c02529dc8967d15d799df5c4b4cf0f62224e7b6c
  • Pointer size: 130 Bytes
  • Size of remote file: 85.1 kB
example_data/Example-cv-university-graduaty-hr-role-with-photo-2.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:caf00ca5cb06b8019804d1a7eaeceec772607969e8cad6c34d1d583876345b90
3
+ size 116763
example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv ADDED
The diff for this file is too large to render. See raw diff
 
example_data/Partnership-Agreement-Toolkit_0_0.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0db46a784d7aaafb8d02acf8686523dd376400117d07926a5dcb51ceb69e3236
3
+ size 426602
example_data/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ another country or territory sign a formel agreement on behalf? of their communities endorsing a
2
+ soster citues international
example_data/combined_case_notes.csv ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Date,Social Worker,Client,Case Note
2
+ "January 3, 2023",Jane Smith,Alex D.,"Met with Alex at school following reports of increased absences and declining grades. Alex appeared sullen and avoided eye contact. When prompted about school, Alex expressed feelings of isolation and stated, ""No one gets me."" Scheduled a follow-up meeting to further explore these feelings."
3
+ "January 17, 2023",Jane Smith,Alex D.,"Met with Alex at the community center. Alex displayed sudden outbursts of anger when discussing home life, particularly in relation to a new stepfather. Alex mentioned occasional substance use, but did not specify which substances. Recommended a comprehensive assessment."
4
+ "February 5, 2023",Jane Smith,Alex D.,Home visit conducted. Alex's mother reported frequent arguments at home. She expressed concerns about Alex's new group of friends and late-night outings. Noted potential signs of substance abuse. Suggested family counseling.
5
+ "February 21, 2023",Jane Smith,Alex D.,"Met with Alex alone at my office. Alex appeared more agitated than in previous meetings. There were visible signs of self-harm on Alex's arms. When questioned, Alex became defensive. Immediate referral made to a mental health professional."
6
+ "March 10, 2023",Jane Smith,Alex D.,Attended joint session with Alex and a therapist. Alex shared feelings of hopelessness and admitted to occasional thoughts of self-harm. Therapist recommended a comprehensive mental health evaluation and ongoing therapy.
7
+ "March 25, 2023",Jane Smith,Alex D.,"Received a call from Alex's school about a physical altercation with another student. Met with Alex, who displayed high levels of frustration and admitted to the use of alcohol. Discussed the importance of seeking help and finding positive coping mechanisms. Recommended enrollment in an anger management program."
8
+ "April 15, 2023",Jane Smith,Alex D.,Met with Alex and mother to discuss progress. Alex's mother expressed concerns about Alex's increasing aggression at home. Alex acknowledged the issues but blamed others for provoking the behavior. It was decided that a more intensive intervention may be needed.
9
+ "April 30, 2023",Jane Smith,Alex D.,"Met with Alex and a psychiatrist. Psychiatrist diagnosed Alex with Oppositional Defiant Disorder (ODD) and co-morbid substance use disorder. A treatment plan was discussed, including medication, therapy, and family counseling."
10
+ "May 20, 2023",Jane Smith,Alex D.,"Met with Alex to discuss progress. Alex has started attending group therapy and has shown slight improvements in behavior. Still, concerns remain about substance use. Discussed potential for a short-term residential treatment program."
11
+ "January 3, 2023",Jane Smith,Jamie L.,"Met with Jamie at school after receiving reports of consistent tardiness and decreased participation in class. Jamie appeared withdrawn and exhibited signs of sadness. When asked about feelings, Jamie expressed feeling ""empty"" and ""hopeless"" at times. Scheduled a follow-up meeting to further explore these feelings."
12
+ "January 17, 2023",Jane Smith,Jamie L.,"Met with Jamie at the community center. Jamie shared feelings of low self-worth, mentioning that it's hard to find motivation for daily tasks. Discussed potential triggers and learned about recent family financial struggles. Recommended counseling and possible group therapy for peer support."
13
+ "February 5, 2023",Jane Smith,Jamie L.,Home visit conducted. Jamie's parents shared concerns about Jamie's increasing withdrawal from family activities and lack of interest in hobbies. Parents mentioned that Jamie spends a lot of time alone in the room. Suggested family therapy to open communication channels.
14
+ "February 21, 2023",Jane Smith,Jamie L.,Met with Jamie in my office. Jamie opened up about feelings of isolation and mentioned difficulty sleeping. No signs of self-harm or suicidal ideation were noted. Recommended a comprehensive mental health assessment to better understand the depth of the depression.
15
+ "March 10, 2023",Jane Smith,Jamie L.,"Attended a joint session with Jamie and a therapist. The therapist noted signs of moderate depression. Together, we discussed coping strategies and potential interventions. Jamie showed interest in art therapy."
16
+ "March 25, 2023",Jane Smith,Jamie L.,"Received feedback from Jamie's school that academic performance has slightly improved. However, social interactions remain limited. Encouraged Jamie to join school clubs or groups to foster connection."
17
+ "April 15, 2023",Jane Smith,Jamie L.,"Met with Jamie and parents to discuss progress. Parents have observed slight improvements in mood on some days, but overall, Jamie still appears to struggle. It was decided to explore medication as a potential aid alongside therapy."
18
+ "April 30, 2023",Jane Smith,Jamie L.,Met with Jamie and a psychiatrist. The psychiatrist diagnosed Jamie with Major Depressive Disorder (MDD) and suggested considering antidepressant medication. Discussed the potential benefits and side effects. Jamie and parents will think it over.
19
+ "May 20, 2023",Jane Smith,Jamie L.,"Jamie has started on a low dose of an antidepressant. Initial feedback is positive, with some improvement in mood and energy levels. Will continue monitoring and adjusting as necessary."
example_data/combined_case_notes.xlsx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09300597024591d0b5b4ef97faef12fcceb28fcbb6ea09260bc42f43967753a4
3
+ size 12579
example_data/doubled_output_joined.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6eeac353164447c2aa429196e1a6ffae4c095d7171e63c2d1cd1966fdf32d1ed
3
+ size 1274719
example_data/example_complaint_letter.jpg ADDED

Git LFS Details

  • SHA256: db33b67ebe685132a589593e4a3ca05f2dbce358b63de9142c2f2a36202e3f15
  • Pointer size: 131 Bytes
  • Size of remote file: 118 kB
example_data/example_of_emails_sent_to_a_professor_before_applying.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed0cd82b5b5826b851ca0e7c102d2d4d27580f7a90de4211a33178a6664d008d
3
+ size 8848
example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ page,text,left,top,width,height,line
2
+ 1,Partnership Agreement,0.516078,0.027879,0.440784,0.032424,1
3
+ 1,SisterCities,0.169804,0.033333,0.238431,0.028182,2
4
+ 1,INTERNATIONAL,0.170196,0.06697,0.237647,0.008788,3
5
+ 1,Toolkit,0.830588,0.07303,0.126667,0.025152,4
6
+ 1,Connect globally. Thrive locally.,0.169804,0.08697,0.238824,0.01303,5
7
+ 1,Types of Affiliations,0.117255,0.157576,0.241961,0.02,6
8
+ 1,Sister City Relationship,0.117647,0.187273,0.196863,0.013939,7
9
+ 1,"A Sister City relationship is formed when the mayor or highest elected official (or, if elections",0.117255,0.211212,0.738824,0.013636,8
10
+ 1,"do not take place, highest appointed official) from a U.S. community and a community in",0.117647,0.227273,0.70902,0.013939,9
11
+ 1,another country or territory sign a formal agreement on behalf of their communities endorsing a,0.117647,0.243636,0.761961,0.013636,10
12
+ 1,"""sister city/sister cities"" relationship. Sister city agreements shall be considered active/valid",0.118039,0.259697,0.731373,0.013939,11
13
+ 1,unless otherwise indicated by one or both of the respective communities.,0.118039,0.276061,0.58549,0.013636,12
14
+ 1,Sister Cities International shall formally recognize only those relationships by cities/members in,0.118039,0.299697,0.758824,0.013636,13
15
+ 1,good standing (i.e. who are current on membership dues) in its Membership Directory or on its,0.117647,0.316061,0.754902,0.013636,14
16
+ 1,"website. However, Sister Cities International shall not assert as invalid or otherwise impugn the",0.116863,0.332121,0.760784,0.013636,15
17
+ 1,legitimacy of those relationships formed by non-members.,0.118039,0.348485,0.466275,0.013636,16
18
+ 1,Friendship City,0.118039,0.372121,0.127059,0.013939,17
19
+ 1,"A Friendship City or Friendship Cities relationship is often formed by cities as a ""stepping",0.117255,0.395758,0.714118,0.013636,18
20
+ 1,"stone"" to a more formal ""Sister City"" agreement. Typically Friendship City agreements are",0.117647,0.411515,0.720392,0.014242,19
21
+ 1,referred to as such in the formal documents that are signed. Sister Cities International shall,0.118039,0.428182,0.72549,0.013636,20
22
+ 1,recognize Friendship City relationships by members in its Membership Directory and website.,0.118039,0.444242,0.747843,0.013636,21
23
+ 1,As per Sister Cities International Board of Directors:,0.117255,0.467879,0.413333,0.013636,22
24
+ 1,Sister Cities International will recognize a new sister cities affiliation between a,0.169412,0.492121,0.626667,0.013333,23
25
+ 1,"U.S. and an international community, even though another affiliation may exist",0.169412,0.507879,0.625098,0.013636,24
26
+ 1,"between that international community and a different U.S. community, only if a",0.169412,0.524545,0.62902,0.013636,25
27
+ 1,cooperative agreement among all involved communities is filed with Sister Cities,0.16902,0.540606,0.643137,0.013636,26
28
+ 1,"International. If a cooperative agreement is denied, or no response to the request",0.170196,0.556667,0.647843,0.013333,27
29
+ 1,"is received within a reasonable amount of time, Sister Cities International will",0.169412,0.57303,0.612157,0.012727,28
30
+ 1,recognize the partnership as a friendship city and it will be delineated as such,0.169412,0.589091,0.621176,0.013636,29
31
+ 1,with a symbol in the membership directories.,0.168627,0.605455,0.358824,0.013333,30
32
+ 1,The cooperative agreement must be sent by the Mayor/County,0.168627,0.628788,0.509412,0.013939,31
33
+ 1,"Executive/Governor of the requesting community, and must be sent to the",0.169804,0.645152,0.595294,0.014242,32
34
+ 1,Mayor/County Executive/Governor of each of the existing partnership,0.169804,0.661212,0.555294,0.013636,33
35
+ 1,communities. Although the Mayor/County Executive/Governor may request input,0.16902,0.677879,0.647451,0.013636,34
36
+ 1,"from, or may be given input by, the sister cities program, it is up to the discretion",0.168627,0.693939,0.647059,0.013939,35
37
+ 1,of the Mayor/County Executive/Governor to sign the cooperative agreement.,0.16902,0.709697,0.612941,0.013939,36
38
+ 1,Although Sister Cities International will help with the cooperative agreement,0.168627,0.726364,0.605882,0.013636,37
39
+ 1,"process, it is up to the requesting community to get the agreement signed. Sister",0.169412,0.742121,0.650196,0.013939,38
40
+ 1,"Cities International will not, in any way, force a community to ""share"" and sign",0.16902,0.758182,0.623922,0.014242,39
41
+ 1,the cooperative agreement.,0.168627,0.774848,0.219216,0.013333,40
42
+ 1,"To place a relationship into Emeritus status, the mayor or highest elected official of the U.S.",0.117255,0.798485,0.736471,0.013939,41
43
+ 1,community must write a letter to the mayor of the foreign city indicating that they wish to,0.118039,0.814545,0.70902,0.013636,42
44
+ 1,"remain sister cities, but understand that the relationship will remain inactive until such time as",0.118039,0.831212,0.747451,0.013333,43
45
+ 1,both cities are able to sustain an active relationship. Sister Cities International should be,0.118039,0.847273,0.705098,0.013636,44
46
+ 1,informed in writing by the mayor of the U.S. city of the situation. Sister Cities International will,0.118039,0.863333,0.746275,0.013636,45
47
+ 2,Partnership Agreement,0.516078,0.027879,0.440784,0.032424,1
48
+ 2,SisterCities,0.169804,0.033333,0.238824,0.028182,2
49
+ 2,INTERNATIONAL,0.170196,0.06697,0.237647,0.008788,3
50
+ 2,Toolkit,0.83098,0.072727,0.127059,0.025455,4
51
+ 2,Connect globally. Thrive locally.,0.169804,0.08697,0.239216,0.01303,5
52
+ 2,then place the partnership into Emeritus Status and will reflect this status in directories and all,0.117255,0.132424,0.751373,0.013333,6
53
+ 2,lists of sister city programs.,0.118039,0.148788,0.218431,0.013333,7
54
+ 2,"If a community wishes to terminate a sister city relationship, then a letter from the mayor or",0.118431,0.172424,0.732549,0.013333,8
55
+ 2,highest elected official of the U.S. city should be sent to the mayor of the sister city. Sister,0.118039,0.188485,0.721569,0.013636,9
56
+ 2,Cities International should be informed of this action in writing by the mayor of the U.S. city,0.118039,0.204848,0.72902,0.013333,10
57
+ 2,and Sister Cities International will then remove the partnership from its directories and all lists,0.117647,0.221212,0.746275,0.013333,11
58
+ 2,of sister city programs. We do not recommend terminating a relationship simply because it is,0.117647,0.237273,0.743529,0.013333,12
59
+ 2,"dormant. Many partnerships wax and wane over the years, and in many cases a dormant",0.117647,0.253939,0.713333,0.013333,13
60
+ 2,partnership may be reinvigorated by local members years after it has been inactive.,0.118039,0.269697,0.664314,0.013636,14
61
+ 2,General Guidelines,0.118039,0.295152,0.231765,0.016061,15
62
+ 2,In order for a sister city/county/state partnership to be recognized by Sister Cities International,0.118431,0.324242,0.754902,0.013636,16
63
+ 2,"(SCI), the two communities must sign formal documents which clearly endorse the link. This",0.118039,0.340606,0.74,0.013636,17
64
+ 2,presumes several key items: that the U.S. community is already a member of SCI and has,0.118039,0.35697,0.718039,0.013636,18
65
+ 2,followed proper procedures (e.g. passed a city council resolution declaring the intent to twin,0.117255,0.373333,0.737647,0.013636,19
66
+ 2,with the specific city); that both communities share a mutual commitment to the relationship;,0.117255,0.389394,0.740784,0.013636,20
67
+ 2,and that both have secured the necessary support structure to build a lasting relationship. You,0.117647,0.405455,0.758039,0.013333,21
68
+ 2,should check with your local sister city program to see if they have any additional requirements,0.117647,0.421818,0.760784,0.013636,22
69
+ 2,before pursuing a sister city relationship.,0.118039,0.437879,0.323137,0.013636,23
70
+ 2,"SCI often refers to these agreements as a ""Sister City Agreement"" or ""Memorandum of",0.118039,0.461515,0.696863,0.013939,24
71
+ 2,"Understanding."" However, as the following examples show, the actual name and format of",0.118039,0.477576,0.729804,0.013636,25
72
+ 2,your documents is left up to you.,0.117255,0.494242,0.262745,0.013636,26
73
+ 2,A few things to keep in mind as you draft your agreement:,0.117255,0.517879,0.463137,0.013636,27
74
+ 2,"Your agreement can range from the ceremonial, with language focusing on each city's",0.176471,0.542121,0.69098,0.013939,28
75
+ 2,"commitment to fostering understanding, cooperation, and mutual benefit to the precise,",0.176471,0.558485,0.701961,0.013333,29
76
+ 2,"with particular areas of interest, specific programs/activities, or more concrete goals",0.176078,0.574848,0.673725,0.013636,30
77
+ 2,related to anything from numbers of exchanges to economic development.,0.176863,0.591212,0.596863,0.013636,31
78
+ 2,"Don't try to include everything you plan to do. Some specifics, like particular areas of",0.177255,0.620303,0.681176,0.013939,32
79
+ 2,"interest or participating institutions are good to include. However, there's no need to",0.176471,0.636667,0.675686,0.013636,33
80
+ 2,include all the programs you plan to do if it makes the document too lengthy or limits,0.176863,0.652727,0.678824,0.013939,34
81
+ 2,the scope of projects. This is a formal document to establish the relationship; specific,0.176078,0.668788,0.684706,0.013636,35
82
+ 2,"tasks, responsibilities, or other nuts-and-bolts text related to implementation or",0.176078,0.685455,0.635686,0.013333,36
83
+ 2,administration of the partnership can be expressed more fully in a separate,0.176471,0.701212,0.600392,0.013636,37
84
+ 2,memorandum between the respective sister city committees. Your partnership,0.177255,0.717576,0.626667,0.013636,38
85
+ 2,agreement is a historical document and should not be dated or limited by being aligned,0.176471,0.733636,0.699216,0.013636,39
86
+ 2,with very specific tasks.,0.176078,0.750606,0.190196,0.013333,40
87
+ 2,Work with your counterparts. Remember that this is signed by both cities. You should,0.176078,0.779697,0.68549,0.013636,41
88
+ 2,share drafts of your agreement with your international partners and solicit feedback on,0.176471,0.795758,0.691765,0.013333,42
89
+ 2,what they'd like to see in the agreement. Be flexible to cultural or municipal priorities.,0.176471,0.811818,0.679216,0.013939,43
90
+ 2,Ask your counterparts to translate the agreement if it is drafted in English. It is,0.176078,0.841515,0.623137,0.013636,44
91
+ 2,important for the citizens of your partner community to be able to read and understand,0.176863,0.857576,0.693725,0.013939,1
92
+ 2,the commitment their city has made. Have someone in your own community who,0.176078,0.873939,0.649804,0.013636,2
93
+ 3,Partnership Agreement,0.516078,0.027879,0.441176,0.032121,3
94
+ 3,SisterCities,0.169804,0.033333,0.239216,0.028182,4
95
+ 3,INTERNATIONAL,0.170196,0.06697,0.237255,0.008788,5
96
+ 3,Toolkit,0.83098,0.07303,0.126667,0.025152,6
97
+ 3,Connect globally. Thrive locally.,0.169804,0.08697,0.239216,0.01303,7
98
+ 3,speaks that language check the foreign-language version to make sure it mirrors what,0.176471,0.132424,0.688235,0.013333,8
99
+ 3,you have in your own agreement.,0.176471,0.148788,0.264706,0.013333,9
100
+ 3,Keep it to one page. Ceremonial documents such as these partnership agreements,0.176863,0.178485,0.66549,0.013636,10
101
+ 3,work best if they can be posted in their entirety.,0.176078,0.194545,0.380392,0.013636,11
102
+ 3,Most sister city agreements include some acknowledgement of the founding principles,0.177255,0.224242,0.694902,0.013636,12
103
+ 3,"of the sister city movement- to promote peace through mutual respect, understanding,",0.176471,0.240303,0.698431,0.013333,13
104
+ 3,and cooperation.,0.176471,0.25697,0.13451,0.013333,14
105
+ 3,Consider using official letterhead and/or other embellishments such as city seals or,0.176863,0.286061,0.665882,0.013333,15
106
+ 3,logos to reflect your enhance the document. Sister city agreements are often posted at,0.176863,0.302121,0.695686,0.013636,16
107
+ 3,city hall or other municipal offices and should reflect their historical importance,0.176471,0.318485,0.630588,0.013333,17
108
+ 3,Look at other agreements your city has signed. These agreements may give you an idea,0.177255,0.347879,0.705098,0.013636,18
109
+ 3,"of what is acceptable or possible, and they may be in an easily replicable format. If you",0.176471,0.364242,0.695686,0.013636,19
110
+ 3,"cannot access older agreements please contact Sister Cities International, we may",0.176863,0.380303,0.663137,0.013636,20
111
+ 3,"have them on file, although we do not have copies of all partnership agreements.",0.176863,0.396667,0.64549,0.013636,21
112
+ 3,Documents must be signed by the top elected official of both communities.,0.177255,0.426364,0.601569,0.013333,22
113
+ 3,"Check with your mayor, city council, town clerk, et al. to make sure that the agreement",0.176863,0.455758,0.694118,0.013636,23
114
+ 3,"is OK with them. The mayor is the one putting his or her name on the paper, and you",0.176863,0.471818,0.677255,0.013333,24
115
+ 3,don't want to spend time developing an agreement which will never be signed.,0.176863,0.488182,0.629412,0.013636,25
116
+ 3,Official documents are usually signed during a formal ceremony recognizing the,0.176863,0.517576,0.638431,0.013636,26
117
+ 3,partnership. Be sure both communities receive a signed set of the official documents,0.177255,0.533939,0.683922,0.013636,27
118
+ 3,for their records.,0.176078,0.550606,0.131373,0.010606,28
119
+ 3,Remember to send your signed agreement to Sister Cities International. After we,0.177255,0.579697,0.645098,0.013636,29
120
+ 3,receive your agreement we will post the relationship in the City Directory and make sure,0.176863,0.595758,0.703137,0.013636,30
121
+ 3,it is included in our Annual Membership Directory.,0.176863,0.612121,0.398039,0.013333,31
122
+ 3,Remember that each city's sister city program is independent and can impose requirements,0.118431,0.640606,0.736471,0.013939,32
123
+ 3,"like the establishment of a committee, a review period, sustainability/funding plan, among",0.118039,0.65697,0.715686,0.013636,33
124
+ 3,"others, before sanctioning a sister city agreement. Check with your local program or mayor's",0.117647,0.672727,0.743529,0.014242,34
125
+ 3,office to see if this is the case.,0.117647,0.689091,0.241176,0.011515,35
126
+ 3,On the following pages you'll find a series of partnership agreements to give you an idea of,0.118039,0.717879,0.728627,0.013939,36
127
+ 3,"what is possible. While you should feel free to use some of the formatting and language, we",0.117255,0.734242,0.73451,0.013636,37
128
+ 3,encourage you to make your agreement your own and be creative with what you produce. If,0.117647,0.750606,0.737647,0.013636,38
129
+ 3,you are unsure about your agreement or want advice you can always solicit feedback by,0.117647,0.766667,0.708627,0.013636,39
130
+ 3,sending it to our Membership Director at akaplan@sister-cities.org or contacting us at (202),0.117647,0.782727,0.732157,0.013636,40
131
+ 3,347-8630.,0.117647,0.799394,0.080392,0.010303,41
132
+ 4,Partnership Agreement,0.516471,0.027879,0.440784,0.032727,1
133
+ 4,SisterCities,0.169412,0.033333,0.239608,0.028485,2
134
+ 4,INTERNATIONAL,0.170196,0.066667,0.238431,0.009091,3
135
+ 4,Toolkit,0.830588,0.072727,0.127843,0.025758,4
136
+ 4,Connect globally. Thrive locally.,0.169412,0.08697,0.239608,0.013333,5
137
+ 4,"jull bubzig 2000 3,312",0.378039,0.291212,0.32549,0.019394,6
138
+ 4,ABU DHABI MUNICIPALITY & TOWN PLANNING,0.376471,0.316667,0.327451,0.016667,7
139
+ 4,AN AGREEMENT FOR THE ESTABLISHMENT OF,0.260784,0.373636,0.52549,0.012727,8
140
+ 4,SISTER CITIES RELATIONSHIP,0.337647,0.393636,0.342745,0.012121,9
141
+ 4,BETWEEN,0.454902,0.413636,0.110588,0.011212,10
142
+ 4,THE CITY OF ABU DHABI ( U. A.E),0.337255,0.432727,0.375686,0.013939,11
143
+ 4,AND,0.487843,0.452727,0.048235,0.011212,12
144
+ 4,"HOUSTON, TEXAS ( U.S.A)",0.385882,0.471515,0.298039,0.014848,13
145
+ 4,"The Sister City Program, administered by Sister Cities International, was initiated",0.221961,0.525455,0.597255,0.01303,14
146
+ 4,By the President of the United States of America in 1956 to encourage greater,0.222745,0.539394,0.561961,0.012727,15
147
+ 4,Friendship and understanding between the United States and other nations through,0.222745,0.553333,0.608235,0.012727,16
148
+ 4,Direct personal contact: and,0.222745,0.567576,0.20549,0.012424,17
149
+ 4,"In order to foster those goals, the people of Abu Dhabi and Houston, in a gesture of",0.222353,0.594242,0.603529,0.012424,18
150
+ 4,"Friendship and goodwill, agree to collaborate for the mutual benefit of their",0.222745,0.608182,0.547843,0.01303,19
151
+ 4,"Communities by exploring education, economic and cultural opportunities.",0.222353,0.622121,0.541961,0.012121,20
152
+ 4,"Abu Dhabi and Houston, sharing a common interest in energy, technology and",0.221569,0.648788,0.574118,0.012424,21
153
+ 4,"medicine, and the desire to promote mutual understanding among our citizens do",0.222353,0.66303,0.588235,0.012121,22
154
+ 4,"hereby proclaim themselves Sister Cities beginning on the 13th day of March 2001,",0.221961,0.673636,0.594118,0.015758,23
155
+ 4,the date of Houston City Council resolution estatblishing the Sister City,0.221961,0.690303,0.519608,0.01303,24
156
+ 4,relationship became effective.,0.221569,0.705152,0.217647,0.012424,25
157
+ 4,"Signed on this 26 of October 2002, in duplicate in the Arabic and English",0.221569,0.732121,0.533333,0.01303,26
158
+ 4,"Languages, both text being equally authentic.",0.221961,0.746667,0.328627,0.012727,27
159
+ 4,A,0.344314,0.768485,0.084706,0.030303,28
160
+ 4,Sheikh Mohammed bin Butti AI Hamed,0.245882,0.806364,0.366275,0.010909,29
161
+ 4,Lee P.Brown,0.729412,0.806364,0.118824,0.010303,30
162
+ 4,Mayor of Houston,0.704706,0.823333,0.166667,0.012424,31
163
+ 4,Chairman of Abu Dhabi Municipality,0.24549,0.823636,0.342353,0.012727,32
164
+ 4,&Town Planning,0.324314,0.841212,0.155686,0.012424,33
165
+ 5,Partnership Agreement,0.516078,0.027879,0.441176,0.032424,1
166
+ 5,SisterCities,0.169412,0.033333,0.239608,0.028485,2
167
+ 5,INTERNATIONAL,0.17098,0.066667,0.237255,0.009091,3
168
+ 5,Toolkit,0.83098,0.072727,0.127059,0.025758,4
169
+ 5,Connect globally. Thrive locally.,0.169412,0.08697,0.239216,0.013333,5
170
+ 5,THE CITY OF NEW YORK,0.438824,0.262121,0.240784,0.009697,6
171
+ 5,OFFICE OF THE MAYOR,0.450196,0.27697,0.220392,0.009697,7
172
+ 5,"NEW YORK, N.Y. 10007",0.461176,0.29303,0.196863,0.010303,8
173
+ 5,THE NEW YORK CITY-LONDON SISTER CITY PARTNERSHIP,0.267451,0.355758,0.582745,0.011818,9
174
+ 5,Memorandum of Understanding,0.420392,0.371212,0.274902,0.013333,10
175
+ 5,The Sister City partnership between New York City and London will foster mutually,0.201176,0.402121,0.674118,0.014242,11
176
+ 5,beneficial solutions to common challenges for these two great cosmopolitan entities.,0.201176,0.417273,0.66902,0.013636,12
177
+ 5,"Consequently, the Sister City relationship between the two will be one of the most",0.201176,0.432727,0.652549,0.015152,13
178
+ 5,"important in their network of global partnerships, as it strives to:",0.201176,0.448182,0.50902,0.015455,14
179
+ 5,Encourage and publicize existing exchanges between London and New York City so,0.230588,0.480303,0.671373,0.015152,15
180
+ 5,that they can flourish to benefit a wider cross-section of the citizens of both;,0.230588,0.496061,0.602353,0.015152,16
181
+ 5,"Support and promote the development of new social, economic, academic and",0.230196,0.512424,0.618431,0.015455,17
182
+ 5,community programs to encourage both cities' citizens to share their experiences as a,0.229804,0.527879,0.678039,0.014848,18
183
+ 5,medium for learning from one another;,0.229804,0.543636,0.309412,0.013939,19
184
+ 5,Generate an improvement of the operation of the cities' various government agencies,0.229804,0.56,0.676078,0.014545,20
185
+ 5,by serving as a conduit of information;,0.22902,0.575758,0.307843,0.014848,21
186
+ 5,"Identify themes, common to both, that can generate new initiatives to further and",0.229412,0.591818,0.640784,0.015152,22
187
+ 5,"nurture the increasingly powerful financial, social and cultural relationships between",0.22902,0.607576,0.671373,0.014242,23
188
+ 5,the cities;,0.22902,0.624545,0.076471,0.012424,24
189
+ 5,Promote key mayoral priorities relevant to both London and New York City;,0.228627,0.639394,0.608627,0.015152,25
190
+ 5,Provide financial or in kind support to community-led programs that advance the,0.228627,0.656061,0.641569,0.013636,26
191
+ 5,aims of the Sister City partnership;,0.22902,0.672121,0.275294,0.013636,27
192
+ 5,"With the above purposes in mind, the Mayor of the City of New York and the Mayor of",0.198824,0.702424,0.697647,0.014848,28
193
+ 5,London solemnly confirm that these two cities are united by an official partnership by the,0.198824,0.718182,0.710196,0.014545,29
194
+ 5,protocol of this Memorandum of Understanding.,0.198431,0.733939,0.384314,0.015152,30
195
+ 5,This agreement will go into effect from the date of signatures.,0.310196,0.780606,0.488235,0.014545,31
196
+ 5,Thedder Rudolph W. Giuliani,0.178824,0.795455,0.244314,0.100909,32
197
+ 5,Signed in March of 2001,0.455686,0.796364,0.19451,0.013636,33
198
+ 5,Ken Mayor Livingstone,0.672157,0.877576,0.132941,0.029091,34
199
+ 5,Mayor,0.311373,0.894848,0.053333,0.012727,35
200
+ 5,New York City,0.287843,0.909091,0.121176,0.013333,36
201
+ 5,London,0.701961,0.909091,0.061569,0.010606,37
202
+ 6,Partnership Agreement,0.515686,0.027576,0.441961,0.03303,1
203
+ 6,SisterCities,0.169412,0.03303,0.24,0.028182,2
204
+ 6,INTERNATIONAL,0.169804,0.066667,0.238431,0.009091,3
205
+ 6,Toolkit,0.83098,0.072727,0.127451,0.025758,4
206
+ 6,Connect globally. Thrive locally.,0.169412,0.08697,0.239608,0.013333,5
207
+ 6,CHIC OF STATE,0.247451,0.190606,0.141961,0.036364,6
208
+ 6,City of Long Beach,0.388627,0.196667,0.476471,0.066364,7
209
+ 6,California,0.551373,0.257273,0.136471,0.033333,8
210
+ 6,Sister City Agreement,0.321961,0.305455,0.378431,0.035152,9
211
+ 6,between the,0.464706,0.352727,0.084314,0.009697,10
212
+ 6,City of Long Beach,0.38,0.378485,0.252549,0.01697,11
213
+ 6,"California, USA",0.4,0.397576,0.21098,0.016061,12
214
+ 6,and the,0.48,0.415152,0.053333,0.009091,13
215
+ 6,City of San Pablo de Manta,0.321569,0.428788,0.369804,0.01697,14
216
+ 6,"Ecuador, South America",0.347451,0.447879,0.317255,0.015152,15
217
+ 6,"In accordance with the authorization and approval expressed by the City of Long Beach,",0.261569,0.482121,0.536863,0.012121,16
218
+ 6,"California, USA, and the City of San Pablo de Manta, Ecundor, South America, it is declared",0.217647,0.492727,0.581176,0.01303,17
219
+ 6,"that a ""Sister City Agreement between the two cities is hereby established for the following",0.217647,0.502727,0.581569,0.012121,18
220
+ 6,purposes:,0.216863,0.516061,0.058039,0.009394,19
221
+ 6,(1) to promote and expand the effective and mutually beneficial cooperation between,0.278824,0.532727,0.520392,0.012424,20
222
+ 6,the people of Long Beach and the people of San Pablo de Manta; and,0.218039,0.543636,0.40549,0.012424,21
223
+ 6,"(2) to promote international goodwill, understanding, and expanded business",0.279216,0.56303,0.520784,0.012424,22
224
+ 6,"relations between the two cities and their respective nations by the exchange of people, ideas, and",0.218039,0.573636,0.581569,0.012121,23
225
+ 6,"information in a unide variety of economic, social, cultural, municipal, environmental,",0.218039,0.584242,0.581176,0.012121,24
226
+ 6,"professional, technical, youth, and other endeavors; and",0.217647,0.594848,0.333333,0.012121,25
227
+ 6,"(3) to foster and encourage charitable, scientific, trade and commerce, literary and",0.279608,0.613939,0.520784,0.012727,26
228
+ 6,educational activities between the two cities;,0.218039,0.625455,0.265882,0.009697,27
229
+ 6,This Sister City Agreement shall be officially established and shall become effective when,0.263137,0.644545,0.536863,0.012727,28
230
+ 6,"this document has been duly executed by the Mayor of Long Beach, California, USA, and the",0.218824,0.654848,0.581961,0.012424,29
231
+ 6,"Mayor of San Pablo de Manta, Ecundor, South America.",0.218431,0.665758,0.338824,0.012121,30
232
+ 6,STATE OFFICE,0.276471,0.713636,0.050588,0.048788,31
233
+ 6,Beverly 0 Neill,0.587451,0.736667,0.121961,0.013636,32
234
+ 6,"Mayor, City of Long Beach",0.542353,0.751212,0.21098,0.013636,33
235
+ 6,"California, USA",0.582745,0.765758,0.125098,0.01303,34
236
+ 6,10.2aulus,0.490588,0.771818,0.220392,0.062424,35
237
+ 6,Ing. Jorge O. Zambrano Cedeño,0.527059,0.825152,0.242745,0.013333,36
238
+ 6,"Mayor, City of San Pablo de Manta",0.505098,0.839394,0.277647,0.013636,37
239
+ 6,"Ecuador, South America",0.551765,0.854242,0.188235,0.011818,38
240
+ 6,"Dated: September 19, 2000",0.544706,0.883333,0.202745,0.01303,39
241
+ 7,Partnership Agreement,0.516078,0.027879,0.441176,0.032424,1
242
+ 7,SisterCities,0.169412,0.03303,0.24,0.028485,2
243
+ 7,INTERNATIONAL,0.170196,0.066667,0.237647,0.009091,3
244
+ 7,Toolkit,0.83098,0.072727,0.127451,0.025758,4
245
+ 7,Connect globally. Thrive locally.,0.169412,0.08697,0.239216,0.013333,5
246
+ 7,REAFFIRMATION OF SISTER CITIES DECLARATION,0.324706,0.165152,0.483529,0.013939,6
247
+ 7,adopted by,0.2,0.213333,0.080392,0.013636,7
248
+ 7,THE HONORABLE RICHARD M. DALEY,0.396078,0.214242,0.335686,0.012424,8
249
+ 7,MAYOR OF CHICAGO,0.472549,0.231212,0.18549,0.011515,9
250
+ 7,and,0.199608,0.260909,0.026275,0.010606,10
251
+ 7,THE HONORABLE ZHANG RONGMAO,0.401961,0.261212,0.323137,0.011212,11
252
+ 7,MAYOR OF SHENYANG,0.463529,0.273636,0.202353,0.011212,12
253
+ 7,ON,0.551765,0.298182,0.026667,0.011515,13
254
+ 7,"JUNE 5, 1995",0.500392,0.323636,0.128235,0.014848,14
255
+ 7,"On this the tenth anniversary of the signing of a sister city agreement, in order to further",0.255686,0.36303,0.67098,0.015152,15
256
+ 7,the traditional links of friendship between Chicago and Shenyang and to reaffirm their mutual,0.198824,0.378788,0.727843,0.015455,16
257
+ 7,"aspiration to work in unison for the benefit of their cities and nations, the Honorable Mayor",0.199608,0.394848,0.727843,0.014848,17
258
+ 7,"Richard M. Daley, Mayor of the City of Chicago, and the Honorable Zhang Rongmao, Mayor",0.199216,0.411212,0.727451,0.014242,18
259
+ 7,"of the City of Shenyang, on this fifth day of June 1995, do hereby acknowledge and reaffirm the",0.199216,0.42697,0.72549,0.014848,19
260
+ 7,sister cities agreement between the City of Chicago and the City of Shenyang.,0.199608,0.443636,0.57451,0.014242,20
261
+ 7,"The City of Chicago and the City of Shenyang on the basis of friendly cooperation,",0.256078,0.473939,0.665098,0.015152,21
262
+ 7,equality and mutual benefit will continue to develop a sister cities relationship to promote and,0.2,0.490303,0.724706,0.014242,22
263
+ 7,broaden economic cooperation and cultural exchanges between the two cities.,0.199216,0.506061,0.57451,0.014242,23
264
+ 7,The two cities do hereby declare their interest in exploring the establishment of business,0.255294,0.537273,0.668235,0.015455,24
265
+ 7,and trade relations between Chicago and Shenyang.,0.198824,0.554545,0.387843,0.013636,25
266
+ 7,"In addition, exchanges will be promoted in the area of the arts such as exhibits, music,",0.254118,0.583939,0.666667,0.015455,26
267
+ 7,dance and other cultural activities.,0.198431,0.601212,0.256471,0.010606,27
268
+ 7,"In addition, exchanges will be promoted in education and the establishment of contacts",0.254118,0.630303,0.668627,0.015758,28
269
+ 7,within educational institutions encouraged.,0.198824,0.647273,0.32,0.014242,29
270
+ 7,"In addition, we declare our intention to promote exchanges in such fields as science and",0.253725,0.678182,0.668627,0.014848,30
271
+ 7,"technology, sports, health, youth and any areas that will contribute to the prosperity and the",0.198039,0.693636,0.722745,0.015152,31
272
+ 7,further development of friendship between the people of our two cities.,0.194902,0.711515,0.525098,0.013636,32
273
+ 7,3h.5.,0.593725,0.750606,0.218039,0.06303,33
274
+ 7,THE HONORABLE ZHANG RONGMAO,0.588627,0.819394,0.287843,0.011818,34
275
+ 7,THE HONORABLE RICHARD M. DALEY,0.197255,0.821515,0.303529,0.010606,35
276
+ 7,MAYOR OF SHENYANG,0.587451,0.835455,0.177647,0.010303,36
277
+ 7,MAYOR OF CHICAGO,0.195686,0.835758,0.164706,0.010606,37
example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_review_file.csv ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ image,page,label,color,xmin,ymin,xmax,ymax,id,text
2
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_0.png,1,ADDRESS,"(0, 0, 0)",0.598431,0.524545,0.63098,0.535455,EG3nykuwvxbk,U.S.
3
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_0.png,1,ADDRESS,"(0, 0, 0)",0.820392,0.798485,0.854118,0.809394,jy1R42e6phNz,U.S.
4
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_0.png,1,ADDRESS,"(0, 0, 0)",0.433333,0.863333,0.46549,0.873939,9sbrsroLfZy0,U.S.
5
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_1.png,2,ADDRESS,"(0, 0, 0)",0.354118,0.188788,0.386275,0.199697,k7bWBsQQchJZ,U.S.
6
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_1.png,2,ADDRESS,"(0, 0, 0)",0.780392,0.204848,0.812941,0.215758,peo6UqIxrjmR,U.S.
7
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_2.png,3,EMAIL,"(0, 0, 0)",0.447843,0.78303,0.648627,0.796667,DIfz0LenOtQv,akaplan@sister-cities.org
8
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_2.png,3,PHONE,"(0, 0, 0)",0.809804,0.78303,0.850196,0.796667,odJdySe9XrAn,(202)
9
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_2.png,3,PHONE,"(0, 0, 0)",0.117647,0.799394,0.198431,0.809697,iURSkUM7BbUG,347-8630
10
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.637647,0.432727,0.712941,0.44697,fRxAD9qm856s,U. A.E
11
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.489412,0.43303,0.614902,0.444545,qzRFPlNbslpH,ABU DHABI
12
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.385882,0.472121,0.593725,0.486364,v1uLbGsofN1f,"HOUSTON, TEXAS"
13
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.392549,0.539697,0.573725,0.549394,MvbPQiHvSdL7,United States of America
14
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.539216,0.553333,0.635686,0.563333,05U3cgj5w9PY,United States
15
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.534902,0.594242,0.615294,0.603939,uHMikyBlMq5f,Abu Dhabi
16
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.651373,0.594242,0.717255,0.605455,XNUE0GopIBaf,Houston
17
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.221569,0.65,0.301176,0.659697,6FjbNu2CGA9n,Abu Dhabi
18
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.337647,0.65,0.404314,0.660606,Yvmm2225ityu,Houston
19
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,HANDWRITING,"(0, 0, 0)",0.344314,0.768485,0.42902,0.798788,EwTcqq7PENU8,A
20
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,NAME,"(0, 0, 0)",0.245882,0.806364,0.612549,0.817576,Mj4gqwbgsZWp,Sheikh Mohammed bin Butti AI Hamed
21
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,NAME,"(0, 0, 0)",0.52,0.806364,0.612549,0.81697,RXYOVgLwq8Ke,AI Hamed
22
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,NAME,"(0, 0, 0)",0.729412,0.806364,0.848235,0.816667,REPZhwFWGoTc,Lee P.Brown
23
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,NAME,"(0, 0, 0)",0.245882,0.806667,0.51451,0.817576,rFdxMRFRWLRJ,Sheikh Mohammed bin Butti
24
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.366667,0.823939,0.465098,0.834242,5iYCxRGdPG1i,Abu Dhabi
25
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.577647,0.262121,0.68,0.271515,3ZR43H3yYNdy,NEW YORK
26
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.461176,0.29303,0.555294,0.303333,WNoitmR9A6lu,NEW YORK
27
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.461176,0.29303,0.658039,0.303333,HjrhxMQhovlF,NEW YORK N.Y. 10007
28
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.563137,0.29303,0.658039,0.302121,nPN7g7UcnX4u,N.Y. 10007
29
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.314118,0.356667,0.42549,0.367576,ZoJf29CB3Wrq,NEW YORK
30
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.655294,0.480909,0.718431,0.491515,iezAqmD2ilnb,London
31
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.708627,0.639394,0.837255,0.652727,tWAuJEQVpfhi,New York City
32
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.60902,0.64,0.67098,0.650606,NaW3mmmlhMW9,London
33
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.667059,0.702727,0.751373,0.713636,pgMiwuMiBp8B,New York
34
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.198824,0.720303,0.261569,0.731212,fPvElSFZFRoL,London
35
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,HANDWRITING,"(0, 0, 0)",0.178824,0.795455,0.281961,0.896364,DfniF7P2bXAw,Thedder
36
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,NAME,"(0, 0, 0)",0.178824,0.795455,0.423529,0.896364,QwnWsAeslO5f,Thedder Rudolph W. Giuliani
37
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,NAME - ADDRESS,"(0, 0, 0)",0.672157,0.877576,0.80549,0.891212,Vdp95SShYOEO,Ken Livingstone
38
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.710196,0.877576,0.80549,0.891212,H5DGqsucPAjc,Livingstone
39
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,NAME,"(0, 0, 0)",0.672157,0.877879,0.705098,0.888182,qotGtnMbhAJr,Ken
40
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.287843,0.909091,0.40902,0.922727,sFX0tNJJzpE5,New York City
41
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.701961,0.909091,0.763922,0.919697,2xFbVTbxiOhC,London
42
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.55451,0.203636,0.86549,0.258485,Nfe3WTBembGQ,Long Beach
43
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.551373,0.257273,0.687843,0.290606,kndQY5X4itc8,California
44
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.558824,0.397879,0.611373,0.410303,B5vq8yhWLeOg,USA
45
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.425882,0.429091,0.691373,0.441818,OtNgqUkoEaZb,San Pablo de Manta
46
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.347451,0.447879,0.665098,0.46303,Q52VzBx2SWNF,"Ecuador, South America"
47
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.724314,0.482121,0.798431,0.493939,O7gd9ywvKsKh,"Long Beach,"
48
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.425098,0.49303,0.506275,0.502727,DzYr3xrM8Tvv,San Pablo de
49
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.425098,0.49303,0.715294,0.50303,iZ0knpQD54UU,"San Pablo de Manta, Ecundor, South America"
50
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.509804,0.49303,0.715294,0.50303,pZnYGzr7Pwsl,"Manta, Ecundor, South America"
51
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.217647,0.493333,0.321961,0.504242,r7Aar8FNQF6D,"California, USA"
52
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.471765,0.543636,0.596863,0.553939,zg9uBDlSuuA1,San Pablo de Manta
53
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.295294,0.544242,0.36549,0.556061,A0OY6RjMEocW,Long Beach
54
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.563137,0.655152,0.748627,0.667576,HQlTdEUhOCgI,"Long Beach, California, USA"
55
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.463529,0.665758,0.557255,0.674848,bCN9b7kJw0Ik,South America
56
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.277647,0.666061,0.403529,0.676061,qffN3bDgWRMk,San Pablo de Manta
57
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.587451,0.736667,0.709804,0.750303,eqMENFw5mbnL,Beverly 0 Neill
58
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.663137,0.751212,0.753333,0.764545,POqPQVBCES8h,Long Beach
59
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.582745,0.765758,0.708235,0.779091,mjrjsSMOxwaY,"California, USA"
60
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,HANDWRITING,"(0, 0, 0)",0.490588,0.771818,0.71098,0.834242,xL8dSawihWuY,10.2aulus
61
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,NAME,"(0, 0, 0)",0.559608,0.825152,0.769804,0.838485,fHyvwmbOgLMJ,Jorge O. Zambrano Cedeño
62
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.624314,0.839394,0.782745,0.850303,zGhskyehufSv,San Pablo de Manta
63
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.551765,0.854242,0.74,0.866061,dSPXmtb8M4nt,"Ecuador, South America"
64
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.556471,0.215152,0.731765,0.226667,BEhuvaI5BVaR,RICHARD M. DALEY
65
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.563137,0.261212,0.725098,0.272424,coo8KK7q6A72,ZHANG RONGMAO
66
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.566275,0.273636,0.666275,0.285152,0P9rVSbeNdB4,SHENYANG
67
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.526667,0.380303,0.588235,0.394242,1GDArufutI5y,Chicago
68
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.628235,0.380606,0.702353,0.394242,QyD751r4fCU1,Shenyang
69
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.736863,0.411515,0.868235,0.424545,rntIekANI8BO,Zhang Rongmao
70
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.199216,0.411818,0.34,0.424848,96TaHazXGIM7,Richard M. Daley
71
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.514902,0.412424,0.580784,0.425758,kbyVj6qhZSPi,Chicago
72
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.696471,0.443939,0.774118,0.45697,rJpaMvepsNln,Shenyang
73
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.353725,0.474545,0.415686,0.489091,PokCVpLQmDki,Chicago
74
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.407451,0.554545,0.469804,0.568182,HqVr414KRg59,Chicago
75
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,HANDWRITING,"(0, 0, 0)",0.593725,0.750606,0.811765,0.813636,xdawEv0DUH6P,3h.5.
76
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.730196,0.819394,0.876471,0.830606,Gghr7ccN6lS2,ZHANG RONGMAO
77
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.34,0.821515,0.501176,0.831515,vOMIv1RS5Sag,RICHARD M. DALEY
example_data/example_outputs/Partnership-Agreement-Toolkit_0_0_ocr_results_with_words_textract.csv ADDED
The diff for this file is too large to render. See raw diff
 
example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv ADDED
@@ -0,0 +1,923 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ page,text,left,top,width,height,line
2
+ 1,5-Point Networking Email,0.404314,0.050606,0.189804,0.012121,1
3
+ 1,"Steve Dalton, the author of The 2-Hour Job Search believes the perfect networking email is a ""5-Point E-mail"". The five",0.058824,0.086061,0.859608,0.012727,2
4
+ 1,points are as follows:,0.059216,0.10303,0.152941,0.012727,3
5
+ 1,1. 100 words or less,0.088627,0.136667,0.156078,0.010303,4
6
+ 1,2. No mention of jobs (in subject or body),0.088235,0.153333,0.31451,0.012727,5
7
+ 1,"3. Connection goes first (e.g., ND connection)",0.087843,0.170606,0.341569,0.01303,6
8
+ 1,4. Generalize your interest,0.087843,0.187879,0.205098,0.012424,7
9
+ 1,5. Maintain control of the follow up,0.088627,0.204545,0.27098,0.012727,8
10
+ 1,Here's an example of what a 5-Point email would look like:,0.059608,0.255455,0.42549,0.012727,9
11
+ 1,Subject: Notre Dame MBA Student Seeking Your Advice,0.117255,0.289394,0.414118,0.012424,10
12
+ 1,"Dear Mr. Jones,",0.118039,0.323939,0.112549,0.011515,11
13
+ 1,"My name is Brooke Franklin, and I'm a first-year Notre Dame MBA student who found your",0.118431,0.35697,0.661569,0.01303,12
14
+ 1,information in the ND alumni database. May I have 15 minutes of your time to ask you about,0.118039,0.374242,0.677255,0.012727,13
15
+ 1,your experience with IBM? I'm trying to learn more about marketing careers at technology,0.117255,0.391212,0.660784,0.01303,14
16
+ 1,companies and your insights would be very helpful.,0.117647,0.407879,0.373333,0.01303,15
17
+ 1,"I realize this may be a busy time for you, so if we're unable to connect this week, I'll try again",0.118039,0.442121,0.674902,0.012727,16
18
+ 1,next week to see whether that is more convenient.,0.118039,0.459091,0.370588,0.010303,17
19
+ 1,"Thank you for your time,",0.117255,0.492727,0.179216,0.012727,18
20
+ 1,Brooke,0.118431,0.51,0.050588,0.01,19
21
+ 1,The most important part of this email may be the follow-up; an email like this allows you to reach out again in a week if,0.058431,0.543333,0.872157,0.01303,20
22
+ 1,you haven't heard back without feeling like you're bothering the person at the other end. If you don't hear anything,0.058431,0.560606,0.843922,0.01303,21
23
+ 1,"after the second attempt, you can probably cross him/her off your list and move on to the next contact.",0.058824,0.577273,0.755686,0.01303,22
24
+ 2,36 Westmoreland Drive,0.705764,0.026796,0.209996,0.011403,1
25
+ 2,Newcastle upon Tyne,0.723499,0.04333,0.192664,0.013968,2
26
+ 2,NE1 8LT,0.836759,0.059863,0.079807,0.011117,3
27
+ 2,Mr Mark Wilson,0.083837,0.076112,0.138251,0.011403,4
28
+ 2,UK Health Trust,0.083837,0.09236,0.143087,0.011403,5
29
+ 2,18 Whitehall Square,0.084643,0.108609,0.179766,0.013968,6
30
+ 2,London,0.083837,0.125428,0.066102,0.011117,7
31
+ 2,SW1 9LT,0.083837,0.141391,0.083031,0.011403,8
32
+ 2,11th January 2015,0.755744,0.154789,0.161225,0.017389,9
33
+ 2,Dear Mr Wilson,0.083837,0.174173,0.137042,0.011403,10
34
+ 2,Re: Community Health Development Officer [HD/12/2014],0.083837,0.201539,0.544135,0.014253,11
35
+ 2,"I am writing to apply for the above post, as advertised on the Health UK recruitment site. I am",0.08424,0.228905,0.828295,0.014253,12
36
+ 2,a sociology graduate with a 2: 1from Newcastle University. I have relevant health awareness,0.083434,0.245439,0.822249,0.014253,13
37
+ 2,"experience, and I am looking for a position where I can employ my knowledge and skills in",0.083434,0.261973,0.802499,0.013968,14
38
+ 2,support of health and community development. I enclose my CV for your attention.,0.083434,0.277936,0.731963,0.014253,15
39
+ 2,I am eager to work for UK Health Trust because of your ground-breaking work within the field,0.08424,0.305302,0.825877,0.014253,16
40
+ 2,of community health. I became aware of the work of the Trust when carrying out my,0.083434,0.322121,0.744055,0.013968,17
41
+ 2,"dissertation, 'Generational Change in Local Health Awareness, where I researched health",0.083031,0.338084,0.798468,0.014253,18
42
+ 2,awareness of children and elderly people in a deprived location. I referred to a number of,0.083031,0.354618,0.792019,0.013968,19
43
+ 2,publications produced by UK Health Trust and was impressed by the innovative techniques,0.083837,0.371152,0.809351,0.013968,20
44
+ 2,your organisation uses to engage local community members in projects. The Community,0.083031,0.387685,0.788795,0.014253,21
45
+ 2,Health Development Officer position would further develop my existing abilities and my,0.08424,0.403934,0.771463,0.014253,22
46
+ 2,"understanding of community development, allowing me to contribute in a practical way to",0.083837,0.420468,0.789601,0.013968,23
47
+ 2,enhancing the health of disadvantaged people.,0.083434,0.436716,0.415961,0.013968,24
48
+ 2,The volunteer development aspect of the position particularly appeals to me. I have worked,0.083031,0.469213,0.811769,0.014538,25
49
+ 2,"in the voluntary sector, providing services tackling health inequalities and promoting healthy",0.083837,0.485747,0.814994,0.014253,26
50
+ 2,living in Newcastle. I promoted health awareness through one to one sessions and in large,0.083434,0.501995,0.805723,0.014253,27
51
+ 2,"groups and developed interpersonal skills, confidence and patience when engaging and",0.083031,0.518529,0.787183,0.014253,28
52
+ 2,"motivating participants. While raising the group's profile using social media, the local press",0.083434,0.534778,0.804917,0.013968,29
53
+ 2,"and at presentations to youth clubs, faith meetings and care homes I recognised the need to",0.083434,0.551596,0.820637,0.013968,30
54
+ 2,"change my delivery style to suit the audience. As a volunteer teacher in Ghana, I developed",0.083434,0.56756,0.8158,0.014253,31
55
+ 2,communication and team-building skills essential to your advertised role; liaising with,0.083434,0.584094,0.753325,0.013968,32
56
+ 2,colleagues and parents and a lively group of twenty-five 7-8 year olds to arrange a,0.083434,0.600627,0.731963,0.014253,33
57
+ 2,"community event. My retail experience, coupled with my extracurricular activities additionally",0.083434,0.617161,0.822249,0.013968,34
58
+ 2,"enhanced my ability to develop others, as I was responsible for inducting and training my",0.083434,0.633409,0.79081,0.014253,35
59
+ 2,peers.,0.083837,0.652509,0.05401,0.011117,36
60
+ 2,"In relation to the fundraising and budgeting aspect of the role, I have experience of raising",0.08424,0.68244,0.798065,0.014253,37
61
+ 2,"substantial amounts of money through several successful charity events, including a well -",0.083031,0.698404,0.802096,0.014538,38
62
+ 2,attended fashion show. I was also elected Treasurer of NU Sociology Society with,0.083434,0.715222,0.728335,0.014253,39
63
+ 2,responsibility for managing a budget of £3000.,0.083434,0.731471,0.411528,0.014538,40
64
+ 2,The necessity to travel to identify community issues only adds to the appeal of the position. I,0.083031,0.758837,0.82104,0.014253,41
65
+ 2,"enjoy driving, hold a full clean driving licence and I am very interested in relocating to London",0.083434,0.775086,0.828295,0.014538,42
66
+ 2,to work for UK Health Trust.,0.083031,0.791619,0.247481,0.011688,43
67
+ 2,Thank you for considering my application. I look forward to hearing from you.,0.083434,0.824401,0.68158,0.014253,44
68
+ 2,Yours sincerely,0.082628,0.857184,0.138251,0.014253,45
69
+ 2,Rachel Sullivan,0.083837,0.889966,0.137042,0.011403,46
70
+ 3,SisterCities,0.169804,0.033333,0.238431,0.028182,1
71
+ 3,Partnership Agreement,0.516078,0.027879,0.440784,0.032424,2
72
+ 3,INTERNATIONAL,0.170196,0.06697,0.237647,0.008788,3
73
+ 3,Connect globally. Thrive locally.,0.169804,0.08697,0.238824,0.01303,4
74
+ 3,Toolkit,0.830588,0.07303,0.126667,0.025152,5
75
+ 3,Types of Affiliations,0.117255,0.157576,0.241961,0.02,6
76
+ 3,Sister City Relationship,0.117647,0.187273,0.196863,0.013939,7
77
+ 3,"A Sister City relationship is formed when the mayor or highest elected official (or, if elections",0.117255,0.211212,0.738824,0.013636,8
78
+ 3,"do not take place, highest appointed official) from a U.S. community and a community in",0.117647,0.227273,0.70902,0.013939,9
79
+ 3,another country or territory sign a formal agreement on behalf of their communities endorsing a,0.117647,0.243636,0.761961,0.013636,10
80
+ 3,"""sister city/sister cities"" relationship. Sister city agreements shall be considered active/valid",0.118039,0.259697,0.731373,0.013939,11
81
+ 3,unless otherwise indicated by one or both of the respective communities.,0.118039,0.276061,0.58549,0.013636,12
82
+ 3,Sister Cities International shall formally recognize only those relationships by cities/members in,0.118039,0.299697,0.758824,0.013636,13
83
+ 3,good standing (i.e. who are current on membership dues) in its Membership Directory or on its,0.117647,0.316061,0.754902,0.013636,14
84
+ 3,"website. However, Sister Cities International shall not assert as invalid or otherwise impugn the",0.116863,0.332121,0.760784,0.013636,15
85
+ 3,legitimacy of those relationships formed by non-members.,0.118039,0.348485,0.466275,0.013636,16
86
+ 3,Friendship City,0.118039,0.372121,0.127059,0.013939,17
87
+ 3,"A Friendship City or Friendship Cities relationship is often formed by cities as a ""stepping",0.117255,0.395758,0.714118,0.013636,18
88
+ 3,"stone"" to a more formal ""Sister City"" agreement. Typically Friendship City agreements are",0.117647,0.411515,0.720392,0.014242,19
89
+ 3,referred to as such in the formal documents that are signed. Sister Cities International shall,0.118039,0.428182,0.72549,0.013636,20
90
+ 3,recognize Friendship City relationships by members in its Membership Directory and website.,0.118039,0.444242,0.747843,0.013636,21
91
+ 3,As per Sister Cities International Board of Directors:,0.117255,0.467879,0.413333,0.013636,22
92
+ 3,Sister Cities International will recognize a new sister cities affiliation between a,0.169412,0.492121,0.626667,0.013333,23
93
+ 3,"U.S. and an international community, even though another affiliation may exist",0.169412,0.507879,0.625098,0.013636,24
94
+ 3,"between that international community and a different U.S. community, only if a",0.169412,0.524545,0.62902,0.013636,25
95
+ 3,cooperative agreement among all involved communities is filed with Sister Cities,0.16902,0.540606,0.643137,0.013636,26
96
+ 3,"International. If a cooperative agreement is denied, or no response to the request",0.170196,0.556667,0.647843,0.013333,27
97
+ 3,"is received within a reasonable amount of time, Sister Cities International will",0.169412,0.57303,0.612157,0.012727,28
98
+ 3,recognize the partnership as a friendship city and it will be delineated as such,0.169412,0.589091,0.621176,0.013636,29
99
+ 3,with a symbol in the membership directories.,0.168627,0.605455,0.358824,0.013333,30
100
+ 3,The cooperative agreement must be sent by the Mayor/County,0.168627,0.628788,0.509412,0.013939,31
101
+ 3,"Executive/Governor of the requesting community, and must be sent to the",0.169804,0.645152,0.595294,0.014242,32
102
+ 3,Mayor/County Executive/Governor of each of the existing partnership,0.169804,0.661212,0.555294,0.013636,33
103
+ 3,communities. Although the Mayor/County Executive/Governor may request input,0.16902,0.677879,0.647451,0.013636,34
104
+ 3,"from, or may be given input by, the sister cities program, it is up to the discretion",0.168627,0.693939,0.647059,0.013939,35
105
+ 3,of the Mayor/County Executive/Governor to sign the cooperative agreement.,0.16902,0.709697,0.612941,0.013939,36
106
+ 3,Although Sister Cities International will help with the cooperative agreement,0.168627,0.726364,0.605882,0.013636,37
107
+ 3,"process, it is up to the requesting community to get the agreement signed. Sister",0.169412,0.742121,0.650196,0.013939,38
108
+ 3,"Cities International will not, in any way, force a community to ""share"" and sign",0.16902,0.758182,0.623922,0.014242,39
109
+ 3,the cooperative agreement.,0.168627,0.774848,0.219216,0.013333,40
110
+ 3,"To place a relationship into Emeritus status, the mayor or highest elected official of the U.S.",0.117255,0.798485,0.736471,0.013939,41
111
+ 3,community must write a letter to the mayor of the foreign city indicating that they wish to,0.118039,0.814545,0.70902,0.013636,42
112
+ 3,"remain sister cities, but understand that the relationship will remain inactive until such time as",0.118039,0.831212,0.747451,0.013333,43
113
+ 3,both cities are able to sustain an active relationship. Sister Cities International should be,0.118039,0.847273,0.705098,0.013636,44
114
+ 3,informed in writing by the mayor of the U.S. city of the situation. Sister Cities International will,0.118039,0.863333,0.746275,0.013636,45
115
+ 4,SisterCities,0.169804,0.033333,0.238824,0.028182,1
116
+ 4,Partnership Agreement,0.516078,0.027879,0.440784,0.032424,2
117
+ 4,INTERNATIONAL,0.170196,0.06697,0.237647,0.008788,3
118
+ 4,Connect globally. Thrive locally.,0.169804,0.08697,0.239216,0.01303,4
119
+ 4,Toolkit,0.83098,0.072727,0.127059,0.025455,5
120
+ 4,then place the partnership into Emeritus Status and will reflect this status in directories and all,0.117255,0.132424,0.751373,0.013333,6
121
+ 4,lists of sister city programs.,0.118039,0.148788,0.218431,0.013333,7
122
+ 4,"If a community wishes to terminate a sister city relationship, then a letter from the mayor or",0.118431,0.172424,0.732549,0.013333,8
123
+ 4,highest elected official of the U.S. city should be sent to the mayor of the sister city. Sister,0.118039,0.188485,0.721569,0.013636,9
124
+ 4,Cities International should be informed of this action in writing by the mayor of the U.S. city,0.118039,0.204848,0.72902,0.013333,10
125
+ 4,and Sister Cities International will then remove the partnership from its directories and all lists,0.117647,0.221212,0.746275,0.013333,11
126
+ 4,of sister city programs. We do not recommend terminating a relationship simply because it is,0.117647,0.237273,0.743529,0.013333,12
127
+ 4,"dormant. Many partnerships wax and wane over the years, and in many cases a dormant",0.117647,0.253939,0.713333,0.013333,13
128
+ 4,partnership may be reinvigorated by local members years after it has been inactive.,0.118039,0.269697,0.664314,0.013636,14
129
+ 4,General Guidelines,0.118039,0.295152,0.231765,0.016061,15
130
+ 4,In order for a sister city/county/state partnership to be recognized by Sister Cities International,0.118431,0.324242,0.754902,0.013636,16
131
+ 4,"(SCI), the two communities must sign formal documents which clearly endorse the link. This",0.118039,0.340606,0.74,0.013636,17
132
+ 4,presumes several key items: that the U.S. community is already a member of SCI and has,0.118039,0.35697,0.718039,0.013636,18
133
+ 4,followed proper procedures (e.g. passed a city council resolution declaring the intent to twin,0.117255,0.373333,0.737647,0.013636,19
134
+ 4,with the specific city); that both communities share a mutual commitment to the relationship;,0.117255,0.389394,0.740784,0.013636,20
135
+ 4,and that both have secured the necessary support structure to build a lasting relationship. You,0.117647,0.405455,0.758039,0.013333,21
136
+ 4,should check with your local sister city program to see if they have any additional requirements,0.117647,0.421818,0.760784,0.013636,22
137
+ 4,before pursuing a sister city relationship.,0.118039,0.437879,0.323137,0.013636,23
138
+ 4,"SCI often refers to these agreements as a ""Sister City Agreement"" or ""Memorandum of",0.118039,0.461515,0.696863,0.013939,24
139
+ 4,"Understanding."" However, as the following examples show, the actual name and format of",0.118039,0.477576,0.729804,0.013636,25
140
+ 4,your documents is left up to you.,0.117255,0.494242,0.262745,0.013636,26
141
+ 4,A few things to keep in mind as you draft your agreement:,0.117255,0.517879,0.463137,0.013636,27
142
+ 4,"Your agreement can range from the ceremonial, with language focusing on each city's",0.176471,0.542121,0.69098,0.013939,28
143
+ 4,"commitment to fostering understanding, cooperation, and mutual benefit to the precise,",0.176471,0.558485,0.701961,0.013333,29
144
+ 4,"with particular areas of interest, specific programs/activities, or more concrete goals",0.176078,0.574848,0.673725,0.013636,30
145
+ 4,related to anything from numbers of exchanges to economic development.,0.176863,0.591212,0.596863,0.013636,31
146
+ 4,"Don't try to include everything you plan to do. Some specifics, like particular areas of",0.177255,0.620303,0.681176,0.013939,32
147
+ 4,"interest or participating institutions are good to include. However, there's no need to",0.176471,0.636667,0.675686,0.013636,33
148
+ 4,include all the programs you plan to do if it makes the document too lengthy or limits,0.176863,0.652727,0.678824,0.013939,34
149
+ 4,the scope of projects. This is a formal document to establish the relationship; specific,0.176078,0.668788,0.684706,0.013636,35
150
+ 4,"tasks, responsibilities, or other nuts-and-bolts text related to implementation or",0.176078,0.685455,0.635686,0.013333,36
151
+ 4,administration of the partnership can be expressed more fully in a separate,0.176471,0.701212,0.600392,0.013636,37
152
+ 4,memorandum between the respective sister city committees. Your partnership,0.177255,0.717576,0.626667,0.013636,38
153
+ 4,agreement is a historical document and should not be dated or limited by being aligned,0.176471,0.733636,0.699216,0.013636,39
154
+ 4,with very specific tasks.,0.176078,0.750606,0.190196,0.013333,40
155
+ 4,Work with your counterparts. Remember that this is signed by both cities. You should,0.176078,0.779697,0.68549,0.013636,41
156
+ 4,share drafts of your agreement with your international partners and solicit feedback on,0.176471,0.795758,0.691765,0.013333,42
157
+ 4,what they'd like to see in the agreement. Be flexible to cultural or municipal priorities.,0.176471,0.811818,0.679216,0.013939,43
158
+ 4,Ask your counterparts to translate the agreement if it is drafted in English. It is,0.176078,0.841515,0.623137,0.013636,44
159
+ 4,important for the citizens of your partner community to be able to read and understand,0.176863,0.857576,0.693725,0.013939,45
160
+ 4,the commitment their city has made. Have someone in your own community who,0.176078,0.873939,0.649804,0.013636,46
161
+ 5,SisterCities,0.169804,0.033333,0.239216,0.028182,1
162
+ 5,Partnership Agreement,0.516078,0.027879,0.441176,0.032121,2
163
+ 5,INTERNATIONAL,0.170196,0.06697,0.237255,0.008788,3
164
+ 5,Connect globally. Thrive locally.,0.169804,0.08697,0.239216,0.01303,4
165
+ 5,Toolkit,0.83098,0.07303,0.126667,0.025152,5
166
+ 5,speaks that language check the foreign-language version to make sure it mirrors what,0.176471,0.132424,0.688235,0.013333,6
167
+ 5,you have in your own agreement.,0.176471,0.148788,0.264706,0.013333,7
168
+ 5,Keep it to one page. Ceremonial documents such as these partnership agreements,0.176863,0.178485,0.66549,0.013636,8
169
+ 5,work best if they can be posted in their entirety.,0.176078,0.194545,0.380392,0.013636,9
170
+ 5,Most sister city agreements include some acknowledgement of the founding principles,0.177255,0.224242,0.694902,0.013636,10
171
+ 5,"of the sister city movement- to promote peace through mutual respect, understanding,",0.176471,0.240303,0.698431,0.013333,11
172
+ 5,and cooperation.,0.176471,0.25697,0.13451,0.013333,12
173
+ 5,Consider using official letterhead and/or other embellishments such as city seals or,0.176863,0.286061,0.665882,0.013333,13
174
+ 5,logos to reflect your enhance the document. Sister city agreements are often posted at,0.176863,0.302121,0.695686,0.013636,14
175
+ 5,city hall or other municipal offices and should reflect their historical importance,0.176471,0.318485,0.630588,0.013333,15
176
+ 5,Look at other agreements your city has signed. These agreements may give you an idea,0.177255,0.347879,0.705098,0.013636,16
177
+ 5,"of what is acceptable or possible, and they may be in an easily replicable format. If you",0.176471,0.364242,0.695686,0.013636,17
178
+ 5,"cannot access older agreements please contact Sister Cities International, we may",0.176863,0.380303,0.663137,0.013636,18
179
+ 5,"have them on file, although we do not have copies of all partnership agreements.",0.176863,0.396667,0.64549,0.013636,19
180
+ 5,Documents must be signed by the top elected official of both communities.,0.177255,0.426364,0.601569,0.013333,20
181
+ 5,"Check with your mayor, city council, town clerk, et al. to make sure that the agreement",0.176863,0.455758,0.694118,0.013636,21
182
+ 5,"is OK with them. The mayor is the one putting his or her name on the paper, and you",0.176863,0.471818,0.677255,0.013333,22
183
+ 5,don't want to spend time developing an agreement which will never be signed.,0.176863,0.488182,0.629412,0.013636,23
184
+ 5,Official documents are usually signed during a formal ceremony recognizing the,0.176863,0.517576,0.638431,0.013636,24
185
+ 5,partnership. Be sure both communities receive a signed set of the official documents,0.177255,0.533939,0.683922,0.013636,25
186
+ 5,for their records.,0.176078,0.550606,0.131373,0.010606,26
187
+ 5,Remember to send your signed agreement to Sister Cities International. After we,0.177255,0.579697,0.645098,0.013636,27
188
+ 5,receive your agreement we will post the relationship in the City Directory and make sure,0.176863,0.595758,0.703137,0.013636,28
189
+ 5,it is included in our Annual Membership Directory.,0.176863,0.612121,0.398039,0.013333,29
190
+ 5,Remember that each city's sister city program is independent and can impose requirements,0.118431,0.640606,0.736471,0.013939,30
191
+ 5,"like the establishment of a committee, a review period, sustainability/funding plan, among",0.118039,0.65697,0.715686,0.013636,31
192
+ 5,"others, before sanctioning a sister city agreement. Check with your local program or mayor's",0.117647,0.672727,0.743529,0.014242,32
193
+ 5,office to see if this is the case.,0.117647,0.689091,0.241176,0.011515,33
194
+ 5,On the following pages you'll find a series of partnership agreements to give you an idea of,0.118039,0.717879,0.728627,0.013939,34
195
+ 5,"what is possible. While you should feel free to use some of the formatting and language, we",0.117255,0.734242,0.73451,0.013636,35
196
+ 5,encourage you to make your agreement your own and be creative with what you produce. If,0.117647,0.750606,0.737647,0.013636,36
197
+ 5,you are unsure about your agreement or want advice you can always solicit feedback by,0.117647,0.766667,0.708627,0.013636,37
198
+ 5,sending it to our Membership Director at akaplan@sister-cities.org or contacting us at (202),0.117647,0.782727,0.732157,0.013636,38
199
+ 5,347-8630.,0.117647,0.799394,0.080392,0.010303,39
200
+ 6,SisterCities,0.169412,0.033333,0.239608,0.028485,1
201
+ 6,Partnership Agreement,0.516471,0.027879,0.440784,0.032727,2
202
+ 6,INTERNATIONAL,0.170196,0.066667,0.238431,0.009091,3
203
+ 6,Connect globally. Thrive locally.,0.169412,0.08697,0.239608,0.013333,4
204
+ 6,Toolkit,0.830588,0.072727,0.127843,0.025758,5
205
+ 6,"jull bubzig 2000 3,312",0.378039,0.291212,0.32549,0.019394,6
206
+ 6,ABU DHABI MUNICIPALITY & TOWN PLANNING,0.376471,0.316667,0.327451,0.016667,7
207
+ 6,AN AGREEMENT FOR THE ESTABLISHMENT OF,0.260784,0.373636,0.52549,0.012727,8
208
+ 6,SISTER CITIES RELATIONSHIP,0.337647,0.393636,0.342745,0.012121,9
209
+ 6,BETWEEN,0.454902,0.413636,0.110588,0.011212,10
210
+ 6,THE CITY OF ABU DHABI ( U. A.E),0.337255,0.432727,0.375686,0.013939,11
211
+ 6,AND,0.487843,0.452727,0.048235,0.011212,12
212
+ 6,"HOUSTON, TEXAS ( U.S.A)",0.385882,0.471515,0.298039,0.014848,13
213
+ 6,"The Sister City Program, administered by Sister Cities International, was initiated",0.221961,0.525455,0.597255,0.01303,14
214
+ 6,By the President of the United States of America in 1956 to encourage greater,0.222745,0.539394,0.561961,0.012727,15
215
+ 6,Friendship and understanding between the United States and other nations through,0.222745,0.553333,0.608235,0.012727,16
216
+ 6,Direct personal contact: and,0.222745,0.567576,0.20549,0.012424,17
217
+ 6,"In order to foster those goals, the people of Abu Dhabi and Houston, in a gesture of",0.222353,0.594242,0.603529,0.012424,18
218
+ 6,"Friendship and goodwill, agree to collaborate for the mutual benefit of their",0.222745,0.608182,0.547843,0.01303,19
219
+ 6,"Communities by exploring education, economic and cultural opportunities.",0.222353,0.622121,0.541961,0.012121,20
220
+ 6,"Abu Dhabi and Houston, sharing a common interest in energy, technology and",0.221569,0.648788,0.574118,0.012424,21
221
+ 6,"medicine, and the desire to promote mutual understanding among our citizens do",0.222353,0.66303,0.588235,0.012121,22
222
+ 6,"hereby proclaim themselves Sister Cities beginning on the 13th day of March 2001,",0.221961,0.673636,0.594118,0.015758,23
223
+ 6,the date of Houston City Council resolution estatblishing the Sister City,0.221961,0.690303,0.519608,0.01303,24
224
+ 6,relationship became effective.,0.221569,0.705152,0.217647,0.012424,25
225
+ 6,"Signed on this 26 of October 2002, in duplicate in the Arabic and English",0.221569,0.732121,0.533333,0.01303,26
226
+ 6,"Languages, both text being equally authentic.",0.221961,0.746667,0.328627,0.012727,27
227
+ 6,A,0.344314,0.768485,0.084706,0.030303,28
228
+ 6,Sheikh Mohammed bin Butti AI Hamed,0.245882,0.806364,0.366275,0.010909,29
229
+ 6,Lee P.Brown,0.729412,0.806364,0.118824,0.010303,30
230
+ 6,Chairman of Abu Dhabi Municipality,0.24549,0.823636,0.342353,0.012727,31
231
+ 6,Mayor of Houston,0.704706,0.823333,0.166667,0.012424,32
232
+ 6,&Town Planning,0.324314,0.841212,0.155686,0.012424,33
233
+ 7,SisterCities,0.169412,0.033333,0.239608,0.028485,1
234
+ 7,Partnership Agreement,0.516078,0.027879,0.441176,0.032424,2
235
+ 7,INTERNATIONAL,0.17098,0.066667,0.237255,0.009091,3
236
+ 7,Connect globally. Thrive locally.,0.169412,0.08697,0.239216,0.013333,4
237
+ 7,Toolkit,0.83098,0.072727,0.127059,0.025758,5
238
+ 7,THE CITY OF NEW YORK,0.438824,0.262121,0.240784,0.009697,6
239
+ 7,OFFICE OF THE MAYOR,0.450196,0.27697,0.220392,0.009697,7
240
+ 7,"NEW YORK, N.Y. 10007",0.461176,0.29303,0.196863,0.010303,8
241
+ 7,THE NEW YORK CITY-LONDON SISTER CITY PARTNERSHIP,0.267451,0.355758,0.582745,0.011818,9
242
+ 7,Memorandum of Understanding,0.420392,0.371212,0.274902,0.013333,10
243
+ 7,The Sister City partnership between New York City and London will foster mutually,0.201176,0.402121,0.674118,0.014242,11
244
+ 7,beneficial solutions to common challenges for these two great cosmopolitan entities.,0.201176,0.417273,0.66902,0.013636,12
245
+ 7,"Consequently, the Sister City relationship between the two will be one of the most",0.201176,0.432727,0.652549,0.015152,13
246
+ 7,"important in their network of global partnerships, as it strives to:",0.201176,0.448182,0.50902,0.015455,14
247
+ 7,Encourage and publicize existing exchanges between London and New York City so,0.230588,0.480303,0.671373,0.015152,15
248
+ 7,that they can flourish to benefit a wider cross-section of the citizens of both;,0.230588,0.496061,0.602353,0.015152,16
249
+ 7,"Support and promote the development of new social, economic, academic and",0.230196,0.512424,0.618431,0.015455,17
250
+ 7,community programs to encourage both cities' citizens to share their experiences as a,0.229804,0.527879,0.678039,0.014848,18
251
+ 7,medium for learning from one another;,0.229804,0.543636,0.309412,0.013939,19
252
+ 7,Generate an improvement of the operation of the cities' various government agencies,0.229804,0.56,0.676078,0.014545,20
253
+ 7,by serving as a conduit of information;,0.22902,0.575758,0.307843,0.014848,21
254
+ 7,"Identify themes, common to both, that can generate new initiatives to further and",0.229412,0.591818,0.640784,0.015152,22
255
+ 7,"nurture the increasingly powerful financial, social and cultural relationships between",0.22902,0.607576,0.671373,0.014242,23
256
+ 7,the cities;,0.22902,0.624545,0.076471,0.012424,24
257
+ 7,Promote key mayoral priorities relevant to both London and New York City;,0.228627,0.639394,0.608627,0.015152,25
258
+ 7,Provide financial or in kind support to community-led programs that advance the,0.228627,0.656061,0.641569,0.013636,26
259
+ 7,aims of the Sister City partnership;,0.22902,0.672121,0.275294,0.013636,27
260
+ 7,"With the above purposes in mind, the Mayor of the City of New York and the Mayor of",0.198824,0.702424,0.697647,0.014848,28
261
+ 7,London solemnly confirm that these two cities are united by an official partnership by the,0.198824,0.718182,0.710196,0.014545,29
262
+ 7,protocol of this Memorandum of Understanding.,0.198431,0.733939,0.384314,0.015152,30
263
+ 7,This agreement will go into effect from the date of signatures.,0.310196,0.780606,0.488235,0.014545,31
264
+ 7,Signed in March of 2001,0.455686,0.796364,0.19451,0.013636,32
265
+ 7,Thedder Rudolph W. Giuliani,0.178824,0.795455,0.244314,0.100909,33
266
+ 7,Mayor,0.311373,0.894848,0.053333,0.012727,34
267
+ 7,Ken Mayor Livingstone,0.672157,0.877576,0.132941,0.029091,35
268
+ 7,New York City,0.287843,0.909091,0.121176,0.013333,36
269
+ 7,London,0.701961,0.909091,0.061569,0.010606,37
270
+ 8,SisterCities,0.169412,0.03303,0.24,0.028182,1
271
+ 8,Partnership Agreement,0.515686,0.027576,0.441961,0.03303,2
272
+ 8,INTERNATIONAL,0.169804,0.066667,0.238431,0.009091,3
273
+ 8,Connect globally. Thrive locally.,0.169412,0.08697,0.239608,0.013333,4
274
+ 8,Toolkit,0.83098,0.072727,0.127451,0.025758,5
275
+ 8,CHIC OF STATE,0.247451,0.190606,0.141961,0.036364,6
276
+ 8,City of Long Beach,0.388627,0.196667,0.476471,0.066364,7
277
+ 8,California,0.551373,0.257273,0.136471,0.033333,8
278
+ 8,Sister City Agreement,0.321961,0.305455,0.378431,0.035152,9
279
+ 8,between the,0.464706,0.352727,0.084314,0.009697,10
280
+ 8,City of Long Beach,0.38,0.378485,0.252549,0.01697,11
281
+ 8,"California, USA",0.4,0.397576,0.21098,0.016061,12
282
+ 8,and the,0.48,0.415152,0.053333,0.009091,13
283
+ 8,City of San Pablo de Manta,0.321569,0.428788,0.369804,0.01697,14
284
+ 8,"Ecuador, South America",0.347451,0.447879,0.317255,0.015152,15
285
+ 8,"In accordance with the authorization and approval expressed by the City of Long Beach,",0.261569,0.482121,0.536863,0.012121,16
286
+ 8,"California, USA, and the City of San Pablo de Manta, Ecundor, South America, it is declared",0.217647,0.492727,0.581176,0.01303,17
287
+ 8,"that a ""Sister City Agreement between the two cities is hereby established for the following",0.217647,0.502727,0.581569,0.012121,18
288
+ 8,purposes:,0.216863,0.516061,0.058039,0.009394,19
289
+ 8,(1) to promote and expand the effective and mutually beneficial cooperation between,0.278824,0.532727,0.520392,0.012424,20
290
+ 8,the people of Long Beach and the people of San Pablo de Manta; and,0.218039,0.543636,0.40549,0.012424,21
291
+ 8,"(2) to promote international goodwill, understanding, and expanded business",0.279216,0.56303,0.520784,0.012424,22
292
+ 8,"relations between the two cities and their respective nations by the exchange of people, ideas, and",0.218039,0.573636,0.581569,0.012121,23
293
+ 8,"information in a unide variety of economic, social, cultural, municipal, environmental,",0.218039,0.584242,0.581176,0.012121,24
294
+ 8,"professional, technical, youth, and other endeavors; and",0.217647,0.594848,0.333333,0.012121,25
295
+ 8,"(3) to foster and encourage charitable, scientific, trade and commerce, literary and",0.279608,0.613939,0.520784,0.012727,26
296
+ 8,educational activities between the two cities;,0.218039,0.625455,0.265882,0.009697,27
297
+ 8,This Sister City Agreement shall be officially established and shall become effective when,0.263137,0.644545,0.536863,0.012727,28
298
+ 8,"this document has been duly executed by the Mayor of Long Beach, California, USA, and the",0.218824,0.654848,0.581961,0.012424,29
299
+ 8,"Mayor of San Pablo de Manta, Ecundor, South America.",0.218431,0.665758,0.338824,0.012121,30
300
+ 8,STATE OFFICE,0.276471,0.713636,0.050588,0.048788,31
301
+ 8,Beverly 0 Neill,0.587451,0.736667,0.121961,0.013636,32
302
+ 8,"Mayor, City of Long Beach",0.542353,0.751212,0.21098,0.013636,33
303
+ 8,"California, USA",0.582745,0.765758,0.125098,0.01303,34
304
+ 8,10.2aulus,0.490588,0.771818,0.220392,0.062424,35
305
+ 8,Ing. Jorge O. Zambrano Cedeño,0.527059,0.825152,0.242745,0.013333,36
306
+ 8,"Mayor, City of San Pablo de Manta",0.505098,0.839394,0.277647,0.013636,37
307
+ 8,"Ecuador, South America",0.551765,0.854242,0.188235,0.011818,38
308
+ 8,"Dated: September 19, 2000",0.544706,0.883333,0.202745,0.01303,39
309
+ 9,SisterCities,0.169412,0.03303,0.24,0.028485,1
310
+ 9,Partnership Agreement,0.516078,0.027879,0.441176,0.032424,2
311
+ 9,INTERNATIONAL,0.170196,0.066667,0.237647,0.009091,3
312
+ 9,Connect globally. Thrive locally.,0.169412,0.08697,0.239216,0.013333,4
313
+ 9,Toolkit,0.83098,0.072727,0.127451,0.025758,5
314
+ 9,REAFFIRMATION OF SISTER CITIES DECLARATION,0.324706,0.165152,0.483529,0.013939,6
315
+ 9,adopted by,0.2,0.213333,0.080392,0.013636,7
316
+ 9,THE HONORABLE RICHARD M. DALEY,0.396078,0.214242,0.335686,0.012424,8
317
+ 9,MAYOR OF CHICAGO,0.472549,0.231212,0.18549,0.011515,9
318
+ 9,and,0.199608,0.260909,0.026275,0.010606,10
319
+ 9,THE HONORABLE ZHANG RONGMAO,0.401961,0.261212,0.323137,0.011212,11
320
+ 9,MAYOR OF SHENYANG,0.463529,0.273636,0.202353,0.011212,12
321
+ 9,ON,0.551765,0.298182,0.026667,0.011515,13
322
+ 9,"JUNE 5, 1995",0.500392,0.323636,0.128235,0.014848,14
323
+ 9,"On this the tenth anniversary of the signing of a sister city agreement, in order to further",0.255686,0.36303,0.67098,0.015152,15
324
+ 9,the traditional links of friendship between Chicago and Shenyang and to reaffirm their mutual,0.198824,0.378788,0.727843,0.015455,16
325
+ 9,"aspiration to work in unison for the benefit of their cities and nations, the Honorable Mayor",0.199608,0.394848,0.727843,0.014848,17
326
+ 9,"Richard M. Daley, Mayor of the City of Chicago, and the Honorable Zhang Rongmao, Mayor",0.199216,0.411212,0.727451,0.014242,18
327
+ 9,"of the City of Shenyang, on this fifth day of June 1995, do hereby acknowledge and reaffirm the",0.199216,0.42697,0.72549,0.014848,19
328
+ 9,sister cities agreement between the City of Chicago and the City of Shenyang.,0.199608,0.443636,0.57451,0.014242,20
329
+ 9,"The City of Chicago and the City of Shenyang on the basis of friendly cooperation,",0.256078,0.473939,0.665098,0.015152,21
330
+ 9,equality and mutual benefit will continue to develop a sister cities relationship to promote and,0.2,0.490303,0.724706,0.014242,22
331
+ 9,broaden economic cooperation and cultural exchanges between the two cities.,0.199216,0.506061,0.57451,0.014242,23
332
+ 9,The two cities do hereby declare their interest in exploring the establishment of business,0.255294,0.537273,0.668235,0.015455,24
333
+ 9,and trade relations between Chicago and Shenyang.,0.198824,0.554545,0.387843,0.013636,25
334
+ 9,"In addition, exchanges will be promoted in the area of the arts such as exhibits, music,",0.254118,0.583939,0.666667,0.015455,26
335
+ 9,dance and other cultural activities.,0.198431,0.601212,0.256471,0.010606,27
336
+ 9,"In addition, exchanges will be promoted in education and the establishment of contacts",0.254118,0.630303,0.668627,0.015758,28
337
+ 9,within educational institutions encouraged.,0.198824,0.647273,0.32,0.014242,29
338
+ 9,"In addition, we declare our intention to promote exchanges in such fields as science and",0.253725,0.678182,0.668627,0.014848,30
339
+ 9,"technology, sports, health, youth and any areas that will contribute to the prosperity and the",0.198039,0.693636,0.722745,0.015152,31
340
+ 9,further development of friendship between the people of our two cities.,0.194902,0.711515,0.525098,0.013636,32
341
+ 9,3h.5.,0.593725,0.750606,0.218039,0.06303,33
342
+ 9,THE HONORABLE RICHARD M. DALEY,0.197255,0.821515,0.303529,0.010606,34
343
+ 9,THE HONORABLE ZHANG RONGMAO,0.588627,0.819394,0.287843,0.011818,35
344
+ 9,MAYOR OF CHICAGO,0.195686,0.835758,0.164706,0.010606,36
345
+ 9,MAYOR OF SHENYANG,0.587451,0.835455,0.177647,0.010303,37
346
+ 10,Skills_based_CV.qxd 5/8/11 3:55 pm Page,0.17777,0.135381,0.308796,0.008545,1
347
+ 10,agcas,0.726169,0.191722,0.053368,0.011749,2
348
+ 10,Example of a skills-based CV,0.3894,0.205874,0.224144,0.011482,3
349
+ 10,ASHLEY GILL,0.459698,0.246195,0.082812,0.008278,4
350
+ 10,3 Lappage Court,0.2212,0.259012,0.080972,0.008545,5
351
+ 10,Telephone: 01882 652349,0.592565,0.259012,0.129555,0.008278,6
352
+ 10,"Tyler Green, Bucks.",0.220464,0.269159,0.092381,0.008278,7
353
+ 10,Mobile: 07717 121824,0.593669,0.269159,0.112992,0.006676,8
354
+ 10,HP8 4JD,0.2212,0.279306,0.040486,0.006409,9
355
+ 10,Email: ashleygill2023@gotmail.com,0.594038,0.279039,0.178874,0.008545,10
356
+ 10,Personal Details,0.221568,0.299332,0.095326,0.007744,11
357
+ 10,Summary,0.220832,0.321495,0.048215,0.008278,12
358
+ 10,Business studies with Spanish undergraduate.,0.273463,0.340988,0.229297,0.008812,13
359
+ 10,Ability to speak French and Spanish.,0.272727,0.351135,0.179242,0.008545,14
360
+ 10,Extensive business experience including an internship with Top Choice Holidays.,0.273095,0.361015,0.398233,0.008812,15
361
+ 10,Education And Qualifications,0.2212,0.381041,0.144277,0.008278,16
362
+ 10,2008 present,0.220832,0.401602,0.074715,0.008011,17
363
+ 10,Buckinghamshire Edge University,0.386824,0.401068,0.167096,0.008545,18
364
+ 10,BA International Business Studies with Spanish (expected 2:1),0.386824,0.410681,0.308796,0.008812,19
365
+ 10,Relate your degree to,0.230033,0.420027,0.100847,0.008278,20
366
+ 10,Study semester at The University of Valloid (Spain).,0.399338,0.420828,0.252852,0.008812,21
367
+ 10,the job by listing your,0.229665,0.429105,0.101583,0.008278,22
368
+ 10,Six-month work placement in Madrid.,0.399338,0.431242,0.188811,0.008545,23
369
+ 10,relevant modules/,0.230033,0.438718,0.085388,0.007744,24
370
+ 10,Relevant modules included: Business Planning; Sales Promotion and,0.399338,0.441389,0.338241,0.008545,25
371
+ 10,dissertation.,0.230033,0.448064,0.057784,0.006676,26
372
+ 10,Marketing; and Business Operations Management.,0.398969,0.451268,0.25322,0.008812,27
373
+ 10,2000 2007,0.2212,0.467824,0.061833,0.006409,28
374
+ 10,Freebridge School,0.386824,0.46729,0.087965,0.008545,29
375
+ 10,"A-Levels: Business Studies (B), French (C)",0.386088,0.476903,0.200221,0.008812,30
376
+ 10,"8 GCSEs including Maths, English, Spanish and French",0.386824,0.487583,0.266838,0.008545,31
377
+ 10,Work History,0.220832,0.509212,0.065513,0.008278,32
378
+ 10,2008 2011,0.220832,0.529506,0.061833,0.006409,33
379
+ 10,Buckinghamshire Edge University Librarian/tour guide,0.386824,0.528972,0.277144,0.008812,34
380
+ 10,General administrative and customer service roles.,0.399338,0.539119,0.25138,0.006676,35
381
+ 10,Briefly list,0.707766,0.536716,0.045639,0.008011,36
382
+ 10,your relevant,0.70703,0.546061,0.061465,0.008011,37
383
+ 10,2011 (Feb-Aug),0.2212,0.55514,0.078027,0.008812,38
384
+ 10,Audigest S.A. (Madrid) - Audit Assistant,0.386456,0.554873,0.199485,0.009079,39
385
+ 10,duties.,0.707398,0.555674,0.030916,0.006409,40
386
+ 10,Six months' work experience in an international bank.,0.399338,0.565287,0.267575,0.008545,41
387
+ 10,Liaising with colleagues and clients in English and Spanish.,0.399338,0.575434,0.292602,0.008545,42
388
+ 10,2010 (June-Dec),0.220832,0.591188,0.082444,0.008278,43
389
+ 10,Finsbury's supermarket (Hazelbridge) — Supervisor,0.386824,0.591188,0.250644,0.008812,44
390
+ 10,Managing a small team.,0.398969,0.601602,0.121089,0.008545,45
391
+ 10,Customer service in a busy competitive environment.,0.398969,0.611215,0.264262,0.008545,46
392
+ 10,2010 (Jan-Aug),0.2212,0.627236,0.077291,0.008812,47
393
+ 10,Top Choice Holidays and Flights Ltd (Low Wycombe),0.386088,0.627503,0.257637,0.008812,48
394
+ 10,Financial Assistant/Supervisor,0.386824,0.637383,0.15127,0.008812,49
395
+ 10,Working in a range of teams to manage complex financial processes.,0.398969,0.64753,0.341921,0.008812,50
396
+ 10,2007 (Jul-Aug),0.220832,0.663284,0.074347,0.008812,51
397
+ 10,Dogs Protection League - General Assistant,0.386824,0.663818,0.216783,0.008812,52
398
+ 10,Dealing with enquiries and selling packages to a range of clients.,0.399706,0.673431,0.321678,0.009079,53
399
+ 10,2006 (Jan-Dec),0.220832,0.689453,0.076187,0.009079,54
400
+ 10,McHenry's Restaurant (Low Wycombe) - Supervisor,0.386456,0.68972,0.256533,0.009079,55
401
+ 10,Voluntary Experience,0.220464,0.708411,0.106367,0.008545,56
402
+ 10,2007/2011,0.220832,0.728438,0.055208,0.008011,57
403
+ 10,Teaching English in Mexico/Spain,0.386088,0.727904,0.167832,0.009079,58
404
+ 10,Interests,0.2212,0.748465,0.043062,0.006676,59
405
+ 10,Active member of University Business Club — Winner of the 'Bucks Best Business Pitch' award in 2010 Enterprise,0.220464,0.768224,0.556864,0.009079,60
406
+ 10,"week, judged by Michael Eavis.",0.220464,0.778104,0.15311,0.008812,61
407
+ 11,Skills_based_CV.qxd 5/8/11 3:55 pm Page,0.17777,0.135381,0.308428,0.008545,1
408
+ 11,Make sure you carefully assess,0.468531,0.23498,0.142068,0.008011,2
409
+ 11,Skills And Achievements,0.220832,0.245394,0.121457,0.006676,3
410
+ 11,the job advert/job description,0.468163,0.244326,0.139124,0.008278,4
411
+ 11,and address all the skills they,0.468531,0.253672,0.13618,0.008278,5
412
+ 11,Effective communication,0.2212,0.265421,0.123298,0.006676,6
413
+ 11,require.,0.468531,0.263017,0.034965,0.008011,7
414
+ 11,"Able to communicate effectively with a wide range of clients and colleagues, by showing interest, carefully",0.233714,0.275567,0.530364,0.008545,8
415
+ 11,"listening to needs and appropriately adjusting my message, as demonstrated during my time at Finsbury's",0.23445,0.285447,0.528892,0.008812,9
416
+ 11,Supermarket.,0.234082,0.295861,0.066618,0.008278,10
417
+ 11,Strong presentation skills and confidence demonstrated by experience of delivering presentations in different,0.23445,0.305474,0.543614,0.008812,11
418
+ 11,languages to groups of five to fifty.,0.234082,0.315621,0.172617,0.008812,12
419
+ 11,Customer service,0.220832,0.335915,0.085388,0.006676,13
420
+ 11,Ability to quickly build rapport with customers and calmly deal with any problems as shown during my retail,0.233714,0.345527,0.541038,0.008812,14
421
+ 11,experience in high pressure environments.,0.234082,0.355941,0.210526,0.008278,15
422
+ 11,"Capacity to maintain professional relationships through email and other written correspondence, for example,",0.234082,0.365554,0.548767,0.008812,16
423
+ 11,"at Audigest in Madrid, where I built longstanding business relationships with customers and colleagues across",0.233714,0.375701,0.549871,0.008812,17
424
+ 11,the globe.,0.233714,0.385848,0.049687,0.008278,18
425
+ 11,Teamwork,0.220464,0.406142,0.052632,0.006409,19
426
+ 11,"At Top Choice Holidays demonstrated excellent teamwork skills in a busy financial environment, such as an",0.233346,0.415754,0.532573,0.008812,20
427
+ 11,"ability to listen to clients and managers, perform my role to a high level and support colleagues, resulting in",0.234082,0.425634,0.535885,0.008812,21
428
+ 11,promotion.,0.234082,0.436048,0.05484,0.008545,22
429
+ 11,Administration,0.220464,0.456075,0.075083,0.006409,23
430
+ 11,Prove you have each of the,0.639676,0.453672,0.123666,0.008278,24
431
+ 11,"Excellent ability to plan ahead and manage time effectively, for example,",0.23445,0.465688,0.360692,0.008812,25
432
+ 11,skills required by outlining,0.63894,0.463017,0.12293,0.008278,26
433
+ 11,managing complex roles during my internship at Top Choice Holidays.,0.23445,0.476101,0.346338,0.008545,27
434
+ 11,where you performed them,0.63894,0.472363,0.128082,0.008278,28
435
+ 11,Gathered data from a wide range of sources during my dissertation,0.234082,0.485714,0.334928,0.008812,29
436
+ 11,and how you performed,0.639308,0.481709,0.111888,0.008278,30
437
+ 11,them well.,0.63894,0.491055,0.048951,0.006409,31
438
+ 11,"whilst balancing my other studies and two jobs, resulting in a 73% grade.",0.233346,0.495861,0.365109,0.008812,32
439
+ 11,Experience of travellers' needs,0.2212,0.515888,0.150534,0.008545,33
440
+ 11,Recent travel consultancy experience gives me an in-depth understanding of the expectations of holiday,0.23445,0.525768,0.518955,0.008812,34
441
+ 11,customers and the competitive nature of the industry.,0.234082,0.535915,0.269047,0.008812,35
442
+ 11,International travel experience and language ability give me an empathy with travellers and a passion for,0.234082,0.545794,0.524107,0.008812,36
443
+ 11,helping them find a unique holiday experience.,0.234082,0.555941,0.23445,0.008812,37
444
+ 11,Initiative,0.2212,0.576235,0.044166,0.006676,38
445
+ 11,Self-funding an evening course in bookkeeping during my first accountancy role demonstrated my ability to,0.234082,0.585848,0.535149,0.008812,39
446
+ 11,plan ahead and take control of my career.,0.23445,0.595995,0.205006,0.008545,40
447
+ 11,Successful study and work in Spain and Mexico show that I can creatively develop my skills and experience and,0.234082,0.605874,0.551711,0.008545,41
448
+ 11,adapt to new and different environments.,0.234082,0.616288,0.208686,0.008278,42
449
+ 11,Sales knowledge,0.220464,0.636315,0.083916,0.008011,43
450
+ 11,Wide experience of financial roles gives me an awareness of the tight monetary pressures which drive UK,0.234082,0.645928,0.525212,0.009346,44
451
+ 11,service industries.,0.234082,0.656609,0.088333,0.006943,45
452
+ 11,Raised sales at The Dogs Protection League by 12% by up selling add-on packages to new and existing,0.23445,0.665955,0.505705,0.009079,46
453
+ 11,customers.,0.234082,0.67717,0.054472,0.006142,47
454
+ 11,Language ability,0.2212,0.696395,0.082444,0.008812,48
455
+ 11,"Spanish fluency obtained working overseas, French semi-fluent.",0.233714,0.706008,0.323151,0.009079,49
456
+ 11,Referees,0.2212,0.726569,0.041958,0.006676,50
457
+ 11,Include all your referee details including their email and,0.351859,0.722029,0.259109,0.008545,51
458
+ 11,phone number (but ask for their permission first).,0.352227,0.731108,0.230401,0.008545,52
459
+ 11,"Professional: Mr. Jose Andreas, Management Accountant, Audigest, Avenida de Concha Espina 2, Madrid, ES-",0.2212,0.746328,0.537725,0.008812,53
460
+ 11,"28036, +34 91 398 5476, j.andreas@audigest.es",0.2212,0.756475,0.238498,0.008278,54
461
+ 11,"Academic: Dr. Jane Luffle, Personal Tutor, Buckinghamshire Edge University, Due Road, Low Wycombe, Bucks,",0.220464,0.776502,0.536621,0.008812,55
462
+ 11,"HD15 3DL, 01628 435 6784, j.luffle@bedge.ac.uk",0.2212,0.786382,0.244755,0.008545,56
463
+ 12,5-Point Networking Email,0.404314,0.050606,0.189804,0.012121,1
464
+ 12,"Steve Dalton, the author of The 2-Hour Job Search believes the perfect networking email is a ""5-Point E-mail"". The five",0.058824,0.086061,0.859608,0.012727,2
465
+ 12,points are as follows:,0.059216,0.10303,0.152941,0.012727,3
466
+ 12,1. 100 words or less,0.088627,0.136667,0.156078,0.010303,4
467
+ 12,2. No mention of jobs (in subject or body),0.088235,0.153333,0.31451,0.012727,5
468
+ 12,"3. Connection goes first (e.g., ND connection)",0.087843,0.170606,0.341569,0.01303,6
469
+ 12,4. Generalize your interest,0.087843,0.187879,0.205098,0.012424,7
470
+ 12,5. Maintain control of the follow up,0.088627,0.204545,0.27098,0.012727,8
471
+ 12,Here's an example of what a 5-Point email would look like:,0.059608,0.255455,0.42549,0.012727,9
472
+ 12,Subject: Notre Dame MBA Student Seeking Your Advice,0.117255,0.289394,0.414118,0.012424,10
473
+ 12,"Dear Mr. Jones,",0.118039,0.323939,0.112549,0.011515,11
474
+ 12,"My name is Brooke Franklin, and I'm a first-year Notre Dame MBA student who found your",0.118431,0.35697,0.661569,0.01303,12
475
+ 12,information in the ND alumni database. May I have 15 minutes of your time to ask you about,0.118039,0.374242,0.677255,0.012727,13
476
+ 12,your experience with IBM? I'm trying to learn more about marketing careers at technology,0.117255,0.391212,0.660784,0.01303,14
477
+ 12,companies and your insights would be very helpful.,0.117647,0.407879,0.373333,0.01303,15
478
+ 12,"I realize this may be a busy time for you, so if we're unable to connect this week, I'll try again",0.118039,0.442121,0.674902,0.012727,16
479
+ 12,next week to see whether that is more convenient.,0.118039,0.459091,0.370588,0.010303,17
480
+ 12,"Thank you for your time,",0.117255,0.492727,0.179216,0.012727,18
481
+ 12,Brooke,0.118431,0.51,0.050588,0.01,19
482
+ 12,The most important part of this email may be the follow-up; an email like this allows you to reach out again in a week if,0.058431,0.543333,0.872157,0.01303,20
483
+ 12,you haven't heard back without feeling like you're bothering the person at the other end. If you don't hear anything,0.058431,0.560606,0.843922,0.01303,21
484
+ 12,"after the second attempt, you can probably cross him/her off your list and move on to the next contact.",0.058824,0.577273,0.755686,0.01303,22
485
+ 13,36 Westmoreland Drive,0.705764,0.026796,0.209996,0.011403,1
486
+ 13,Newcastle upon Tyne,0.723499,0.04333,0.192664,0.013968,2
487
+ 13,NE1 8LT,0.836759,0.059863,0.079807,0.011117,3
488
+ 13,Mr Mark Wilson,0.083837,0.076112,0.138251,0.011403,4
489
+ 13,UK Health Trust,0.083837,0.09236,0.143087,0.011403,5
490
+ 13,18 Whitehall Square,0.084643,0.108609,0.179766,0.013968,6
491
+ 13,London,0.083837,0.125428,0.066102,0.011117,7
492
+ 13,SW1 9LT,0.083837,0.141391,0.083031,0.011403,8
493
+ 13,11th January 2015,0.755744,0.154789,0.161225,0.017389,9
494
+ 13,Dear Mr Wilson,0.083837,0.174173,0.137042,0.011403,10
495
+ 13,Re: Community Health Development Officer [HD/12/2014],0.083837,0.201539,0.544135,0.014253,11
496
+ 13,"I am writing to apply for the above post, as advertised on the Health UK recruitment site. I am",0.08424,0.228905,0.828295,0.014253,12
497
+ 13,a sociology graduate with a 2: 1from Newcastle University. I have relevant health awareness,0.083434,0.245439,0.822249,0.014253,13
498
+ 13,"experience, and I am looking for a position where I can employ my knowledge and skills in",0.083434,0.261973,0.802499,0.013968,14
499
+ 13,support of health and community development. I enclose my CV for your attention.,0.083434,0.277936,0.731963,0.014253,15
500
+ 13,I am eager to work for UK Health Trust because of your ground-breaking work within the field,0.08424,0.305302,0.825877,0.014253,16
501
+ 13,of community health. I became aware of the work of the Trust when carrying out my,0.083434,0.322121,0.744055,0.013968,17
502
+ 13,"dissertation, 'Generational Change in Local Health Awareness, where I researched health",0.083031,0.338084,0.798468,0.014253,18
503
+ 13,awareness of children and elderly people in a deprived location. I referred to a number of,0.083031,0.354618,0.792019,0.013968,19
504
+ 13,publications produced by UK Health Trust and was impressed by the innovative techniques,0.083837,0.371152,0.809351,0.013968,20
505
+ 13,your organisation uses to engage local community members in projects. The Community,0.083031,0.387685,0.788795,0.014253,21
506
+ 13,Health Development Officer position would further develop my existing abilities and my,0.08424,0.403934,0.771463,0.014253,22
507
+ 13,"understanding of community development, allowing me to contribute in a practical way to",0.083837,0.420468,0.789601,0.013968,23
508
+ 13,enhancing the health of disadvantaged people.,0.083434,0.436716,0.415961,0.013968,24
509
+ 13,The volunteer development aspect of the position particularly appeals to me. I have worked,0.083031,0.469213,0.811769,0.014538,25
510
+ 13,"in the voluntary sector, providing services tackling health inequalities and promoting healthy",0.083837,0.485747,0.814994,0.014253,26
511
+ 13,living in Newcastle. I promoted health awareness through one to one sessions and in large,0.083434,0.501995,0.805723,0.014253,27
512
+ 13,"groups and developed interpersonal skills, confidence and patience when engaging and",0.083031,0.518529,0.787183,0.014253,28
513
+ 13,"motivating participants. While raising the group's profile using social media, the local press",0.083434,0.534778,0.804917,0.013968,29
514
+ 13,"and at presentations to youth clubs, faith meetings and care homes I recognised the need to",0.083434,0.551596,0.820637,0.013968,30
515
+ 13,"change my delivery style to suit the audience. As a volunteer teacher in Ghana, I developed",0.083434,0.56756,0.8158,0.014253,31
516
+ 13,communication and team-building skills essential to your advertised role; liaising with,0.083434,0.584094,0.753325,0.013968,32
517
+ 13,colleagues and parents and a lively group of twenty-five 7-8 year olds to arrange a,0.083434,0.600627,0.731963,0.014253,33
518
+ 13,"community event. My retail experience, coupled with my extracurricular activities additionally",0.083434,0.617161,0.822249,0.013968,34
519
+ 13,"enhanced my ability to develop others, as I was responsible for inducting and training my",0.083434,0.633409,0.79081,0.014253,35
520
+ 13,peers.,0.083837,0.652509,0.05401,0.011117,36
521
+ 13,"In relation to the fundraising and budgeting aspect of the role, I have experience of raising",0.08424,0.68244,0.798065,0.014253,37
522
+ 13,"substantial amounts of money through several successful charity events, including a well -",0.083031,0.698404,0.802096,0.014538,38
523
+ 13,attended fashion show. I was also elected Treasurer of NU Sociology Society with,0.083434,0.715222,0.728335,0.014253,39
524
+ 13,responsibility for managing a budget of £3000.,0.083434,0.731471,0.411528,0.014538,40
525
+ 13,The necessity to travel to identify community issues only adds to the appeal of the position. I,0.083031,0.758837,0.82104,0.014253,41
526
+ 13,"enjoy driving, hold a full clean driving licence and I am very interested in relocating to London",0.083434,0.775086,0.828295,0.014538,42
527
+ 13,to work for UK Health Trust.,0.083031,0.791619,0.247481,0.011688,43
528
+ 13,Thank you for considering my application. I look forward to hearing from you.,0.083434,0.824401,0.68158,0.014253,44
529
+ 13,Yours sincerely,0.082628,0.857184,0.138251,0.014253,45
530
+ 13,Rachel Sullivan,0.083837,0.889966,0.137042,0.011403,46
531
+ 14,SisterCities,0.169804,0.033333,0.238431,0.028182,1
532
+ 14,Partnership Agreement,0.516078,0.027879,0.440784,0.032424,2
533
+ 14,INTERNATIONAL,0.170196,0.06697,0.237647,0.008788,3
534
+ 14,Connect globally. Thrive locally.,0.169804,0.08697,0.238824,0.01303,4
535
+ 14,Toolkit,0.830588,0.07303,0.126667,0.025152,5
536
+ 14,Types of Affiliations,0.117255,0.157576,0.241961,0.02,6
537
+ 14,Sister City Relationship,0.117647,0.187273,0.196863,0.013939,7
538
+ 14,"A Sister City relationship is formed when the mayor or highest elected official (or, if elections",0.117255,0.211212,0.738824,0.013636,8
539
+ 14,"do not take place, highest appointed official) from a U.S. community and a community in",0.117647,0.227273,0.70902,0.013939,9
540
+ 14,another country or territory sign a formal agreement on behalf of their communities endorsing a,0.117647,0.243636,0.761961,0.013636,10
541
+ 14,"""sister city/sister cities"" relationship. Sister city agreements shall be considered active/valid",0.118039,0.259697,0.731373,0.013939,11
542
+ 14,unless otherwise indicated by one or both of the respective communities.,0.118039,0.276061,0.58549,0.013636,12
543
+ 14,Sister Cities International shall formally recognize only those relationships by cities/members in,0.118039,0.299697,0.758824,0.013636,13
544
+ 14,good standing (i.e. who are current on membership dues) in its Membership Directory or on its,0.117647,0.316061,0.754902,0.013636,14
545
+ 14,"website. However, Sister Cities International shall not assert as invalid or otherwise impugn the",0.116863,0.332121,0.760784,0.013636,15
546
+ 14,legitimacy of those relationships formed by non-members.,0.118039,0.348485,0.466275,0.013636,16
547
+ 14,Friendship City,0.118039,0.372121,0.127059,0.013939,17
548
+ 14,"A Friendship City or Friendship Cities relationship is often formed by cities as a ""stepping",0.117255,0.395758,0.714118,0.013636,18
549
+ 14,"stone"" to a more formal ""Sister City"" agreement. Typically Friendship City agreements are",0.117647,0.411515,0.720392,0.014242,19
550
+ 14,referred to as such in the formal documents that are signed. Sister Cities International shall,0.118039,0.428182,0.72549,0.013636,20
551
+ 14,recognize Friendship City relationships by members in its Membership Directory and website.,0.118039,0.444242,0.747843,0.013636,21
552
+ 14,As per Sister Cities International Board of Directors:,0.117255,0.467879,0.413333,0.013636,22
553
+ 14,Sister Cities International will recognize a new sister cities affiliation between a,0.169412,0.492121,0.626667,0.013333,23
554
+ 14,"U.S. and an international community, even though another affiliation may exist",0.169412,0.507879,0.625098,0.013636,24
555
+ 14,"between that international community and a different U.S. community, only if a",0.169412,0.524545,0.62902,0.013636,25
556
+ 14,cooperative agreement among all involved communities is filed with Sister Cities,0.16902,0.540606,0.643137,0.013636,26
557
+ 14,"International. If a cooperative agreement is denied, or no response to the request",0.170196,0.556667,0.647843,0.013333,27
558
+ 14,"is received within a reasonable amount of time, Sister Cities International will",0.169412,0.57303,0.612157,0.012727,28
559
+ 14,recognize the partnership as a friendship city and it will be delineated as such,0.169412,0.589091,0.621176,0.013636,29
560
+ 14,with a symbol in the membership directories.,0.168627,0.605455,0.358824,0.013333,30
561
+ 14,The cooperative agreement must be sent by the Mayor/County,0.168627,0.628788,0.509412,0.013939,31
562
+ 14,"Executive/Governor of the requesting community, and must be sent to the",0.169804,0.645152,0.595294,0.014242,32
563
+ 14,Mayor/County Executive/Governor of each of the existing partnership,0.169804,0.661212,0.555294,0.013636,33
564
+ 14,communities. Although the Mayor/County Executive/Governor may request input,0.16902,0.677879,0.647451,0.013636,34
565
+ 14,"from, or may be given input by, the sister cities program, it is up to the discretion",0.168627,0.693939,0.647059,0.013939,35
566
+ 14,of the Mayor/County Executive/Governor to sign the cooperative agreement.,0.16902,0.709697,0.612941,0.013939,36
567
+ 14,Although Sister Cities International will help with the cooperative agreement,0.168627,0.726364,0.605882,0.013636,37
568
+ 14,"process, it is up to the requesting community to get the agreement signed. Sister",0.169412,0.742121,0.650196,0.013939,38
569
+ 14,"Cities International will not, in any way, force a community to ""share"" and sign",0.16902,0.758182,0.623922,0.014242,39
570
+ 14,the cooperative agreement.,0.168627,0.774848,0.219216,0.013333,40
571
+ 14,"To place a relationship into Emeritus status, the mayor or highest elected official of the U.S.",0.117255,0.798485,0.736471,0.013939,41
572
+ 14,community must write a letter to the mayor of the foreign city indicating that they wish to,0.118039,0.814545,0.70902,0.013636,42
573
+ 14,"remain sister cities, but understand that the relationship will remain inactive until such time as",0.118039,0.831212,0.747451,0.013333,43
574
+ 14,both cities are able to sustain an active relationship. Sister Cities International should be,0.118039,0.847273,0.705098,0.013636,44
575
+ 14,informed in writing by the mayor of the U.S. city of the situation. Sister Cities International will,0.118039,0.863333,0.746275,0.013636,45
576
+ 15,SisterCities,0.169804,0.033333,0.238824,0.028182,1
577
+ 15,Partnership Agreement,0.516078,0.027879,0.440784,0.032424,2
578
+ 15,INTERNATIONAL,0.170196,0.06697,0.237647,0.008788,3
579
+ 15,Connect globally. Thrive locally.,0.169804,0.08697,0.239216,0.01303,4
580
+ 15,Toolkit,0.83098,0.072727,0.127059,0.025455,5
581
+ 15,then place the partnership into Emeritus Status and will reflect this status in directories and all,0.117255,0.132424,0.751373,0.013333,6
582
+ 15,lists of sister city programs.,0.118039,0.148788,0.218431,0.013333,7
583
+ 15,"If a community wishes to terminate a sister city relationship, then a letter from the mayor or",0.118431,0.172424,0.732549,0.013333,8
584
+ 15,highest elected official of the U.S. city should be sent to the mayor of the sister city. Sister,0.118039,0.188485,0.721569,0.013636,9
585
+ 15,Cities International should be informed of this action in writing by the mayor of the U.S. city,0.118039,0.204848,0.72902,0.013333,10
586
+ 15,and Sister Cities International will then remove the partnership from its directories and all lists,0.117647,0.221212,0.746275,0.013333,11
587
+ 15,of sister city programs. We do not recommend terminating a relationship simply because it is,0.117647,0.237273,0.743529,0.013333,12
588
+ 15,"dormant. Many partnerships wax and wane over the years, and in many cases a dormant",0.117647,0.253939,0.713333,0.013333,13
589
+ 15,partnership may be reinvigorated by local members years after it has been inactive.,0.118039,0.269697,0.664314,0.013636,14
590
+ 15,General Guidelines,0.118039,0.295152,0.231765,0.016061,15
591
+ 15,In order for a sister city/county/state partnership to be recognized by Sister Cities International,0.118431,0.324242,0.754902,0.013636,16
592
+ 15,"(SCI), the two communities must sign formal documents which clearly endorse the link. This",0.118039,0.340606,0.74,0.013636,17
593
+ 15,presumes several key items: that the U.S. community is already a member of SCI and has,0.118039,0.35697,0.718039,0.013636,18
594
+ 15,followed proper procedures (e.g. passed a city council resolution declaring the intent to twin,0.117255,0.373333,0.737647,0.013636,19
595
+ 15,with the specific city); that both communities share a mutual commitment to the relationship;,0.117255,0.389394,0.740784,0.013636,20
596
+ 15,and that both have secured the necessary support structure to build a lasting relationship. You,0.117647,0.405455,0.758039,0.013333,21
597
+ 15,should check with your local sister city program to see if they have any additional requirements,0.117647,0.421818,0.760784,0.013636,22
598
+ 15,before pursuing a sister city relationship.,0.118039,0.437879,0.323137,0.013636,23
599
+ 15,"SCI often refers to these agreements as a ""Sister City Agreement"" or ""Memorandum of",0.118039,0.461515,0.696863,0.013939,24
600
+ 15,"Understanding."" However, as the following examples show, the actual name and format of",0.118039,0.477576,0.729804,0.013636,25
601
+ 15,your documents is left up to you.,0.117255,0.494242,0.262745,0.013636,26
602
+ 15,A few things to keep in mind as you draft your agreement:,0.117255,0.517879,0.463137,0.013636,27
603
+ 15,"Your agreement can range from the ceremonial, with language focusing on each city's",0.176471,0.542121,0.69098,0.013939,28
604
+ 15,"commitment to fostering understanding, cooperation, and mutual benefit to the precise,",0.176471,0.558485,0.701961,0.013333,29
605
+ 15,"with particular areas of interest, specific programs/activities, or more concrete goals",0.176078,0.574848,0.673725,0.013636,30
606
+ 15,related to anything from numbers of exchanges to economic development.,0.176863,0.591212,0.596863,0.013636,31
607
+ 15,"Don't try to include everything you plan to do. Some specifics, like particular areas of",0.177255,0.620303,0.681176,0.013939,32
608
+ 15,"interest or participating institutions are good to include. However, there's no need to",0.176471,0.636667,0.675686,0.013636,33
609
+ 15,include all the programs you plan to do if it makes the document too lengthy or limits,0.176863,0.652727,0.678824,0.013939,34
610
+ 15,the scope of projects. This is a formal document to establish the relationship; specific,0.176078,0.668788,0.684706,0.013636,35
611
+ 15,"tasks, responsibilities, or other nuts-and-bolts text related to implementation or",0.176078,0.685455,0.635686,0.013333,36
612
+ 15,administration of the partnership can be expressed more fully in a separate,0.176471,0.701212,0.600392,0.013636,37
613
+ 15,memorandum between the respective sister city committees. Your partnership,0.177255,0.717576,0.626667,0.013636,38
614
+ 15,agreement is a historical document and should not be dated or limited by being aligned,0.176471,0.733636,0.699216,0.013636,39
615
+ 15,with very specific tasks.,0.176078,0.750606,0.190196,0.013333,40
616
+ 15,Work with your counterparts. Remember that this is signed by both cities. You should,0.176078,0.779697,0.68549,0.013636,41
617
+ 15,share drafts of your agreement with your international partners and solicit feedback on,0.176471,0.795758,0.691765,0.013333,42
618
+ 15,what they'd like to see in the agreement. Be flexible to cultural or municipal priorities.,0.176471,0.811818,0.679216,0.013939,43
619
+ 15,Ask your counterparts to translate the agreement if it is drafted in English. It is,0.176078,0.841515,0.623137,0.013636,44
620
+ 15,important for the citizens of your partner community to be able to read and understand,0.176863,0.857576,0.693725,0.013939,45
621
+ 15,the commitment their city has made. Have someone in your own community who,0.176078,0.873939,0.649804,0.013636,46
622
+ 16,SisterCities,0.169804,0.033333,0.239216,0.028182,1
623
+ 16,Partnership Agreement,0.516078,0.027879,0.441176,0.032121,2
624
+ 16,INTERNATIONAL,0.170196,0.06697,0.237255,0.008788,3
625
+ 16,Connect globally. Thrive locally.,0.169804,0.08697,0.239216,0.01303,4
626
+ 16,Toolkit,0.83098,0.07303,0.126667,0.025152,5
627
+ 16,speaks that language check the foreign-language version to make sure it mirrors what,0.176471,0.132424,0.688235,0.013333,6
628
+ 16,you have in your own agreement.,0.176471,0.148788,0.264706,0.013333,7
629
+ 16,Keep it to one page. Ceremonial documents such as these partnership agreements,0.176863,0.178485,0.66549,0.013636,8
630
+ 16,work best if they can be posted in their entirety.,0.176078,0.194545,0.380392,0.013636,9
631
+ 16,Most sister city agreements include some acknowledgement of the founding principles,0.177255,0.224242,0.694902,0.013636,10
632
+ 16,"of the sister city movement- to promote peace through mutual respect, understanding,",0.176471,0.240303,0.698431,0.013333,11
633
+ 16,and cooperation.,0.176471,0.25697,0.13451,0.013333,12
634
+ 16,Consider using official letterhead and/or other embellishments such as city seals or,0.176863,0.286061,0.665882,0.013333,13
635
+ 16,logos to reflect your enhance the document. Sister city agreements are often posted at,0.176863,0.302121,0.695686,0.013636,14
636
+ 16,city hall or other municipal offices and should reflect their historical importance,0.176471,0.318485,0.630588,0.013333,15
637
+ 16,Look at other agreements your city has signed. These agreements may give you an idea,0.177255,0.347879,0.705098,0.013636,16
638
+ 16,"of what is acceptable or possible, and they may be in an easily replicable format. If you",0.176471,0.364242,0.695686,0.013636,17
639
+ 16,"cannot access older agreements please contact Sister Cities International, we may",0.176863,0.380303,0.663137,0.013636,18
640
+ 16,"have them on file, although we do not have copies of all partnership agreements.",0.176863,0.396667,0.64549,0.013636,19
641
+ 16,Documents must be signed by the top elected official of both communities.,0.177255,0.426364,0.601569,0.013333,20
642
+ 16,"Check with your mayor, city council, town clerk, et al. to make sure that the agreement",0.176863,0.455758,0.694118,0.013636,21
643
+ 16,"is OK with them. The mayor is the one putting his or her name on the paper, and you",0.176863,0.471818,0.677255,0.013333,22
644
+ 16,don't want to spend time developing an agreement which will never be signed.,0.176863,0.488182,0.629412,0.013636,23
645
+ 16,Official documents are usually signed during a formal ceremony recognizing the,0.176863,0.517576,0.638431,0.013636,24
646
+ 16,partnership. Be sure both communities receive a signed set of the official documents,0.177255,0.533939,0.683922,0.013636,25
647
+ 16,for their records.,0.176078,0.550606,0.131373,0.010606,26
648
+ 16,Remember to send your signed agreement to Sister Cities International. After we,0.177255,0.579697,0.645098,0.013636,27
649
+ 16,receive your agreement we will post the relationship in the City Directory and make sure,0.176863,0.595758,0.703137,0.013636,28
650
+ 16,it is included in our Annual Membership Directory.,0.176863,0.612121,0.398039,0.013333,29
651
+ 16,Remember that each city's sister city program is independent and can impose requirements,0.118431,0.640606,0.736471,0.013939,30
652
+ 16,"like the establishment of a committee, a review period, sustainability/funding plan, among",0.118039,0.65697,0.715686,0.013636,31
653
+ 16,"others, before sanctioning a sister city agreement. Check with your local program or mayor's",0.117647,0.672727,0.743529,0.014242,32
654
+ 16,office to see if this is the case.,0.117647,0.689091,0.241176,0.011515,33
655
+ 16,On the following pages you'll find a series of partnership agreements to give you an idea of,0.118039,0.717879,0.728627,0.013939,34
656
+ 16,"what is possible. While you should feel free to use some of the formatting and language, we",0.117255,0.734242,0.73451,0.013636,35
657
+ 16,encourage you to make your agreement your own and be creative with what you produce. If,0.117647,0.750606,0.737647,0.013636,36
658
+ 16,you are unsure about your agreement or want advice you can always solicit feedback by,0.117647,0.766667,0.708627,0.013636,37
659
+ 16,sending it to our Membership Director at akaplan@sister-cities.org or contacting us at (202),0.117647,0.782727,0.732157,0.013636,38
660
+ 16,347-8630.,0.117647,0.799394,0.080392,0.010303,39
661
+ 17,SisterCities,0.169412,0.033333,0.239608,0.028485,1
662
+ 17,Partnership Agreement,0.516471,0.027879,0.440784,0.032727,2
663
+ 17,INTERNATIONAL,0.170196,0.066667,0.238431,0.009091,3
664
+ 17,Connect globally. Thrive locally.,0.169412,0.08697,0.239608,0.013333,4
665
+ 17,Toolkit,0.830588,0.072727,0.127843,0.025758,5
666
+ 17,"jull bubzig 2000 3,312",0.378039,0.291212,0.32549,0.019394,6
667
+ 17,ABU DHABI MUNICIPALITY & TOWN PLANNING,0.376471,0.316667,0.327451,0.016667,7
668
+ 17,AN AGREEMENT FOR THE ESTABLISHMENT OF,0.260784,0.373636,0.52549,0.012727,8
669
+ 17,SISTER CITIES RELATIONSHIP,0.337647,0.393636,0.342745,0.012121,9
670
+ 17,BETWEEN,0.454902,0.413636,0.110588,0.011212,10
671
+ 17,THE CITY OF ABU DHABI ( U. A.E),0.337255,0.432727,0.375686,0.013939,11
672
+ 17,AND,0.487843,0.452727,0.048235,0.011212,12
673
+ 17,"HOUSTON, TEXAS ( U.S.A)",0.385882,0.471515,0.298039,0.014848,13
674
+ 17,"The Sister City Program, administered by Sister Cities International, was initiated",0.221961,0.525455,0.597255,0.01303,14
675
+ 17,By the President of the United States of America in 1956 to encourage greater,0.222745,0.539394,0.561961,0.012727,15
676
+ 17,Friendship and understanding between the United States and other nations through,0.222745,0.553333,0.608235,0.012727,16
677
+ 17,Direct personal contact: and,0.222745,0.567576,0.20549,0.012424,17
678
+ 17,"In order to foster those goals, the people of Abu Dhabi and Houston, in a gesture of",0.222353,0.594242,0.603529,0.012424,18
679
+ 17,"Friendship and goodwill, agree to collaborate for the mutual benefit of their",0.222745,0.608182,0.547843,0.01303,19
680
+ 17,"Communities by exploring education, economic and cultural opportunities.",0.222353,0.622121,0.541961,0.012121,20
681
+ 17,"Abu Dhabi and Houston, sharing a common interest in energy, technology and",0.221569,0.648788,0.574118,0.012424,21
682
+ 17,"medicine, and the desire to promote mutual understanding among our citizens do",0.222353,0.66303,0.588235,0.012121,22
683
+ 17,"hereby proclaim themselves Sister Cities beginning on the 13th day of March 2001,",0.221961,0.673636,0.594118,0.015758,23
684
+ 17,the date of Houston City Council resolution estatblishing the Sister City,0.221961,0.690303,0.519608,0.01303,24
685
+ 17,relationship became effective.,0.221569,0.705152,0.217647,0.012424,25
686
+ 17,"Signed on this 26 of October 2002, in duplicate in the Arabic and English",0.221569,0.732121,0.533333,0.01303,26
687
+ 17,"Languages, both text being equally authentic.",0.221961,0.746667,0.328627,0.012727,27
688
+ 17,A,0.344314,0.768485,0.084706,0.030303,28
689
+ 17,Sheikh Mohammed bin Butti AI Hamed,0.245882,0.806364,0.366275,0.010909,29
690
+ 17,Lee P.Brown,0.729412,0.806364,0.118824,0.010303,30
691
+ 17,Chairman of Abu Dhabi Municipality,0.24549,0.823636,0.342353,0.012727,31
692
+ 17,Mayor of Houston,0.704706,0.823333,0.166667,0.012424,32
693
+ 17,&Town Planning,0.324314,0.841212,0.155686,0.012424,33
694
+ 18,SisterCities,0.169412,0.033333,0.239608,0.028485,1
695
+ 18,Partnership Agreement,0.516078,0.027879,0.441176,0.032424,2
696
+ 18,INTERNATIONAL,0.17098,0.066667,0.237255,0.009091,3
697
+ 18,Connect globally. Thrive locally.,0.169412,0.08697,0.239216,0.013333,4
698
+ 18,Toolkit,0.83098,0.072727,0.127059,0.025758,5
699
+ 18,THE CITY OF NEW YORK,0.438824,0.262121,0.240784,0.009697,6
700
+ 18,OFFICE OF THE MAYOR,0.450196,0.27697,0.220392,0.009697,7
701
+ 18,"NEW YORK, N.Y. 10007",0.461176,0.29303,0.196863,0.010303,8
702
+ 18,THE NEW YORK CITY-LONDON SISTER CITY PARTNERSHIP,0.267451,0.355758,0.582745,0.011818,9
703
+ 18,Memorandum of Understanding,0.420392,0.371212,0.274902,0.013333,10
704
+ 18,The Sister City partnership between New York City and London will foster mutually,0.201176,0.402121,0.674118,0.014242,11
705
+ 18,beneficial solutions to common challenges for these two great cosmopolitan entities.,0.201176,0.417273,0.66902,0.013636,12
706
+ 18,"Consequently, the Sister City relationship between the two will be one of the most",0.201176,0.432727,0.652549,0.015152,13
707
+ 18,"important in their network of global partnerships, as it strives to:",0.201176,0.448182,0.50902,0.015455,14
708
+ 18,Encourage and publicize existing exchanges between London and New York City so,0.230588,0.480303,0.671373,0.015152,15
709
+ 18,that they can flourish to benefit a wider cross-section of the citizens of both;,0.230588,0.496061,0.602353,0.015152,16
710
+ 18,"Support and promote the development of new social, economic, academic and",0.230196,0.512424,0.618431,0.015455,17
711
+ 18,community programs to encourage both cities' citizens to share their experiences as a,0.229804,0.527879,0.678039,0.014848,18
712
+ 18,medium for learning from one another;,0.229804,0.543636,0.309412,0.013939,19
713
+ 18,Generate an improvement of the operation of the cities' various government agencies,0.229804,0.56,0.676078,0.014545,20
714
+ 18,by serving as a conduit of information;,0.22902,0.575758,0.307843,0.014848,21
715
+ 18,"Identify themes, common to both, that can generate new initiatives to further and",0.229412,0.591818,0.640784,0.015152,22
716
+ 18,"nurture the increasingly powerful financial, social and cultural relationships between",0.22902,0.607576,0.671373,0.014242,23
717
+ 18,the cities;,0.22902,0.624545,0.076471,0.012424,24
718
+ 18,Promote key mayoral priorities relevant to both London and New York City;,0.228627,0.639394,0.608627,0.015152,25
719
+ 18,Provide financial or in kind support to community-led programs that advance the,0.228627,0.656061,0.641569,0.013636,26
720
+ 18,aims of the Sister City partnership;,0.22902,0.672121,0.275294,0.013636,27
721
+ 18,"With the above purposes in mind, the Mayor of the City of New York and the Mayor of",0.198824,0.702424,0.697647,0.014848,28
722
+ 18,London solemnly confirm that these two cities are united by an official partnership by the,0.198824,0.718182,0.710196,0.014545,29
723
+ 18,protocol of this Memorandum of Understanding.,0.198431,0.733939,0.384314,0.015152,30
724
+ 18,This agreement will go into effect from the date of signatures.,0.310196,0.780606,0.488235,0.014545,31
725
+ 18,Signed in March of 2001,0.455686,0.796364,0.19451,0.013636,32
726
+ 18,Thedder Rudolph W. Giuliani,0.178824,0.795455,0.244314,0.100909,33
727
+ 18,Mayor,0.311373,0.894848,0.053333,0.012727,34
728
+ 18,Ken Mayor Livingstone,0.672157,0.877576,0.132941,0.029091,35
729
+ 18,New York City,0.287843,0.909091,0.121176,0.013333,36
730
+ 18,London,0.701961,0.909091,0.061569,0.010606,37
731
+ 19,SisterCities,0.169412,0.03303,0.24,0.028182,1
732
+ 19,Partnership Agreement,0.515686,0.027576,0.441961,0.03303,2
733
+ 19,INTERNATIONAL,0.169804,0.066667,0.238431,0.009091,3
734
+ 19,Connect globally. Thrive locally.,0.169412,0.08697,0.239608,0.013333,4
735
+ 19,Toolkit,0.83098,0.072727,0.127451,0.025758,5
736
+ 19,CHIC OF STATE,0.247451,0.190606,0.141961,0.036364,6
737
+ 19,City of Long Beach,0.388627,0.196667,0.476471,0.066364,7
738
+ 19,California,0.551373,0.257273,0.136471,0.033333,8
739
+ 19,Sister City Agreement,0.321961,0.305455,0.378431,0.035152,9
740
+ 19,between the,0.464706,0.352727,0.084314,0.009697,10
741
+ 19,City of Long Beach,0.38,0.378485,0.252549,0.01697,11
742
+ 19,"California, USA",0.4,0.397576,0.21098,0.016061,12
743
+ 19,and the,0.48,0.415152,0.053333,0.009091,13
744
+ 19,City of San Pablo de Manta,0.321569,0.428788,0.369804,0.01697,14
745
+ 19,"Ecuador, South America",0.347451,0.447879,0.317255,0.015152,15
746
+ 19,"In accordance with the authorization and approval expressed by the City of Long Beach,",0.261569,0.482121,0.536863,0.012121,16
747
+ 19,"California, USA, and the City of San Pablo de Manta, Ecundor, South America, it is declared",0.217647,0.492727,0.581176,0.01303,17
748
+ 19,"that a ""Sister City Agreement between the two cities is hereby established for the following",0.217647,0.502727,0.581569,0.012121,18
749
+ 19,purposes:,0.216863,0.516061,0.058039,0.009394,19
750
+ 19,(1) to promote and expand the effective and mutually beneficial cooperation between,0.278824,0.532727,0.520392,0.012424,20
751
+ 19,the people of Long Beach and the people of San Pablo de Manta; and,0.218039,0.543636,0.40549,0.012424,21
752
+ 19,"(2) to promote international goodwill, understanding, and expanded business",0.279216,0.56303,0.520784,0.012424,22
753
+ 19,"relations between the two cities and their respective nations by the exchange of people, ideas, and",0.218039,0.573636,0.581569,0.012121,23
754
+ 19,"information in a unide variety of economic, social, cultural, municipal, environmental,",0.218039,0.584242,0.581176,0.012121,24
755
+ 19,"professional, technical, youth, and other endeavors; and",0.217647,0.594848,0.333333,0.012121,25
756
+ 19,"(3) to foster and encourage charitable, scientific, trade and commerce, literary and",0.279608,0.613939,0.520784,0.012727,26
757
+ 19,educational activities between the two cities;,0.218039,0.625455,0.265882,0.009697,27
758
+ 19,This Sister City Agreement shall be officially established and shall become effective when,0.263137,0.644545,0.536863,0.012727,28
759
+ 19,"this document has been duly executed by the Mayor of Long Beach, California, USA, and the",0.218824,0.654848,0.581961,0.012424,29
760
+ 19,"Mayor of San Pablo de Manta, Ecundor, South America.",0.218431,0.665758,0.338824,0.012121,30
761
+ 19,STATE OFFICE,0.276471,0.713636,0.050588,0.048788,31
762
+ 19,Beverly 0 Neill,0.587451,0.736667,0.121961,0.013636,32
763
+ 19,"Mayor, City of Long Beach",0.542353,0.751212,0.21098,0.013636,33
764
+ 19,"California, USA",0.582745,0.765758,0.125098,0.01303,34
765
+ 19,10.2aulus,0.490588,0.771818,0.220392,0.062424,35
766
+ 19,Ing. Jorge O. Zambrano Cedeño,0.527059,0.825152,0.242745,0.013333,36
767
+ 19,"Mayor, City of San Pablo de Manta",0.505098,0.839394,0.277647,0.013636,37
768
+ 19,"Ecuador, South America",0.551765,0.854242,0.188235,0.011818,38
769
+ 19,"Dated: September 19, 2000",0.544706,0.883333,0.202745,0.01303,39
770
+ 20,SisterCities,0.169412,0.03303,0.24,0.028485,1
771
+ 20,Partnership Agreement,0.516078,0.027879,0.441176,0.032424,2
772
+ 20,INTERNATIONAL,0.170196,0.066667,0.237647,0.009091,3
773
+ 20,Connect globally. Thrive locally.,0.169412,0.08697,0.239216,0.013333,4
774
+ 20,Toolkit,0.83098,0.072727,0.127451,0.025758,5
775
+ 20,REAFFIRMATION OF SISTER CITIES DECLARATION,0.324706,0.165152,0.483529,0.013939,6
776
+ 20,adopted by,0.2,0.213333,0.080392,0.013636,7
777
+ 20,THE HONORABLE RICHARD M. DALEY,0.396078,0.214242,0.335686,0.012424,8
778
+ 20,MAYOR OF CHICAGO,0.472549,0.231212,0.18549,0.011515,9
779
+ 20,and,0.199608,0.260909,0.026275,0.010606,10
780
+ 20,THE HONORABLE ZHANG RONGMAO,0.401961,0.261212,0.323137,0.011212,11
781
+ 20,MAYOR OF SHENYANG,0.463529,0.273636,0.202353,0.011212,12
782
+ 20,ON,0.551765,0.298182,0.026667,0.011515,13
783
+ 20,"JUNE 5, 1995",0.500392,0.323636,0.128235,0.014848,14
784
+ 20,"On this the tenth anniversary of the signing of a sister city agreement, in order to further",0.255686,0.36303,0.67098,0.015152,15
785
+ 20,the traditional links of friendship between Chicago and Shenyang and to reaffirm their mutual,0.198824,0.378788,0.727843,0.015455,16
786
+ 20,"aspiration to work in unison for the benefit of their cities and nations, the Honorable Mayor",0.199608,0.394848,0.727843,0.014848,17
787
+ 20,"Richard M. Daley, Mayor of the City of Chicago, and the Honorable Zhang Rongmao, Mayor",0.199216,0.411212,0.727451,0.014242,18
788
+ 20,"of the City of Shenyang, on this fifth day of June 1995, do hereby acknowledge and reaffirm the",0.199216,0.42697,0.72549,0.014848,19
789
+ 20,sister cities agreement between the City of Chicago and the City of Shenyang.,0.199608,0.443636,0.57451,0.014242,20
790
+ 20,"The City of Chicago and the City of Shenyang on the basis of friendly cooperation,",0.256078,0.473939,0.665098,0.015152,21
791
+ 20,equality and mutual benefit will continue to develop a sister cities relationship to promote and,0.2,0.490303,0.724706,0.014242,22
792
+ 20,broaden economic cooperation and cultural exchanges between the two cities.,0.199216,0.506061,0.57451,0.014242,23
793
+ 20,The two cities do hereby declare their interest in exploring the establishment of business,0.255294,0.537273,0.668235,0.015455,24
794
+ 20,and trade relations between Chicago and Shenyang.,0.198824,0.554545,0.387843,0.013636,25
795
+ 20,"In addition, exchanges will be promoted in the area of the arts such as exhibits, music,",0.254118,0.583939,0.666667,0.015455,26
796
+ 20,dance and other cultural activities.,0.198431,0.601212,0.256471,0.010606,27
797
+ 20,"In addition, exchanges will be promoted in education and the establishment of contacts",0.254118,0.630303,0.668627,0.015758,28
798
+ 20,within educational institutions encouraged.,0.198824,0.647273,0.32,0.014242,29
799
+ 20,"In addition, we declare our intention to promote exchanges in such fields as science and",0.253725,0.678182,0.668627,0.014848,30
800
+ 20,"technology, sports, health, youth and any areas that will contribute to the prosperity and the",0.198039,0.693636,0.722745,0.015152,31
801
+ 20,further development of friendship between the people of our two cities.,0.194902,0.711515,0.525098,0.013636,32
802
+ 20,3h.5.,0.593725,0.750606,0.218039,0.06303,33
803
+ 20,THE HONORABLE RICHARD M. DALEY,0.197255,0.821515,0.303529,0.010606,34
804
+ 20,THE HONORABLE ZHANG RONGMAO,0.588627,0.819394,0.287843,0.011818,35
805
+ 20,MAYOR OF CHICAGO,0.195686,0.835758,0.164706,0.010606,36
806
+ 20,MAYOR OF SHENYANG,0.587451,0.835455,0.177647,0.010303,37
807
+ 21,Skills_based_CV.qxd 5/8/11 3:55 pm Page,0.17777,0.135381,0.308796,0.008545,1
808
+ 21,agcas,0.726169,0.191722,0.053368,0.011749,2
809
+ 21,Example of a skills-based CV,0.3894,0.205874,0.224144,0.011482,3
810
+ 21,ASHLEY GILL,0.459698,0.246195,0.082812,0.008278,4
811
+ 21,3 Lappage Court,0.2212,0.259012,0.080972,0.008545,5
812
+ 21,Telephone: 01882 652349,0.592565,0.259012,0.129555,0.008278,6
813
+ 21,"Tyler Green, Bucks.",0.220464,0.269159,0.092381,0.008278,7
814
+ 21,Mobile: 07717 121824,0.593669,0.269159,0.112992,0.006676,8
815
+ 21,HP8 4JD,0.2212,0.279306,0.040486,0.006409,9
816
+ 21,Email: ashleygill2023@gotmail.com,0.594038,0.279039,0.178874,0.008545,10
817
+ 21,Personal Details,0.221568,0.299332,0.095326,0.007744,11
818
+ 21,Summary,0.220832,0.321495,0.048215,0.008278,12
819
+ 21,Business studies with Spanish undergraduate.,0.273463,0.340988,0.229297,0.008812,13
820
+ 21,Ability to speak French and Spanish.,0.272727,0.351135,0.179242,0.008545,14
821
+ 21,Extensive business experience including an internship with Top Choice Holidays.,0.273095,0.361015,0.398233,0.008812,15
822
+ 21,Education And Qualifications,0.2212,0.381041,0.144277,0.008278,16
823
+ 21,2008 present,0.220832,0.401602,0.074715,0.008011,17
824
+ 21,Buckinghamshire Edge University,0.386824,0.401068,0.167096,0.008545,18
825
+ 21,BA International Business Studies with Spanish (expected 2:1),0.386824,0.410681,0.308796,0.008812,19
826
+ 21,Relate your degree to,0.230033,0.420027,0.100847,0.008278,20
827
+ 21,Study semester at The University of Valloid (Spain).,0.399338,0.420828,0.252852,0.008812,21
828
+ 21,the job by listing your,0.229665,0.429105,0.101583,0.008278,22
829
+ 21,Six-month work placement in Madrid.,0.399338,0.431242,0.188811,0.008545,23
830
+ 21,relevant modules/,0.230033,0.438718,0.085388,0.007744,24
831
+ 21,Relevant modules included: Business Planning; Sales Promotion and,0.399338,0.441389,0.338241,0.008545,25
832
+ 21,dissertation.,0.230033,0.448064,0.057784,0.006676,26
833
+ 21,Marketing; and Business Operations Management.,0.398969,0.451268,0.25322,0.008812,27
834
+ 21,2000 2007,0.2212,0.467824,0.061833,0.006409,28
835
+ 21,Freebridge School,0.386824,0.46729,0.087965,0.008545,29
836
+ 21,"A-Levels: Business Studies (B), French (C)",0.386088,0.476903,0.200221,0.008812,30
837
+ 21,"8 GCSEs including Maths, English, Spanish and French",0.386824,0.487583,0.266838,0.008545,31
838
+ 21,Work History,0.220832,0.509212,0.065513,0.008278,32
839
+ 21,2008 2011,0.220832,0.529506,0.061833,0.006409,33
840
+ 21,Buckinghamshire Edge University Librarian/tour guide,0.386824,0.528972,0.277144,0.008812,34
841
+ 21,General administrative and customer service roles.,0.399338,0.539119,0.25138,0.006676,35
842
+ 21,Briefly list,0.707766,0.536716,0.045639,0.008011,36
843
+ 21,your relevant,0.70703,0.546061,0.061465,0.008011,37
844
+ 21,2011 (Feb-Aug),0.2212,0.55514,0.078027,0.008812,38
845
+ 21,Audigest S.A. (Madrid) - Audit Assistant,0.386456,0.554873,0.199485,0.009079,39
846
+ 21,duties.,0.707398,0.555674,0.030916,0.006409,40
847
+ 21,Six months' work experience in an international bank.,0.399338,0.565287,0.267575,0.008545,41
848
+ 21,Liaising with colleagues and clients in English and Spanish.,0.399338,0.575434,0.292602,0.008545,42
849
+ 21,2010 (June-Dec),0.220832,0.591188,0.082444,0.008278,43
850
+ 21,Finsbury's supermarket (Hazelbridge) — Supervisor,0.386824,0.591188,0.250644,0.008812,44
851
+ 21,Managing a small team.,0.398969,0.601602,0.121089,0.008545,45
852
+ 21,Customer service in a busy competitive environment.,0.398969,0.611215,0.264262,0.008545,46
853
+ 21,2010 (Jan-Aug),0.2212,0.627236,0.077291,0.008812,47
854
+ 21,Top Choice Holidays and Flights Ltd (Low Wycombe),0.386088,0.627503,0.257637,0.008812,48
855
+ 21,Financial Assistant/Supervisor,0.386824,0.637383,0.15127,0.008812,49
856
+ 21,Working in a range of teams to manage complex financial processes.,0.398969,0.64753,0.341921,0.008812,50
857
+ 21,2007 (Jul-Aug),0.220832,0.663284,0.074347,0.008812,51
858
+ 21,Dogs Protection League - General Assistant,0.386824,0.663818,0.216783,0.008812,52
859
+ 21,Dealing with enquiries and selling packages to a range of clients.,0.399706,0.673431,0.321678,0.009079,53
860
+ 21,2006 (Jan-Dec),0.220832,0.689453,0.076187,0.009079,54
861
+ 21,McHenry's Restaurant (Low Wycombe) - Supervisor,0.386456,0.68972,0.256533,0.009079,55
862
+ 21,Voluntary Experience,0.220464,0.708411,0.106367,0.008545,56
863
+ 21,2007/2011,0.220832,0.728438,0.055208,0.008011,57
864
+ 21,Teaching English in Mexico/Spain,0.386088,0.727904,0.167832,0.009079,58
865
+ 21,Interests,0.2212,0.748465,0.043062,0.006676,59
866
+ 21,Active member of University Business Club — Winner of the 'Bucks Best Business Pitch' award in 2010 Enterprise,0.220464,0.768224,0.556864,0.009079,60
867
+ 21,"week, judged by Michael Eavis.",0.220464,0.778104,0.15311,0.008812,61
868
+ 22,Skills_based_CV.qxd 5/8/11 3:55 pm Page,0.17777,0.135381,0.308428,0.008545,1
869
+ 22,Make sure you carefully assess,0.468531,0.23498,0.142068,0.008011,2
870
+ 22,Skills And Achievements,0.220832,0.245394,0.121457,0.006676,3
871
+ 22,the job advert/job description,0.468163,0.244326,0.139124,0.008278,4
872
+ 22,and address all the skills they,0.468531,0.253672,0.13618,0.008278,5
873
+ 22,Effective communication,0.2212,0.265421,0.123298,0.006676,6
874
+ 22,require.,0.468531,0.263017,0.034965,0.008011,7
875
+ 22,"Able to communicate effectively with a wide range of clients and colleagues, by showing interest, carefully",0.233714,0.275567,0.530364,0.008545,8
876
+ 22,"listening to needs and appropriately adjusting my message, as demonstrated during my time at Finsbury's",0.23445,0.285447,0.528892,0.008812,9
877
+ 22,Supermarket.,0.234082,0.295861,0.066618,0.008278,10
878
+ 22,Strong presentation skills and confidence demonstrated by experience of delivering presentations in different,0.23445,0.305474,0.543614,0.008812,11
879
+ 22,languages to groups of five to fifty.,0.234082,0.315621,0.172617,0.008812,12
880
+ 22,Customer service,0.220832,0.335915,0.085388,0.006676,13
881
+ 22,Ability to quickly build rapport with customers and calmly deal with any problems as shown during my retail,0.233714,0.345527,0.541038,0.008812,14
882
+ 22,experience in high pressure environments.,0.234082,0.355941,0.210526,0.008278,15
883
+ 22,"Capacity to maintain professional relationships through email and other written correspondence, for example,",0.234082,0.365554,0.548767,0.008812,16
884
+ 22,"at Audigest in Madrid, where I built longstanding business relationships with customers and colleagues across",0.233714,0.375701,0.549871,0.008812,17
885
+ 22,the globe.,0.233714,0.385848,0.049687,0.008278,18
886
+ 22,Teamwork,0.220464,0.406142,0.052632,0.006409,19
887
+ 22,"At Top Choice Holidays demonstrated excellent teamwork skills in a busy financial environment, such as an",0.233346,0.415754,0.532573,0.008812,20
888
+ 22,"ability to listen to clients and managers, perform my role to a high level and support colleagues, resulting in",0.234082,0.425634,0.535885,0.008812,21
889
+ 22,promotion.,0.234082,0.436048,0.05484,0.008545,22
890
+ 22,Administration,0.220464,0.456075,0.075083,0.006409,23
891
+ 22,Prove you have each of the,0.639676,0.453672,0.123666,0.008278,24
892
+ 22,"Excellent ability to plan ahead and manage time effectively, for example,",0.23445,0.465688,0.360692,0.008812,25
893
+ 22,skills required by outlining,0.63894,0.463017,0.12293,0.008278,26
894
+ 22,managing complex roles during my internship at Top Choice Holidays.,0.23445,0.476101,0.346338,0.008545,27
895
+ 22,where you performed them,0.63894,0.472363,0.128082,0.008278,28
896
+ 22,Gathered data from a wide range of sources during my dissertation,0.234082,0.485714,0.334928,0.008812,29
897
+ 22,and how you performed,0.639308,0.481709,0.111888,0.008278,30
898
+ 22,them well.,0.63894,0.491055,0.048951,0.006409,31
899
+ 22,"whilst balancing my other studies and two jobs, resulting in a 73% grade.",0.233346,0.495861,0.365109,0.008812,32
900
+ 22,Experience of travellers' needs,0.2212,0.515888,0.150534,0.008545,33
901
+ 22,Recent travel consultancy experience gives me an in-depth understanding of the expectations of holiday,0.23445,0.525768,0.518955,0.008812,34
902
+ 22,customers and the competitive nature of the industry.,0.234082,0.535915,0.269047,0.008812,35
903
+ 22,International travel experience and language ability give me an empathy with travellers and a passion for,0.234082,0.545794,0.524107,0.008812,36
904
+ 22,helping them find a unique holiday experience.,0.234082,0.555941,0.23445,0.008812,37
905
+ 22,Initiative,0.2212,0.576235,0.044166,0.006676,38
906
+ 22,Self-funding an evening course in bookkeeping during my first accountancy role demonstrated my ability to,0.234082,0.585848,0.535149,0.008812,39
907
+ 22,plan ahead and take control of my career.,0.23445,0.595995,0.205006,0.008545,40
908
+ 22,Successful study and work in Spain and Mexico show that I can creatively develop my skills and experience and,0.234082,0.605874,0.551711,0.008545,41
909
+ 22,adapt to new and different environments.,0.234082,0.616288,0.208686,0.008278,42
910
+ 22,Sales knowledge,0.220464,0.636315,0.083916,0.008011,43
911
+ 22,Wide experience of financial roles gives me an awareness of the tight monetary pressures which drive UK,0.234082,0.645928,0.525212,0.009346,44
912
+ 22,service industries.,0.234082,0.656609,0.088333,0.006943,45
913
+ 22,Raised sales at The Dogs Protection League by 12% by up selling add-on packages to new and existing,0.23445,0.665955,0.505705,0.009079,46
914
+ 22,customers.,0.234082,0.67717,0.054472,0.006142,47
915
+ 22,Language ability,0.2212,0.696395,0.082444,0.008812,48
916
+ 22,"Spanish fluency obtained working overseas, French semi-fluent.",0.233714,0.706008,0.323151,0.009079,49
917
+ 22,Referees,0.2212,0.726569,0.041958,0.006676,50
918
+ 22,Include all your referee details including their email and,0.351859,0.722029,0.259109,0.008545,51
919
+ 22,phone number (but ask for their permission first).,0.352227,0.731108,0.230401,0.008545,52
920
+ 22,"Professional: Mr. Jose Andreas, Management Accountant, Audigest, Avenida de Concha Espina 2, Madrid, ES-",0.2212,0.746328,0.537725,0.008812,53
921
+ 22,"28036, +34 91 398 5476, j.andreas@audigest.es",0.2212,0.756475,0.238498,0.008278,54
922
+ 22,"Academic: Dr. Jane Luffle, Personal Tutor, Buckinghamshire Edge University, Due Road, Low Wycombe, Bucks,",0.220464,0.776502,0.536621,0.008812,55
923
+ 22,"HD15 3DL, 01628 435 6784, j.luffle@bedge.ac.uk",0.2212,0.786382,0.244755,0.008545,56
example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_ocr_output_textract.csv ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ page,text,left,top,width,height,line
2
+ 1,Example of emails sent to a professor before applying:,0.147059,0.093434,0.426471,0.013889,1
3
+ 1,Fwd: Prospective Graduate Student,0.145425,0.128788,0.277778,0.013889,2
4
+ 1,"Dr. Kornbluth,",0.147059,0.162879,0.114379,0.012626,3
5
+ 1,I am a senior biology major at the University of Notre Dame. I am applying to the CMB,0.147059,0.198232,0.689542,0.013889,4
6
+ 1,program and am very interested in your work. After glancing at a few of your recent,0.145425,0.214646,0.660131,0.013889,5
7
+ 1,papers and your research summary I find your work with apoptosis very interesting. Will,0.145425,0.232323,0.697712,0.013889,6
8
+ 1,"you be taking on new students next year? If I am invited to interview, is there any way",0.145425,0.25,0.683007,0.013889,7
9
+ 1,you will be able to meet with me?,0.145425,0.267677,0.264706,0.013889,8
10
+ 1,I have worked on several different research projects as an undergraduate in Dr. David R.,0.147059,0.30303,0.69281,0.013889,9
11
+ 1,Hyde's lab at the University of Notre Dame. The Hyde lab is interested in the signals that,0.147059,0.320707,0.697712,0.013889,10
12
+ 1,initiate Muller glia division post-light damage. My first research project was,0.147059,0.338384,0.598039,0.013889,11
13
+ 1,characterizing the role of leukemia inhibitory factor (LIF) in the activation of cell,0.147059,0.354798,0.637255,0.013889,12
14
+ 1,proliferation in the undamaged zebrafish retina. I am also working on several,0.145425,0.372475,0.604575,0.013889,13
15
+ 1,experiments that are related to a genetic screen that the Hyde lab plans on performing to,0.145425,0.390152,0.689542,0.013889,14
16
+ 1,identify mutants in the regeneration pathway--I am developing a neuroD4:EGFP,0.147059,0.407828,0.635621,0.013889,15
17
+ 1,transgenic line for use in this screen and I am characterizing the extent of damage and,0.145425,0.425505,0.673203,0.013889,16
18
+ 1,"regeneration in sheer zebrafish retinas. Finally, I am characterizing the chx10:EGFP",0.145425,0.443182,0.661765,0.013889,17
19
+ 1,transgenic line during retinal development and regeneration.,0.145425,0.459596,0.472222,0.013889,18
20
+ 1,Please find my CV attached.,0.145425,0.496212,0.222222,0.013889,19
21
+ 1,"Thank you for your time,",0.145425,0.531566,0.196078,0.013889,20
22
+ 1,--Lauren Lilley,0.147059,0.566919,0.119281,0.013889,21
23
+ 1,"Dr. Poss,",0.145425,0.637626,0.070261,0.012626,22
24
+ 1,I am a senior biology major at the University of Notre Dame. I am applying to your,0.145425,0.671717,0.655229,0.013889,23
25
+ 1,graduate program and am very interested in your work. After glancing at a few of your,0.145425,0.689394,0.679739,0.013889,24
26
+ 1,recent papers and your research summary I find your research greatly coincides with my,0.145425,0.707071,0.69281,0.013889,25
27
+ 1,research experiences and interests. Will you be taking on new students next year?,0.145425,0.723485,0.643791,0.015152,26
28
+ 1,I have worked on several different research projects as an undergraduate in Dr. David R.,0.145425,0.760101,0.69281,0.013889,27
29
+ 1,Hyde's lab at the University of Notre Dame. The Hyde lab is interested in the signals that,0.145425,0.777778,0.699346,0.013889,28
30
+ 1,initiate Muller glia division post-light damage. My first research project was,0.145425,0.795455,0.598039,0.013889,29
31
+ 1,characterizing the role of leukemia inhibitory factor (LIF) in the activation of cell,0.145425,0.811869,0.638889,0.013889,30
32
+ 1,proliferation in the undamaged zebrafish retina. I am also working on several,0.145425,0.829545,0.604575,0.013889,31
33
+ 1,experiments that are related to a genetic screen that the Hyde lab plans on performing to,0.145425,0.847222,0.691176,0.013889,32
34
+ 1,identify mutants in the regeneration pathway--I am developing a neuroD4:EGFP,0.145425,0.864899,0.635621,0.013889,33
35
+ 1,transgenic line for use in this screen and I am characterizing the extent of damage and,0.145425,0.881313,0.673203,0.013889,34
36
+ 2,"regeneration in sheer zebrafish retinas. Finally, I am characterizing the chx10:EGFP",0.145425,0.093434,0.661765,0.013889,1
37
+ 2,transgenic line during retinal development and regeneration.,0.145425,0.111111,0.472222,0.013889,2
38
+ 2,Please find my CV attached.,0.145425,0.146465,0.222222,0.013889,3
39
+ 2,"Thank you for your time,",0.145425,0.181818,0.196078,0.013889,4
40
+ 2,--Lauren Lilley,0.147059,0.218434,0.119281,0.013889,5
example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_ocr_results_with_words_textract.csv ADDED
@@ -0,0 +1,432 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ page,line,word_text,word_x0,word_y0,word_x1,word_y1,line_text,line_x0,line_y0,line_x1,line_y1
2
+ 1,1,Example,0.147059,0.093434,0.215686,0.107323,,,,,
3
+ 1,1,of,0.220588,0.093434,0.240196,0.104798,,,,,
4
+ 1,1,emails,0.24183,0.093434,0.292484,0.104798,,,,,
5
+ 1,1,sent,0.297386,0.094697,0.330065,0.104798,,,,,
6
+ 1,1,to,0.334967,0.094697,0.349673,0.104798,,,,,
7
+ 1,1,a,0.354575,0.097222,0.362745,0.104798,,,,,
8
+ 1,1,professor,0.367647,0.093434,0.441176,0.108586,,,,,
9
+ 1,1,before,0.446078,0.093434,0.496732,0.104798,,,,,
10
+ 1,1,applying:,0.501634,0.093434,0.573529,0.107323,,,,,
11
+ 1,2,Fwd:,0.145425,0.128788,0.184641,0.140152,,,,,
12
+ 1,2,Prospective,0.191176,0.128788,0.28268,0.142677,,,,,
13
+ 1,2,Graduate,0.287582,0.128788,0.359477,0.140152,,,,,
14
+ 1,2,Student,0.364379,0.128788,0.424837,0.140152,,,,,
15
+ 1,3,Dr.,0.147059,0.162879,0.171569,0.174242,,,,,
16
+ 1,3,"Kornbluth,",0.176471,0.162879,0.261438,0.176768,,,,,
17
+ 1,4,I,0.147059,0.198232,0.153595,0.209596,,,,,
18
+ 1,4,am,0.158497,0.200758,0.181373,0.209596,,,,,
19
+ 1,4,a,0.186275,0.20202,0.194444,0.209596,,,,,
20
+ 1,4,senior,0.199346,0.198232,0.248366,0.209596,,,,,
21
+ 1,4,biology,0.253268,0.198232,0.312092,0.212121,,,,,
22
+ 1,4,major,0.316993,0.198232,0.364379,0.212121,,,,,
23
+ 1,4,at,0.367647,0.199495,0.382353,0.209596,,,,,
24
+ 1,4,the,0.387255,0.198232,0.411765,0.209596,,,,,
25
+ 1,4,University,0.416667,0.198232,0.5,0.212121,,,,,
26
+ 1,4,of,0.504902,0.198232,0.522876,0.209596,,,,,
27
+ 1,4,Notre,0.52451,0.198232,0.570261,0.209596,,,,,
28
+ 1,4,Dame.,0.575163,0.198232,0.625817,0.209596,,,,,
29
+ 1,4,I,0.632353,0.198232,0.637255,0.209596,,,,,
30
+ 1,4,am,0.643791,0.200758,0.666667,0.209596,,,,,
31
+ 1,4,applying,0.671569,0.198232,0.740196,0.212121,,,,,
32
+ 1,4,to,0.745098,0.199495,0.759804,0.209596,,,,,
33
+ 1,4,the,0.764706,0.198232,0.789216,0.209596,,,,,
34
+ 1,4,CMB,0.794118,0.198232,0.836601,0.209596,,,,,
35
+ 1,5,program,0.145425,0.218434,0.212418,0.229798,,,,,
36
+ 1,5,and,0.21732,0.215909,0.245098,0.227273,,,,,
37
+ 1,5,am,0.25,0.218434,0.27451,0.227273,,,,,
38
+ 1,5,very,0.279412,0.218434,0.313725,0.229798,,,,,
39
+ 1,5,interested,0.320261,0.214646,0.395425,0.22601,,,,,
40
+ 1,5,in,0.400327,0.214646,0.416667,0.22601,,,,,
41
+ 1,5,your,0.419935,0.218434,0.457516,0.229798,,,,,
42
+ 1,5,work.,0.460784,0.214646,0.506536,0.227273,,,,,
43
+ 1,5,After,0.511438,0.214646,0.553922,0.227273,,,,,
44
+ 1,5,glancing,0.55719,0.215909,0.625817,0.229798,,,,,
45
+ 1,5,at,0.630719,0.217172,0.645425,0.227273,,,,,
46
+ 1,5,a,0.650327,0.218434,0.658497,0.227273,,,,,
47
+ 1,5,few,0.663399,0.214646,0.69281,0.22601,,,,,
48
+ 1,5,of,0.697712,0.214646,0.715686,0.227273,,,,,
49
+ 1,5,your,0.718954,0.218434,0.754902,0.229798,,,,,
50
+ 1,5,recent,0.759804,0.217172,0.80719,0.22601,,,,,
51
+ 1,6,papers,0.145425,0.236111,0.197712,0.247475,,,,,
52
+ 1,6,and,0.202614,0.232323,0.230392,0.243687,,,,,
53
+ 1,6,your,0.235294,0.236111,0.271242,0.247475,,,,,
54
+ 1,6,research,0.276144,0.232323,0.341503,0.243687,,,,,
55
+ 1,6,summary,0.346405,0.236111,0.419935,0.247475,,,,,
56
+ 1,6,I,0.424837,0.232323,0.431373,0.243687,,,,,
57
+ 1,6,find,0.436275,0.232323,0.46732,0.243687,,,,,
58
+ 1,6,your,0.472222,0.236111,0.50817,0.247475,,,,,
59
+ 1,6,work,0.513072,0.232323,0.553922,0.243687,,,,,
60
+ 1,6,with,0.558824,0.232323,0.593137,0.243687,,,,,
61
+ 1,6,apoptosis,0.598039,0.233586,0.671569,0.247475,,,,,
62
+ 1,6,very,0.678105,0.236111,0.712418,0.247475,,,,,
63
+ 1,6,interesting.,0.71732,0.232323,0.803922,0.247475,,,,,
64
+ 1,6,Will,0.810458,0.232323,0.844771,0.243687,,,,,
65
+ 1,7,you,0.145425,0.253788,0.174837,0.263889,,,,,
66
+ 1,7,be,0.179739,0.25,0.199346,0.261364,,,,,
67
+ 1,7,taking,0.204248,0.25,0.253268,0.265152,,,,,
68
+ 1,7,on,0.25817,0.253788,0.277778,0.261364,,,,,
69
+ 1,7,new,0.28268,0.253788,0.315359,0.261364,,,,,
70
+ 1,7,students,0.320261,0.25,0.383987,0.261364,,,,,
71
+ 1,7,next,0.388889,0.251263,0.423203,0.261364,,,,,
72
+ 1,7,year?,0.428105,0.25,0.470588,0.263889,,,,,
73
+ 1,7,If,0.480392,0.25,0.495098,0.261364,,,,,
74
+ 1,7,I,0.498366,0.25,0.504902,0.261364,,,,,
75
+ 1,7,am,0.509804,0.253788,0.534314,0.261364,,,,,
76
+ 1,7,invited,0.539216,0.25,0.593137,0.261364,,,,,
77
+ 1,7,to,0.598039,0.251263,0.612745,0.261364,,,,,
78
+ 1,7,"interview,",0.617647,0.25,0.696078,0.263889,,,,,
79
+ 1,7,is,0.702614,0.25,0.714052,0.261364,,,,,
80
+ 1,7,there,0.718954,0.25,0.759804,0.261364,,,,,
81
+ 1,7,any,0.763072,0.253788,0.792484,0.263889,,,,,
82
+ 1,7,way,0.797386,0.253788,0.830065,0.263889,,,,,
83
+ 1,8,you,0.145425,0.271465,0.176471,0.281566,,,,,
84
+ 1,8,will,0.179739,0.267677,0.210784,0.27904,,,,,
85
+ 1,8,be,0.215686,0.267677,0.235294,0.27904,,,,,
86
+ 1,8,able,0.238562,0.267677,0.272876,0.27904,,,,,
87
+ 1,8,to,0.276144,0.268939,0.292484,0.27904,,,,,
88
+ 1,8,meet,0.297386,0.268939,0.334967,0.27904,,,,,
89
+ 1,8,with,0.339869,0.267677,0.375817,0.27904,,,,,
90
+ 1,8,me?,0.380719,0.267677,0.411765,0.27904,,,,,
91
+ 1,9,I,0.147059,0.30303,0.151961,0.314394,,,,,
92
+ 1,9,have,0.156863,0.30303,0.194444,0.314394,,,,,
93
+ 1,9,worked,0.199346,0.30303,0.25817,0.314394,,,,,
94
+ 1,9,on,0.263072,0.306818,0.28268,0.314394,,,,,
95
+ 1,9,several,0.287582,0.30303,0.343137,0.314394,,,,,
96
+ 1,9,different,0.348039,0.30303,0.416667,0.314394,,,,,
97
+ 1,9,research,0.419935,0.30303,0.485294,0.314394,,,,,
98
+ 1,9,projects,0.490196,0.30303,0.552288,0.318182,,,,,
99
+ 1,9,as,0.558824,0.306818,0.573529,0.314394,,,,,
100
+ 1,9,an,0.580065,0.306818,0.598039,0.314394,,,,,
101
+ 1,9,undergraduate,0.602941,0.30303,0.714052,0.318182,,,,,
102
+ 1,9,in,0.718954,0.30303,0.735294,0.314394,,,,,
103
+ 1,9,Dr.,0.740196,0.30303,0.764706,0.314394,,,,,
104
+ 1,9,David,0.769608,0.30303,0.816993,0.314394,,,,,
105
+ 1,9,R.,0.823529,0.30303,0.839869,0.314394,,,,,
106
+ 1,10,Hyde's,0.147059,0.320707,0.199346,0.334596,,,,,
107
+ 1,10,lab,0.204248,0.320707,0.228758,0.332071,,,,,
108
+ 1,10,at,0.23366,0.32197,0.248366,0.332071,,,,,
109
+ 1,10,the,0.251634,0.320707,0.276144,0.332071,,,,,
110
+ 1,10,University,0.281046,0.320707,0.364379,0.334596,,,,,
111
+ 1,10,of,0.369281,0.320707,0.387255,0.332071,,,,,
112
+ 1,10,Notre,0.390523,0.320707,0.434641,0.332071,,,,,
113
+ 1,10,Dame.,0.439542,0.320707,0.490196,0.332071,,,,,
114
+ 1,10,The,0.496732,0.320707,0.527778,0.332071,,,,,
115
+ 1,10,Hyde,0.53268,0.320707,0.573529,0.334596,,,,,
116
+ 1,10,lab,0.580065,0.320707,0.602941,0.332071,,,,,
117
+ 1,10,is,0.607843,0.320707,0.620915,0.332071,,,,,
118
+ 1,10,interested,0.625817,0.320707,0.702614,0.332071,,,,,
119
+ 1,10,in,0.707516,0.320707,0.722222,0.332071,,,,,
120
+ 1,10,the,0.727124,0.320707,0.751634,0.332071,,,,,
121
+ 1,10,signals,0.756536,0.320707,0.810458,0.334596,,,,,
122
+ 1,10,that,0.815359,0.320707,0.844771,0.332071,,,,,
123
+ 1,11,initiate,0.147059,0.338384,0.20098,0.349747,,,,,
124
+ 1,11,Muller,0.205882,0.338384,0.259804,0.349747,,,,,
125
+ 1,11,glia,0.264706,0.338384,0.292484,0.352273,,,,,
126
+ 1,11,division,0.297386,0.338384,0.361111,0.349747,,,,,
127
+ 1,11,post-light,0.366013,0.338384,0.44281,0.352273,,,,,
128
+ 1,11,damage.,0.446078,0.338384,0.511438,0.352273,,,,,
129
+ 1,11,My,0.51634,0.338384,0.544118,0.352273,,,,,
130
+ 1,11,first,0.54902,0.338384,0.581699,0.349747,,,,,
131
+ 1,11,research,0.584967,0.338384,0.650327,0.349747,,,,,
132
+ 1,11,project,0.655229,0.338384,0.710784,0.353535,,,,,
133
+ 1,11,was,0.715686,0.340909,0.745098,0.349747,,,,,
134
+ 1,12,characterizing,0.147059,0.354798,0.256536,0.369949,,,,,
135
+ 1,12,the,0.261438,0.356061,0.285948,0.367424,,,,,
136
+ 1,12,role,0.29085,0.356061,0.321895,0.367424,,,,,
137
+ 1,12,of,0.326797,0.356061,0.344771,0.367424,,,,,
138
+ 1,12,leukemia,0.348039,0.356061,0.419935,0.367424,,,,,
139
+ 1,12,inhibitory,0.424837,0.354798,0.501634,0.369949,,,,,
140
+ 1,12,factor,0.506536,0.356061,0.553922,0.367424,,,,,
141
+ 1,12,(LIF),0.55719,0.354798,0.599673,0.369949,,,,,
142
+ 1,12,in,0.604575,0.356061,0.620915,0.367424,,,,,
143
+ 1,12,the,0.624183,0.356061,0.648693,0.366162,,,,,
144
+ 1,12,activation,0.653595,0.356061,0.732026,0.367424,,,,,
145
+ 1,12,of,0.735294,0.354798,0.754902,0.367424,,,,,
146
+ 1,12,cell,0.756536,0.356061,0.785948,0.367424,,,,,
147
+ 1,13,proliferation,0.145425,0.372475,0.243464,0.387626,,,,,
148
+ 1,13,in,0.25,0.373737,0.264706,0.383838,,,,,
149
+ 1,13,the,0.269608,0.373737,0.292484,0.383838,,,,,
150
+ 1,13,undamaged,0.297386,0.372475,0.388889,0.387626,,,,,
151
+ 1,13,zebrafish,0.393791,0.372475,0.465686,0.383838,,,,,
152
+ 1,13,retina.,0.470588,0.373737,0.519608,0.383838,,,,,
153
+ 1,13,I,0.52451,0.373737,0.531046,0.383838,,,,,
154
+ 1,13,am,0.535948,0.376263,0.560458,0.383838,,,,,
155
+ 1,13,also,0.565359,0.372475,0.596405,0.383838,,,,,
156
+ 1,13,working,0.601307,0.372475,0.666667,0.387626,,,,,
157
+ 1,13,on,0.671569,0.376263,0.691176,0.385101,,,,,
158
+ 1,13,several,0.696078,0.373737,0.751634,0.383838,,,,,
159
+ 1,14,experiments,0.145425,0.390152,0.24183,0.405303,,,,,
160
+ 1,14,that,0.246732,0.390152,0.276144,0.401515,,,,,
161
+ 1,14,are,0.281046,0.393939,0.305556,0.401515,,,,,
162
+ 1,14,related,0.308824,0.390152,0.362745,0.401515,,,,,
163
+ 1,14,to,0.367647,0.392677,0.383987,0.401515,,,,,
164
+ 1,14,a,0.388889,0.393939,0.397059,0.401515,,,,,
165
+ 1,14,genetic,0.401961,0.390152,0.45915,0.405303,,,,,
166
+ 1,14,screen,0.464052,0.393939,0.514706,0.401515,,,,,
167
+ 1,14,that,0.517974,0.390152,0.547386,0.401515,,,,,
168
+ 1,14,the,0.552288,0.390152,0.576797,0.401515,,,,,
169
+ 1,14,Hyde,0.581699,0.390152,0.624183,0.405303,,,,,
170
+ 1,14,lab,0.629085,0.390152,0.653595,0.401515,,,,,
171
+ 1,14,plans,0.658497,0.390152,0.699346,0.405303,,,,,
172
+ 1,14,on,0.704248,0.393939,0.723856,0.401515,,,,,
173
+ 1,14,performing,0.728758,0.390152,0.816993,0.405303,,,,,
174
+ 1,14,to,0.821895,0.391414,0.836601,0.401515,,,,,
175
+ 1,15,identify,0.147059,0.407828,0.207516,0.421717,,,,,
176
+ 1,15,mutants,0.212418,0.409091,0.272876,0.419192,,,,,
177
+ 1,15,in,0.279412,0.407828,0.294118,0.419192,,,,,
178
+ 1,15,the,0.29902,0.407828,0.323529,0.419192,,,,,
179
+ 1,15,regeneration,0.328431,0.407828,0.426471,0.42298,,,,,
180
+ 1,15,pathway--I,0.429739,0.407828,0.51634,0.42298,,,,,
181
+ 1,15,am,0.522876,0.411616,0.545752,0.419192,,,,,
182
+ 1,15,developing,0.550654,0.407828,0.638889,0.42298,,,,,
183
+ 1,15,a,0.643791,0.411616,0.651961,0.419192,,,,,
184
+ 1,15,neuroD4:EGFP,0.656863,0.407828,0.78268,0.419192,,,,,
185
+ 1,16,transgenic,0.145425,0.425505,0.227124,0.439394,,,,,
186
+ 1,16,line,0.232026,0.425505,0.261438,0.436869,,,,,
187
+ 1,16,for,0.26634,0.425505,0.289216,0.436869,,,,,
188
+ 1,16,use,0.294118,0.42803,0.320261,0.436869,,,,,
189
+ 1,16,in,0.325163,0.425505,0.339869,0.436869,,,,,
190
+ 1,16,this,0.344771,0.425505,0.372549,0.436869,,,,,
191
+ 1,16,screen,0.377451,0.42803,0.428105,0.436869,,,,,
192
+ 1,16,and,0.433007,0.425505,0.460784,0.436869,,,,,
193
+ 1,16,I,0.46732,0.425505,0.472222,0.436869,,,,,
194
+ 1,16,am,0.477124,0.42803,0.501634,0.436869,,,,,
195
+ 1,16,characterizing,0.506536,0.425505,0.617647,0.439394,,,,,
196
+ 1,16,the,0.622549,0.425505,0.647059,0.436869,,,,,
197
+ 1,16,extent,0.651961,0.426768,0.70098,0.436869,,,,,
198
+ 1,16,of,0.704248,0.425505,0.722222,0.436869,,,,,
199
+ 1,16,damage,0.72549,0.425505,0.787582,0.439394,,,,,
200
+ 1,16,and,0.79085,0.425505,0.820261,0.436869,,,,,
201
+ 1,17,regeneration,0.145425,0.443182,0.243464,0.457071,,,,,
202
+ 1,17,in,0.25,0.443182,0.264706,0.454545,,,,,
203
+ 1,17,sheer,0.267974,0.443182,0.312092,0.454545,,,,,
204
+ 1,17,zebrafish,0.316993,0.443182,0.388889,0.454545,,,,,
205
+ 1,17,retinas.,0.393791,0.443182,0.449346,0.454545,,,,,
206
+ 1,17,"Finally,",0.455882,0.443182,0.51634,0.457071,,,,,
207
+ 1,17,I,0.521242,0.443182,0.527778,0.454545,,,,,
208
+ 1,17,am,0.53268,0.445707,0.55719,0.454545,,,,,
209
+ 1,17,characterizing,0.560458,0.443182,0.671569,0.457071,,,,,
210
+ 1,17,the,0.676471,0.443182,0.70098,0.454545,,,,,
211
+ 1,17,chx10:EGFP,0.705882,0.443182,0.808824,0.454545,,,,,
212
+ 1,18,transgenic,0.145425,0.459596,0.227124,0.474747,,,,,
213
+ 1,18,line,0.232026,0.459596,0.261438,0.47096,,,,,
214
+ 1,18,during,0.26634,0.459596,0.316993,0.474747,,,,,
215
+ 1,18,retinal,0.321895,0.459596,0.372549,0.47096,,,,,
216
+ 1,18,development,0.377451,0.459596,0.478758,0.474747,,,,,
217
+ 1,18,and,0.48366,0.460859,0.511438,0.47096,,,,,
218
+ 1,18,regeneration.,0.51634,0.459596,0.619281,0.474747,,,,,
219
+ 1,19,Please,0.145425,0.496212,0.196078,0.507576,,,,,
220
+ 1,19,find,0.20098,0.496212,0.232026,0.507576,,,,,
221
+ 1,19,my,0.236928,0.5,0.263072,0.510101,,,,,
222
+ 1,19,CV,0.267974,0.496212,0.295752,0.507576,,,,,
223
+ 1,19,attached.,0.29902,0.496212,0.369281,0.507576,,,,,
224
+ 1,20,Thank,0.145425,0.531566,0.196078,0.542929,,,,,
225
+ 1,20,you,0.20098,0.535354,0.230392,0.546717,,,,,
226
+ 1,20,for,0.235294,0.531566,0.25817,0.542929,,,,,
227
+ 1,20,your,0.263072,0.535354,0.29902,0.546717,,,,,
228
+ 1,20,"time,",0.303922,0.531566,0.343137,0.545455,,,,,
229
+ 1,21,--Lauren,0.147059,0.568182,0.215686,0.579545,,,,,
230
+ 1,21,Lilley,0.218954,0.566919,0.26634,0.582071,,,,,
231
+ 1,22,Dr.,0.145425,0.637626,0.171569,0.64899,,,,,
232
+ 1,22,"Poss,",0.176471,0.637626,0.21732,0.651515,,,,,
233
+ 1,23,I,0.145425,0.671717,0.151961,0.683081,,,,,
234
+ 1,23,am,0.158497,0.675505,0.181373,0.684343,,,,,
235
+ 1,23,a,0.186275,0.675505,0.194444,0.684343,,,,,
236
+ 1,23,senior,0.199346,0.671717,0.248366,0.683081,,,,,
237
+ 1,23,biology,0.253268,0.671717,0.312092,0.686869,,,,,
238
+ 1,23,major,0.316993,0.671717,0.364379,0.686869,,,,,
239
+ 1,23,at,0.369281,0.674242,0.382353,0.683081,,,,,
240
+ 1,23,the,0.387255,0.671717,0.411765,0.684343,,,,,
241
+ 1,23,University,0.416667,0.671717,0.498366,0.686869,,,,,
242
+ 1,23,of,0.504902,0.671717,0.522876,0.683081,,,,,
243
+ 1,23,Notre,0.52451,0.671717,0.570261,0.684343,,,,,
244
+ 1,23,Dame.,0.575163,0.671717,0.625817,0.684343,,,,,
245
+ 1,23,I,0.630719,0.671717,0.637255,0.683081,,,,,
246
+ 1,23,am,0.643791,0.675505,0.666667,0.684343,,,,,
247
+ 1,23,applying,0.671569,0.67298,0.740196,0.686869,,,,,
248
+ 1,23,to,0.745098,0.67298,0.759804,0.683081,,,,,
249
+ 1,23,your,0.764706,0.675505,0.802288,0.686869,,,,,
250
+ 1,24,graduate,0.145425,0.689394,0.214052,0.704545,,,,,
251
+ 1,24,program,0.218954,0.693182,0.284314,0.703283,,,,,
252
+ 1,24,and,0.289216,0.689394,0.318627,0.700758,,,,,
253
+ 1,24,am,0.323529,0.693182,0.348039,0.700758,,,,,
254
+ 1,24,very,0.351307,0.693182,0.387255,0.703283,,,,,
255
+ 1,24,interested,0.392157,0.689394,0.46732,0.700758,,,,,
256
+ 1,24,in,0.473856,0.689394,0.488562,0.700758,,,,,
257
+ 1,24,your,0.493464,0.693182,0.529412,0.703283,,,,,
258
+ 1,24,work.,0.534314,0.689394,0.578431,0.700758,,,,,
259
+ 1,24,After,0.583333,0.689394,0.625817,0.700758,,,,,
260
+ 1,24,glancing,0.630719,0.689394,0.697712,0.703283,,,,,
261
+ 1,24,at,0.702614,0.690657,0.71732,0.700758,,,,,
262
+ 1,24,a,0.722222,0.693182,0.730392,0.700758,,,,,
263
+ 1,24,few,0.735294,0.689394,0.764706,0.700758,,,,,
264
+ 1,24,of,0.769608,0.689394,0.787582,0.700758,,,,,
265
+ 1,24,your,0.79085,0.693182,0.826797,0.703283,,,,,
266
+ 1,25,recent,0.145425,0.708333,0.194444,0.718434,,,,,
267
+ 1,25,papers,0.199346,0.710859,0.25,0.72096,,,,,
268
+ 1,25,and,0.254902,0.707071,0.28268,0.718434,,,,,
269
+ 1,25,your,0.287582,0.710859,0.325163,0.72096,,,,,
270
+ 1,25,research,0.328431,0.707071,0.393791,0.718434,,,,,
271
+ 1,25,summary,0.398693,0.709596,0.472222,0.72096,,,,,
272
+ 1,25,I,0.477124,0.707071,0.48366,0.718434,,,,,
273
+ 1,25,find,0.488562,0.707071,0.519608,0.718434,,,,,
274
+ 1,25,your,0.52451,0.710859,0.562092,0.72096,,,,,
275
+ 1,25,research,0.565359,0.707071,0.632353,0.718434,,,,,
276
+ 1,25,greatly,0.637255,0.707071,0.691176,0.72096,,,,,
277
+ 1,25,coincides,0.696078,0.707071,0.769608,0.718434,,,,,
278
+ 1,25,with,0.77451,0.707071,0.810458,0.718434,,,,,
279
+ 1,25,my,0.813725,0.710859,0.839869,0.72096,,,,,
280
+ 1,26,research,0.145425,0.724747,0.210784,0.736111,,,,,
281
+ 1,26,experiences,0.21732,0.724747,0.308824,0.738636,,,,,
282
+ 1,26,and,0.313725,0.723485,0.341503,0.736111,,,,,
283
+ 1,26,interests.,0.346405,0.723485,0.416667,0.736111,,,,,
284
+ 1,26,Will,0.426471,0.723485,0.462418,0.736111,,,,,
285
+ 1,26,you,0.465686,0.727273,0.496732,0.738636,,,,,
286
+ 1,26,be,0.5,0.723485,0.519608,0.736111,,,,,
287
+ 1,26,taking,0.52451,0.724747,0.573529,0.738636,,,,,
288
+ 1,26,on,0.578431,0.727273,0.598039,0.736111,,,,,
289
+ 1,26,new,0.602941,0.727273,0.635621,0.736111,,,,,
290
+ 1,26,students,0.640523,0.724747,0.704248,0.736111,,,,,
291
+ 1,26,next,0.70915,0.72601,0.745098,0.734848,,,,,
292
+ 1,26,year?,0.748366,0.724747,0.79085,0.738636,,,,,
293
+ 1,27,I,0.145425,0.760101,0.151961,0.771465,,,,,
294
+ 1,27,have,0.156863,0.760101,0.194444,0.771465,,,,,
295
+ 1,27,worked,0.199346,0.760101,0.25817,0.771465,,,,,
296
+ 1,27,on,0.263072,0.763889,0.28268,0.771465,,,,,
297
+ 1,27,several,0.287582,0.760101,0.343137,0.771465,,,,,
298
+ 1,27,different,0.348039,0.760101,0.416667,0.771465,,,,,
299
+ 1,27,research,0.419935,0.760101,0.485294,0.771465,,,,,
300
+ 1,27,projects,0.490196,0.760101,0.552288,0.775253,,,,,
301
+ 1,27,as,0.55719,0.763889,0.573529,0.771465,,,,,
302
+ 1,27,an,0.578431,0.763889,0.598039,0.771465,,,,,
303
+ 1,27,undergraduate,0.602941,0.760101,0.714052,0.775253,,,,,
304
+ 1,27,in,0.718954,0.760101,0.735294,0.771465,,,,,
305
+ 1,27,Dr.,0.740196,0.760101,0.764706,0.771465,,,,,
306
+ 1,27,David,0.769608,0.760101,0.818627,0.771465,,,,,
307
+ 1,27,R.,0.823529,0.760101,0.839869,0.771465,,,,,
308
+ 1,28,Hyde's,0.145425,0.777778,0.199346,0.791667,,,,,
309
+ 1,28,lab,0.204248,0.777778,0.228758,0.789141,,,,,
310
+ 1,28,at,0.23366,0.77904,0.248366,0.789141,,,,,
311
+ 1,28,the,0.251634,0.777778,0.276144,0.789141,,,,,
312
+ 1,28,University,0.281046,0.777778,0.364379,0.791667,,,,,
313
+ 1,28,of,0.369281,0.777778,0.387255,0.789141,,,,,
314
+ 1,28,Notre,0.390523,0.777778,0.434641,0.789141,,,,,
315
+ 1,28,Dame.,0.439542,0.777778,0.490196,0.789141,,,,,
316
+ 1,28,The,0.496732,0.777778,0.527778,0.789141,,,,,
317
+ 1,28,Hyde,0.53268,0.777778,0.573529,0.791667,,,,,
318
+ 1,28,lab,0.580065,0.777778,0.602941,0.789141,,,,,
319
+ 1,28,is,0.607843,0.777778,0.620915,0.789141,,,,,
320
+ 1,28,interested,0.625817,0.777778,0.702614,0.789141,,,,,
321
+ 1,28,in,0.707516,0.777778,0.722222,0.789141,,,,,
322
+ 1,28,the,0.727124,0.777778,0.751634,0.789141,,,,,
323
+ 1,28,signals,0.756536,0.777778,0.810458,0.791667,,,,,
324
+ 1,28,that,0.815359,0.777778,0.846405,0.789141,,,,,
325
+ 1,29,initiate,0.145425,0.795455,0.20098,0.806818,,,,,
326
+ 1,29,Muller,0.205882,0.795455,0.259804,0.806818,,,,,
327
+ 1,29,glia,0.264706,0.795455,0.292484,0.809343,,,,,
328
+ 1,29,division,0.297386,0.795455,0.361111,0.806818,,,,,
329
+ 1,29,post-light,0.366013,0.795455,0.44281,0.809343,,,,,
330
+ 1,29,damage.,0.446078,0.795455,0.511438,0.809343,,,,,
331
+ 1,29,My,0.51634,0.795455,0.544118,0.809343,,,,,
332
+ 1,29,first,0.54902,0.795455,0.581699,0.806818,,,,,
333
+ 1,29,research,0.584967,0.795455,0.651961,0.806818,,,,,
334
+ 1,29,project,0.655229,0.795455,0.710784,0.809343,,,,,
335
+ 1,29,was,0.715686,0.799242,0.745098,0.806818,,,,,
336
+ 1,30,characterizing,0.145425,0.811869,0.25817,0.82702,,,,,
337
+ 1,30,the,0.261438,0.811869,0.285948,0.823232,,,,,
338
+ 1,30,role,0.29085,0.813131,0.321895,0.823232,,,,,
339
+ 1,30,of,0.326797,0.811869,0.344771,0.824495,,,,,
340
+ 1,30,leukemia,0.348039,0.811869,0.419935,0.823232,,,,,
341
+ 1,30,inhibitory,0.424837,0.811869,0.501634,0.82702,,,,,
342
+ 1,30,factor,0.506536,0.811869,0.553922,0.823232,,,,,
343
+ 1,30,(LIF),0.55719,0.813131,0.599673,0.82702,,,,,
344
+ 1,30,in,0.604575,0.811869,0.620915,0.824495,,,,,
345
+ 1,30,the,0.624183,0.811869,0.648693,0.824495,,,,,
346
+ 1,30,activation,0.653595,0.813131,0.732026,0.824495,,,,,
347
+ 1,30,of,0.735294,0.811869,0.754902,0.824495,,,,,
348
+ 1,30,cell,0.756536,0.811869,0.785948,0.824495,,,,,
349
+ 1,31,proliferation,0.145425,0.829545,0.245098,0.844697,,,,,
350
+ 1,31,in,0.25,0.829545,0.264706,0.840909,,,,,
351
+ 1,31,the,0.267974,0.829545,0.292484,0.840909,,,,,
352
+ 1,31,undamaged,0.297386,0.830808,0.388889,0.844697,,,,,
353
+ 1,31,zebrafish,0.393791,0.829545,0.465686,0.842172,,,,,
354
+ 1,31,retina.,0.470588,0.830808,0.519608,0.842172,,,,,
355
+ 1,31,I,0.52451,0.830808,0.531046,0.840909,,,,,
356
+ 1,31,am,0.535948,0.833333,0.560458,0.842172,,,,,
357
+ 1,31,also,0.565359,0.829545,0.596405,0.840909,,,,,
358
+ 1,31,working,0.601307,0.830808,0.666667,0.844697,,,,,
359
+ 1,31,on,0.671569,0.833333,0.691176,0.840909,,,,,
360
+ 1,31,several,0.696078,0.829545,0.751634,0.840909,,,,,
361
+ 1,32,experiments,0.145425,0.847222,0.24183,0.862374,,,,,
362
+ 1,32,that,0.246732,0.847222,0.276144,0.858586,,,,,
363
+ 1,32,are,0.281046,0.85101,0.305556,0.858586,,,,,
364
+ 1,32,related,0.308824,0.847222,0.362745,0.858586,,,,,
365
+ 1,32,to,0.367647,0.848485,0.383987,0.858586,,,,,
366
+ 1,32,a,0.388889,0.85101,0.397059,0.858586,,,,,
367
+ 1,32,genetic,0.401961,0.847222,0.45915,0.861111,,,,,
368
+ 1,32,screen,0.464052,0.85101,0.514706,0.858586,,,,,
369
+ 1,32,that,0.517974,0.847222,0.54902,0.858586,,,,,
370
+ 1,32,the,0.552288,0.847222,0.576797,0.858586,,,,,
371
+ 1,32,Hyde,0.581699,0.847222,0.624183,0.861111,,,,,
372
+ 1,32,lab,0.629085,0.847222,0.653595,0.858586,,,,,
373
+ 1,32,plans,0.656863,0.847222,0.699346,0.861111,,,,,
374
+ 1,32,on,0.704248,0.85101,0.723856,0.858586,,,,,
375
+ 1,32,performing,0.728758,0.847222,0.816993,0.862374,,,,,
376
+ 1,32,to,0.821895,0.848485,0.836601,0.858586,,,,,
377
+ 1,33,identify,0.145425,0.864899,0.207516,0.878788,,,,,
378
+ 1,33,mutants,0.212418,0.866162,0.272876,0.876263,,,,,
379
+ 1,33,in,0.279412,0.864899,0.294118,0.876263,,,,,
380
+ 1,33,the,0.29902,0.864899,0.323529,0.876263,,,,,
381
+ 1,33,regeneration,0.328431,0.864899,0.426471,0.878788,,,,,
382
+ 1,33,pathway--I,0.431373,0.864899,0.51634,0.878788,,,,,
383
+ 1,33,am,0.522876,0.868687,0.545752,0.876263,,,,,
384
+ 1,33,developing,0.550654,0.864899,0.638889,0.878788,,,,,
385
+ 1,33,a,0.643791,0.868687,0.651961,0.876263,,,,,
386
+ 1,33,neuroD4:EGFP,0.655229,0.864899,0.78268,0.876263,,,,,
387
+ 1,34,transgenic,0.145425,0.882576,0.227124,0.896465,,,,,
388
+ 1,34,line,0.232026,0.882576,0.261438,0.893939,,,,,
389
+ 1,34,for,0.26634,0.881313,0.289216,0.893939,,,,,
390
+ 1,34,use,0.294118,0.885101,0.320261,0.893939,,,,,
391
+ 1,34,in,0.325163,0.882576,0.339869,0.893939,,,,,
392
+ 1,34,this,0.344771,0.882576,0.372549,0.893939,,,,,
393
+ 1,34,screen,0.379085,0.885101,0.428105,0.893939,,,,,
394
+ 1,34,and,0.433007,0.882576,0.460784,0.893939,,,,,
395
+ 1,34,I,0.46732,0.882576,0.472222,0.893939,,,,,
396
+ 1,34,am,0.478758,0.885101,0.501634,0.893939,,,,,
397
+ 1,34,characterizing,0.506536,0.882576,0.617647,0.896465,,,,,
398
+ 1,34,the,0.622549,0.882576,0.647059,0.893939,,,,,
399
+ 1,34,extent,0.651961,0.883838,0.699346,0.892677,,,,,
400
+ 1,34,of,0.704248,0.882576,0.722222,0.893939,,,,,
401
+ 1,34,damage,0.72549,0.882576,0.785948,0.896465,,,,,
402
+ 1,34,and,0.79085,0.882576,0.820261,0.893939,,,,,
403
+ 2,1,regeneration,0.145425,0.093434,0.243464,0.107323,,,,,
404
+ 2,1,in,0.248366,0.093434,0.264706,0.104798,,,,,
405
+ 2,1,sheer,0.267974,0.093434,0.312092,0.104798,,,,,
406
+ 2,1,zebrafish,0.316993,0.093434,0.387255,0.104798,,,,,
407
+ 2,1,retinas.,0.392157,0.093434,0.449346,0.104798,,,,,
408
+ 2,1,"Finally,",0.455882,0.093434,0.514706,0.107323,,,,,
409
+ 2,1,I,0.521242,0.093434,0.527778,0.104798,,,,,
410
+ 2,1,am,0.53268,0.097222,0.555556,0.104798,,,,,
411
+ 2,1,characterizing,0.560458,0.093434,0.671569,0.107323,,,,,
412
+ 2,1,the,0.676471,0.093434,0.70098,0.104798,,,,,
413
+ 2,1,chx10:EGFP,0.705882,0.093434,0.808824,0.104798,,,,,
414
+ 2,2,transgenic,0.145425,0.111111,0.227124,0.125,,,,,
415
+ 2,2,line,0.232026,0.111111,0.261438,0.122475,,,,,
416
+ 2,2,during,0.26634,0.111111,0.316993,0.125,,,,,
417
+ 2,2,retinal,0.321895,0.111111,0.372549,0.122475,,,,,
418
+ 2,2,development,0.377451,0.111111,0.478758,0.125,,,,,
419
+ 2,2,and,0.48366,0.111111,0.511438,0.122475,,,,,
420
+ 2,2,regeneration.,0.51634,0.111111,0.617647,0.125,,,,,
421
+ 2,3,Please,0.145425,0.146465,0.196078,0.157828,,,,,
422
+ 2,3,find,0.20098,0.146465,0.232026,0.157828,,,,,
423
+ 2,3,my,0.236928,0.150253,0.263072,0.160354,,,,,
424
+ 2,3,CV,0.267974,0.146465,0.295752,0.157828,,,,,
425
+ 2,3,attached.,0.29902,0.146465,0.369281,0.157828,,,,,
426
+ 2,4,Thank,0.145425,0.183081,0.196078,0.193182,,,,,
427
+ 2,4,you,0.20098,0.185606,0.230392,0.19697,,,,,
428
+ 2,4,for,0.235294,0.181818,0.25817,0.193182,,,,,
429
+ 2,4,your,0.263072,0.185606,0.29902,0.19697,,,,,
430
+ 2,4,"time,",0.303922,0.181818,0.343137,0.195707,,,,,
431
+ 2,5,--Lauren,0.147059,0.218434,0.215686,0.229798,,,,,
432
+ 2,5,Lilley,0.218954,0.218434,0.26634,0.232323,,,,,
example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_review_file.csv ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ image,page,label,color,xmin,ymin,xmax,ymax,id,text
2
+ placeholder_image_0.png,1,TITLES,"(0, 0, 0)",0.147059,0.162879,0.171569,0.174242,oJIosRHGyCRn,Dr
3
+ placeholder_image_0.png,1,TITLES - NAME,"(0, 0, 0)",0.147059,0.162879,0.261438,0.176768,5C5tA6mfeL7T,Dr Kornbluth
4
+ placeholder_image_0.png,1,NAME,"(0, 0, 0)",0.176471,0.162879,0.261438,0.176768,UoYN48bc2ry5,Kornbluth
5
+ placeholder_image_0.png,1,TITLES,"(0, 0, 0)",0.740196,0.30303,0.764706,0.314394,cAsjVETPEisV,Dr
6
+ placeholder_image_0.png,1,TITLES - NAME,"(0, 0, 0)",0.740196,0.30303,0.839869,0.314394,yQ5HKn4tfT7L,Dr David R.
7
+ placeholder_image_0.png,1,NAME,"(0, 0, 0)",0.769608,0.30303,0.839869,0.314394,LR8phiOYnLWi,David R.
8
+ placeholder_image_0.png,1,NAME,"(0, 0, 0)",0.218954,0.566919,0.26634,0.582071,X8iObIauqZ9k,Lauren Lilley
9
+ placeholder_image_0.png,1,TITLES,"(0, 0, 0)",0.145425,0.637626,0.171569,0.64899,SvWjK2F7R3un,Dr
10
+ placeholder_image_0.png,1,TITLES - NAME,"(0, 0, 0)",0.145425,0.637626,0.21732,0.651515,zKJFVAOszwdM,Dr Poss
11
+ placeholder_image_0.png,1,NAME,"(0, 0, 0)",0.176471,0.637626,0.21732,0.651515,Iqda7ixkzcmg,Poss
12
+ placeholder_image_0.png,1,TITLES,"(0, 0, 0)",0.740196,0.760101,0.764706,0.771465,TWQD93bGI3B3,Dr
13
+ placeholder_image_0.png,1,TITLES - NAME,"(0, 0, 0)",0.740196,0.760101,0.839869,0.771465,vQuQQwqWjSES,Dr David R.
14
+ placeholder_image_0.png,1,NAME,"(0, 0, 0)",0.769608,0.760101,0.839869,0.771465,f8xf6ORJUSnG,David R.
15
+ placeholder_image_1.png,2,NAME,"(0, 0, 0)",0.218954,0.218434,0.26634,0.232323,N0nje9UiCzZK,Lauren Lilley
example_data/graduate-job-example-cover-letter.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71cc851d41f80dd8b045af32657b76bf85dd8f72d39ae08fa43dc7a78256fe35
3
+ size 77045
example_data/partnership_toolkit_redact_custom_deny_list.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Friendship City
2
+ United States
example_data/partnership_toolkit_redact_some_pages.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ 2
2
+ 5
example_data/test_allow_list_graduate.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ Wilson
example_data/test_allow_list_partnership.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ akaplan@sister-cities.org