# Source-viewer metadata (not part of the configuration): file size 4,704 bytes, revision 546be9b
# Build backend declaration (PEP 517/518).
# setuptools 61.0 is the minimum because that release introduced support for
# the [project] metadata table. "wheel" is intentionally not listed: the
# setuptools backend requests it itself when a wheel build needs it.
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"
# Core package metadata (PEP 621).
[project]
name = "doc_redaction"
version = "1.6.2"
description = "Redact PDF/image-based documents, Word, or CSV/XLSX files using a Gradio-based GUI interface"
readme = "README.md"
requires-python = ">=3.10"
# This licence type is required in order to use PyMuPDF.
license = { text = "AGPL-3.0-only" }
authors = [
    { name = "Sean Pedrick-Case", email = "spedrickcase@lambeth.gov.uk" },
]
maintainers = [
    { name = "Sean Pedrick-Case", email = "spedrickcase@lambeth.gov.uk" },
]
keywords = [
    "redaction",
    "pdf",
    "nlp",
    "documents",
    "document-processing",
    "gradio",
    "pii",
    "pii-detection",
]
classifiers = [
    "Development Status :: 5 - Production/Stable",
    "Intended Audience :: Developers",
    "Intended Audience :: Legal Industry",
    "Topic :: Text Processing :: General",
    "Topic :: Security :: Cryptography",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
]
# Runtime requirements. Versioned entries are exact `==` pins; the two
# `name @ URL` entries are direct references to specific release artifacts
# (the spaCy English model and the gradio_image_annotation wheel).
dependencies = [
    "pdfminer.six==20251107",
    "pdf2image==1.17.0",
    "pymupdf==1.26.6",
    "bleach==6.3.0",
    "opencv-python==4.12.0.88",
    "presidio_analyzer==2.2.360",
    "presidio_anonymizer==2.2.360",
    "presidio-image-redactor==0.0.57",
    "pikepdf==9.11.0",
    "pandas==2.3.3",
    "scikit-learn==1.7.2",
    "spacy==3.8.8",
    "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
    "gradio==5.49.1",
    "boto3==1.40.72",
    "pyarrow==21.0.0",
    "openpyxl==3.1.5",
    "Faker==37.8.0",
    "python-levenshtein==0.27.1",
    "spaczz==0.6.1",
    "gradio_image_annotation @ https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl",
    "rapidfuzz==3.14.1",
    "python-dotenv==1.0.1",
    "awslambdaric==3.1.1",
    "python-docx==1.2.0",
    "polars==1.35.2",
    "defusedxml==0.7.1",
    "numpy==2.2.6",
    "spaces==0.42.1",
]
# Optional extras. Install one or more from the repository root inside the
# target Python environment, e.g.
#   pip install ".[paddle,vlm]"
#   uv pip install ".[paddle,vlm]"
# See the notes on each extra below regarding GPU builds of Paddle and Torch.
[project.optional-dependencies]
# Testing tools
dev = ["pytest"]
test = ["pytest", "pytest-cov"]

# PaddleOCR support. This installs the CPU build of PaddlePaddle. For the
# GPU-accelerated build, install it manually instead (quote the requirement so
# the shell does not treat `<` as a redirect):
#   pip install "paddlepaddle-gpu<=3.2.1" --index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/
paddle = [
    "paddlepaddle>=3.0.0,<=3.2.1",
    "paddleocr==3.3.0",
    "pycocotools==2.0.10",
]

# VLM model support. This installs the CPU-compatible version of PyTorch. For
# CUDA support, reinstall the Torch packages manually after installation from
# the CUDA wheel index, e.g.
#   pip install torch torchvision --index-url https://download.pytorch.org/whl/cu129
vlm = [
    "torch>=2.5.1,<=2.8.0",
    "torchvision>=0.20.1",
    "transformers==4.57.2",
    "accelerate==1.11.0",
    "bitsandbytes==0.48.2",
    "sentencepiece==0.2.1", # Needed for PaddleOCRVL
]

# Run Gradio as an MCP server. The pin matches the base `gradio` requirement.
mcp = [
    "gradio[mcp]==5.49.1",
]
# Project links published with the package metadata.
[project.urls]
Homepage = "https://seanpedrick-case.github.io/doc_redaction/"
Repository = "https://github.com/seanpedrick-case/doc_redaction"
# Console entry points: installs a `cli_redact` command that calls `main` in
# the `cli_redact` module.
# NOTE(review): assumes `cli_redact` is an importable top-level module that is
# included in the built distribution - confirm against the package layout.
[project.scripts]
cli_redact = "cli_redact:main"
# Configuration for the Ruff linter.
# The line length is the same value used in [tool.black].
[tool.ruff]
line-length = 88

[tool.ruff.lint]
# Enabled rule families: E (pycodestyle errors), F (Pyflakes), I (isort).
select = [
    "E",
    "F",
    "I",
]
ignore = [
    # line-too-long (handled with Black)
    "E501",
    # module-import-not-at-top-of-file (sometimes needed for conditional imports)
    "E402",
]

[tool.ruff.lint.per-file-ignores]
# Allow unused imports in __init__.py
"__init__.py" = ["F401"]
# Configuration for the Black formatter.
# py310 corresponds to the lowest Python version allowed by `requires-python`.
[tool.black]
line-length = 88
target-version = [
    "py310",
]
# Configuration for pytest.
[tool.pytest.ini_options]
# Each filter is `action:message:category:module`; an empty field matches
# anything. The entries below silence DeprecationWarnings raised from the
# named third-party modules.
filterwarnings = [
    "ignore::DeprecationWarning:click.parser",
    "ignore::DeprecationWarning:weasel.util.config",
    # "builtin type" is the start of a warning message, not a module name, so
    # it belongs in the message field (presumably targeting messages such as
    # "builtin type ... has no __module__ attribute" - confirm against test output).
    "ignore:builtin type:DeprecationWarning",
    "ignore::DeprecationWarning:websockets.legacy",
    "ignore::DeprecationWarning:websockets.server",
    "ignore::DeprecationWarning:spacy.cli._util",
    "ignore::DeprecationWarning:importlib._bootstrap",
]
# Test discovery: look in `test/` for files, classes and functions that follow
# these naming patterns.
testpaths = ["test"]
python_files = ["test_*.py", "*_test.py"]
python_classes = ["Test*"]
python_functions = ["test_*"]
# Default command-line options applied to every run. `--strict-markers` makes
# unregistered markers an error; `--disable-warnings` hides the warnings summary.
addopts = [
    "-v",
    "--tb=short",
    "--strict-markers",
    "--disable-warnings",
]