# Packaging and tooling configuration for doc_redaction (snapshot of commit d864d45).

# PEP 517 build configuration. `wheel` is deliberately not listed in
# `requires`: the setuptools backend requests it by itself when a wheel
# build needs it.
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "doc_redaction"
version = "1.6.2"
description = "Redact PDF/image-based documents, Word, or CSV/XLSX files using a Gradio-based GUI interface"
readme = "README.md"
requires-python = ">=3.10"
# This licence type is required in order to use PyMuPDF.
license = { text = "AGPL-3.0-only" }
authors = [{ name = "Sean Pedrick-Case", email = "spedrickcase@lambeth.gov.uk" }]
maintainers = [{ name = "Sean Pedrick-Case", email = "spedrickcase@lambeth.gov.uk" }]
keywords = [
    "redaction",
    "pdf",
    "nlp",
    "documents",
    "document-processing",
    "gradio",
    "pii",
    "pii-detection",
]
# Trove classifiers; the listed Python versions start at the
# `requires-python` lower bound declared above.
classifiers = [
    "Development Status :: 5 - Production/Stable",
    "Intended Audience :: Developers",
    "Intended Audience :: Legal Industry",
    "Topic :: Text Processing :: General",
    "Topic :: Security :: Cryptography",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
]
# Runtime requirements (PEP 508 strings), sorted by name.
dependencies = [
    # Pinned to exact versions.
    "awslambdaric==3.1.1",
    "bleach==6.3.0",
    "boto3==1.40.72",
    "defusedxml==0.7.1",
    "Faker==37.8.0",
    "gradio==5.49.1",
    "numpy==2.2.6",
    "opencv-python==4.12.0.88",
    "openpyxl==3.1.5",
    "pandas==2.3.3",
    "pdf2image==1.17.0",
    "pdfminer.six==20251107",
    "pikepdf==9.11.0",
    "polars==1.35.2",
    "presidio_analyzer==2.2.360",
    "presidio_anonymizer==2.2.360",
    "presidio-image-redactor==0.0.57",
    "pyarrow==21.0.0",
    "pymupdf==1.26.6",
    "python-docx==1.2.0",
    "python-dotenv==1.0.1",
    "python-levenshtein==0.27.1",
    "rapidfuzz==3.14.1",
    "scikit-learn==1.7.2",
    "spaces==0.42.1",
    "spacy==3.8.8",
    "spaczz==0.6.1",
    # Direct-URL requirements (`name @ url`), fetched from GitHub release assets.
    "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
    "gradio_image_annotation @ https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl",
]

[project.optional-dependencies]
# For testing
dev = ["pytest"]
test = ["pytest", "pytest-cov"]

# To install the app with PaddleOCR and VLM support, run one of the following
# from the base folder, in the correct Python environment:
#   pip install ".[paddle,vlm]"
#   uv pip install ".[paddle,vlm]"
# The quotes stop the shell from interpreting the square brackets.
# See the notes on each extra below for GPU-enabled installs.

# Extra dependencies for PaddleOCR.
# This installs the CPU version of PaddlePaddle. For the GPU-accelerated
# version, run manually (quoted so the shell does not treat `<=` as a redirect):
#   pip install "paddlepaddle-gpu<=3.2.1" --index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/
paddle = [
    "paddlepaddle>=3.0.0,<=3.2.1",
    "paddleocr==3.3.0",
    "pycocotools==2.0.10",
]

# Extra dependencies for VLM models.
# This installs the CPU-compatible version of PyTorch. For CUDA support, run
# manually after installation (add --force-reinstall if pip reports the
# requirement as already satisfied):
#   pip install "torch>=2.5.1,<=2.8.0" "torchvision>=0.20.1" --index-url https://download.pytorch.org/whl/cu129
vlm = [
    "torch>=2.5.1,<=2.8.0",
    "torchvision>=0.20.1",
    "transformers==4.57.2",
    "accelerate==1.11.0",
    "bitsandbytes==0.48.2",
    "sentencepiece==0.2.1", # Needed for PaddleOCRVL
]

# Run Gradio as an MCP server. The version pin matches the `gradio` pin in
# the core dependencies.
mcp = [
    "gradio[mcp]==5.49.1",
]

# Project links published with the package metadata.
[project.urls]
Repository = "https://github.com/seanpedrick-case/doc_redaction"
Homepage = "https://seanpedrick-case.github.io/doc_redaction/"

# Console-script entry point in `module:function` form: installing the package
# creates a `cli_redact` command that calls `main` from the `cli_redact` module.
[project.scripts]
cli_redact = "cli_redact:main"

# Ruff linter settings.
[tool.ruff]
line-length = 88

[tool.ruff.lint]
# Enabled rule families, selected by prefix.
select = ["E", "F", "I"]
ignore = [
    # module-import-not-at-top-of-file (sometimes needed for conditional imports)
    "E402",
    # line-too-long (handled with Black)
    "E501",
]

[tool.ruff.lint.per-file-ignores]
# Allow unused imports in __init__.py
"__init__.py" = ["F401"]

# Black formatter settings; the line length is the same value Ruff is given.
[tool.black]
target-version = ["py310"]
line-length = 88

# Configuration for pytest:
[tool.pytest.ini_options]
# Test discovery: where to look and which names count as tests.
testpaths = ["test"]
python_files = ["test_*.py", "*_test.py"]
python_classes = ["Test*"]
python_functions = ["test_*"]
addopts = [
    "-v",
    "--tb=short",
    "--strict-markers",
    "--disable-warnings",
]
# Filter syntax is `action:message:category:module`. An empty message field
# matches any text, and the module field is matched against the module that
# raised the warning.
filterwarnings = [
    "ignore::DeprecationWarning:click.parser",
    "ignore::DeprecationWarning:weasel.util.config",
    # "builtin type" is the start of a warning message, not a module name, so
    # it goes in the message field. In the module field it matched nothing.
    # NOTE(review): presumably targets the SWIG "builtin type ... has no
    # __module__ attribute" warning; confirm against the test output.
    "ignore:builtin type:DeprecationWarning",
    "ignore::DeprecationWarning:websockets.legacy",
    "ignore::DeprecationWarning:websockets.server",
    "ignore::DeprecationWarning:spacy.cli._util",
    "ignore::DeprecationWarning:importlib._bootstrap",
]