Spaces:
Sleeping
Sleeping
Update pipelines/anonymize.py
Browse files
- pipelines/anonymize.py +17 -0
pipelines/anonymize.py
CHANGED
|
@@ -4,12 +4,21 @@ from reportlab.lib.pagesizes import A4
|
|
| 4 |
from reportlab.pdfgen import canvas
|
| 5 |
from reportlab.lib.units import mm
|
| 6 |
|
|
|
|
| 7 |
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
|
| 8 |
PHONE_RE = re.compile(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}")
|
|
|
|
|
|
|
| 9 |
NAME_HINT = re.compile(r"^(氏名|Name)[::]?\s*(.+)$", re.MULTILINE)
|
| 10 |
|
|
|
|
| 11 |
def anonymize_text(text: str) -> Tuple[str, Dict[str, str]]:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
mapping: Dict[str, str] = {}
|
|
|
|
| 13 |
def _mask_all(pattern: re.Pattern, label: str, s: str) -> str:
|
| 14 |
idx = 1
|
| 15 |
def _repl(m):
|
|
@@ -25,16 +34,22 @@ def anonymize_text(text: str) -> Tuple[str, Dict[str, str]]:
|
|
| 25 |
out = _mask_all(EMAIL_RE, "EMAIL", out)
|
| 26 |
out = _mask_all(PHONE_RE, "TEL", out)
|
| 27 |
|
|
|
|
| 28 |
for m in NAME_HINT.finditer(text):
|
| 29 |
full = m.group(0)
|
| 30 |
name = m.group(2).strip()
|
| 31 |
if name and name not in mapping:
|
| 32 |
mapping[name] = "<NAME>"
|
|
|
|
| 33 |
out = out.replace(full, f"{m.group(1)}: <NAME>")
|
| 34 |
|
| 35 |
return out, mapping
|
| 36 |
|
|
|
|
| 37 |
def render_anonymized_pdf(text: str) -> bytes:
|
|
|
|
|
|
|
|
|
|
| 38 |
from io import BytesIO
|
| 39 |
buf = BytesIO()
|
| 40 |
c = canvas.Canvas(buf, pagesize=A4)
|
|
@@ -46,6 +61,8 @@ def render_anonymized_pdf(text: str) -> bytes:
|
|
| 46 |
if y < 20 * mm:
|
| 47 |
c.showPage()
|
| 48 |
y = height - 20 * mm
|
|
|
|
|
|
|
| 49 |
c.drawString(x, y, line[:2000])
|
| 50 |
y -= line_h
|
| 51 |
c.save()
|
|
|
|
| 4 |
from reportlab.pdfgen import canvas
|
| 5 |
from reportlab.lib.units import mm
|
| 6 |
|
| 7 |
+
# Simple heuristic detection of contact details (e-mail addresses and phone numbers).
|
| 8 |
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
|
| 9 |
PHONE_RE = re.compile(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}")
|
| 10 |
+
|
| 11 |
+
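# Line-level hint such as "氏名: 山田太郎" or "Name: ..." (free-form text is assumed); everything after the label on that line is captured as the name.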
|
| 12 |
NAME_HINT = re.compile(r"^(氏名|Name)[::]?\s*(.+)$", re.MULTILINE)
|
| 13 |
|
| 14 |
+
|
| 15 |
def anonymize_text(text: str) -> Tuple[str, Dict[str, str]]:
|
| 16 |
+
"""
|
| 17 |
+
Replace e-mail addresses, phone numbers, and names (detected via hint lines) found in the text.
|
| 18 |
+
The replacement mapping is returned as well.
|
| 19 |
+
"""
|
| 20 |
mapping: Dict[str, str] = {}
|
| 21 |
+
|
| 22 |
def _mask_all(pattern: re.Pattern, label: str, s: str) -> str:
|
| 23 |
idx = 1
|
| 24 |
def _repl(m):
|
|
|
|
| 34 |
out = _mask_all(EMAIL_RE, "EMAIL", out)
|
| 35 |
out = _mask_all(PHONE_RE, "TEL", out)
|
| 36 |
|
| 37 |
+
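# Name hint lines. NOTE(review): matches are taken from the original `text`, while the replacement below runs on the already-masked `out`; a hint line that also contained an e-mail address or phone number would presumably no longer appear verbatim in `out` — confirm.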
|
| 38 |
for m in NAME_HINT.finditer(text):
|
| 39 |
full = m.group(0)
|
| 40 |
name = m.group(2).strip()
|
| 41 |
if name and name not in mapping:
|
| 42 |
mapping[name] = "<NAME>"
|
| 43 |
+
# Replace the hint line itself as well, keeping the label and masking the name.
|
| 44 |
out = out.replace(full, f"{m.group(1)}: <NAME>")
|
| 45 |
|
| 46 |
return out, mapping
|
| 47 |
|
| 48 |
+
|
| 49 |
def render_anonymized_pdf(text: str) -> bytes:
|
| 50 |
+
"""
|
| 51 |
+
Render the anonymized text into a simple PDF and return it as bytes.
|
| 52 |
+
"""
|
| 53 |
from io import BytesIO
|
| 54 |
buf = BytesIO()
|
| 55 |
c = canvas.Canvas(buf, pagesize=A4)
|
|
|
|
| 61 |
if y < 20 * mm:
|
| 62 |
c.showPage()
|
| 63 |
y = height - 20 * mm
|
| 64 |
+
# ReportLab renders ASCII text even when no Japanese font is configured.
|
| 65 |
+
# TODO: register a custom (CJK-capable) font before drawing text that contains Japanese.
|
| 66 |
c.drawString(x, y, line[:2000])
|
| 67 |
y -= line_h
|
| 68 |
c.save()
|