# Source: pipelines/anonymize.py (uploaded by Corin1998)
# Commit: "Update pipelines/anonymize.py"
# Revision: bb3db47 (verified)
import re
from typing import Tuple, Dict
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import A4
from reportlab.pdfbase.pdfmetrics import stringWidth
from io import BytesIO
# Email addresses: a local part, "@", a dotted domain and an alphabetic
# top-level label of at least two letters.
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")

# Phone-like digit runs: an optional "+"/country prefix, an optional
# parenthesised area code, then three digit groups optionally separated by a
# space or hyphen.
# NOTE(review): the pattern has no word boundaries, so any run of 7+ digits
# (IDs, dates written without separators, ...) also matches.
PHONE_RE = re.compile(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}")

# Name hint: the label "氏名" followed by an optional colon, optional
# whitespace, and the name itself (the next run of non-space characters).
# Group 1 is the label part that is kept, group 2 is the name to redact.
# Both the ASCII colon and the full-width colon (U+FF1A) are accepted; the
# latter is written as an escape so it survives encoding normalisation.
# Without it, "氏名:山田" would capture the colon as part of the name.
NAME_HINT_RE = re.compile(r"(氏名[:\uFF1A]?\s*)(\S+)", re.IGNORECASE)
def anonymize_text(text: str) -> Tuple[str, Dict[str, str]]:
    """Mask e-mail addresses, phone numbers and one labelled name in *text*.

    Ultra-lightweight anonymisation: every match of ``EMAIL_RE`` and then of
    ``PHONE_RE`` is replaced by ``[REDACTED_EMAIL]`` / ``[REDACTED_PHONE]``,
    and the first name following a ``NAME_HINT_RE`` label is replaced by
    ``[REDACTED_NAME]`` (the label itself is kept).

    Returns:
        A ``(redacted_text, mapping)`` pair, where ``mapping`` maps each
        original substring that was masked to the placeholder that replaced it.
    """
    replacements: Dict[str, str] = {}

    def _mask_all(regex, label, source):
        # Replace every match of ``regex`` and remember what was replaced.
        placeholder = f"[REDACTED_{label}]"

        def _record(match):
            replacements[match.group(0)] = placeholder
            return placeholder

        return regex.sub(_record, source)

    # E-mails are masked before phone numbers, so digits inside an address
    # are already gone by the time the phone pattern runs.
    redacted = text
    for regex, label in ((EMAIL_RE, "EMAIL"), (PHONE_RE, "PHONE")):
        redacted = _mask_all(regex, label, redacted)

    # Name hint (e.g. "氏名: 山田太郎"): keep the label, mask only the name.
    def _mask_name(match):
        label_part, name = match.group(1), match.group(2)
        replacements[name] = "[REDACTED_NAME]"
        return f"{label_part}[REDACTED_NAME]"

    # Only the first labelled name in the text is masked.
    redacted = NAME_HINT_RE.sub(_mask_name, redacted, count=1)
    return redacted, replacements
def render_anonymized_pdf(text: str, *, font_name: str = "Helvetica") -> bytes:
    """Render plain text onto A4 pages with ReportLab and return the PDF bytes.

    Minimal renderer that adds no dependency beyond ReportLab. Each input
    line is wrapped at spaces to fit between 40pt margins; a token that is
    wider than the text area on its own is split character by character, so
    text without spaces still wraps instead of running past the right margin.
    Runs of spaces inside a line collapse to a single space. Empty and
    whitespace-only lines advance the cursor by one line.

    Args:
        text: The text to render; it is split with ``str.splitlines()``.
        font_name: Font passed to ``stringWidth`` and ``Canvas.setFont``.
            NOTE(review): presumably this must be a standard font or one the
            caller has already registered with ReportLab, and the default
            Helvetica likely has no CJK glyphs, so Japanese text would need a
            registered CID/TrueType font - confirm against the ReportLab docs.

    Returns:
        The complete PDF document as bytes. Empty input still yields one
        (blank) page.
    """
    font_size = 11
    line_height = 14
    left_margin = 40
    right_margin = 40
    top_margin = 40
    bottom_margin = 40

    buf = BytesIO()
    c = canvas.Canvas(buf, pagesize=A4)
    width, height = A4
    top_y = height - top_margin
    max_width = width - left_margin - right_margin

    y = top_y
    # True right after an automatic page break while nothing has been drawn
    # on the new page yet; used to avoid emitting an empty trailing page.
    page_is_fresh = False

    def _fits(s: str) -> bool:
        return stringWidth(s, font_name, font_size) <= max_width

    def _advance() -> None:
        # Move the cursor down one line, starting a new page when it would
        # fall below the bottom margin.
        nonlocal y, page_is_fresh
        y -= line_height
        if y < bottom_margin:
            c.showPage()
            y = top_y
            page_is_fresh = True

    def _emit(s: str) -> None:
        # Draw one already-fitted line at the cursor and advance.
        nonlocal page_is_fresh
        # The font is set before every draw, as the original code did, so a
        # page break in between cannot leave a different font active.
        c.setFont(font_name, font_size)
        c.drawString(left_margin, y, s)
        page_is_fresh = False
        _advance()

    def _split_long_token(token: str) -> str:
        # Emit as many full-width chunks of ``token`` as needed and return
        # the remaining tail, which fits and may be extended by later words.
        chunk = ""
        for ch in token:
            # ``chunk and`` guarantees progress even if one glyph is too wide.
            if chunk and not _fits(chunk + ch):
                _emit(chunk)
                chunk = ch
            else:
                chunk += ch
        return chunk

    def _draw_wrapped(line: str) -> None:
        if not line.strip():
            _advance()
            return
        current = ""
        for word in line.split(" "):
            trial = (current + " " + word).strip()
            if _fits(trial):
                current = trial
                continue
            # Flush what has been collected; never draw an empty string,
            # which would waste a line before an over-long first token.
            if current:
                _emit(current)
            current = word if _fits(word) else _split_long_token(word)
        if current:
            _emit(current)

    for line in text.splitlines():
        _draw_wrapped(line)

    # Close the last page unless an automatic page break just opened a new
    # one that is still empty (that would append a blank trailing page).
    if not page_is_fresh:
        c.showPage()
    c.save()
    return buf.getvalue()