Corin1998 committed on
Commit
1c1a2e4
·
verified ·
1 Parent(s): 3d28506

Update pipelines/anonymize.py

Browse files
Files changed (1) hide show
  1. pipelines/anonymize.py +17 -0
pipelines/anonymize.py CHANGED
@@ -4,12 +4,21 @@ from reportlab.lib.pagesizes import A4
4
  from reportlab.pdfgen import canvas
5
  from reportlab.lib.units import mm
6
 
 
7
  EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
8
  PHONE_RE = re.compile(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}")
 
 
9
  NAME_HINT = re.compile(r"^(氏名|Name)[::]?\s*(.+)$", re.MULTILINE)
10
 
 
11
  def anonymize_text(text: str) -> Tuple[str, Dict[str, str]]:
 
 
 
 
12
  mapping: Dict[str, str] = {}
 
13
  def _mask_all(pattern: re.Pattern, label: str, s: str) -> str:
14
  idx = 1
15
  def _repl(m):
@@ -25,16 +34,22 @@ def anonymize_text(text: str) -> Tuple[str, Dict[str, str]]:
25
  out = _mask_all(EMAIL_RE, "EMAIL", out)
26
  out = _mask_all(PHONE_RE, "TEL", out)
27
 
 
28
  for m in NAME_HINT.finditer(text):
29
  full = m.group(0)
30
  name = m.group(2).strip()
31
  if name and name not in mapping:
32
  mapping[name] = "<NAME>"
 
33
  out = out.replace(full, f"{m.group(1)}: <NAME>")
34
 
35
  return out, mapping
36
 
 
37
  def render_anonymized_pdf(text: str) -> bytes:
 
 
 
38
  from io import BytesIO
39
  buf = BytesIO()
40
  c = canvas.Canvas(buf, pagesize=A4)
@@ -46,6 +61,8 @@ def render_anonymized_pdf(text: str) -> bytes:
46
  if y < 20 * mm:
47
  c.showPage()
48
  y = height - 20 * mm
 
 
49
  c.drawString(x, y, line[:2000])
50
  y -= line_h
51
  c.save()
 
4
  from reportlab.pdfgen import canvas
5
  from reportlab.lib.units import mm
6
 
7
+ # 連絡先の簡易検出
8
  EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
9
  PHONE_RE = re.compile(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}")
10
+
11
+ # 「氏名: 山田太郎」などの行ヒント(自由記述の想定)
12
  NAME_HINT = re.compile(r"^(氏名|Name)[::]?\s*(.+)$", re.MULTILINE)
13
 
14
+
15
  def anonymize_text(text: str) -> Tuple[str, Dict[str, str]]:
16
+ """
17
+ テキスト内のメール・電話・氏名(ヒント行ベース)を置換。
18
+ 置換マップも返す。
19
+ """
20
  mapping: Dict[str, str] = {}
21
+
22
  def _mask_all(pattern: re.Pattern, label: str, s: str) -> str:
23
  idx = 1
24
  def _repl(m):
 
34
  out = _mask_all(EMAIL_RE, "EMAIL", out)
35
  out = _mask_all(PHONE_RE, "TEL", out)
36
 
37
+ # 氏名ヒント行
38
  for m in NAME_HINT.finditer(text):
39
  full = m.group(0)
40
  name = m.group(2).strip()
41
  if name and name not in mapping:
42
  mapping[name] = "<NAME>"
43
+ # ヒント行自体も置換
44
  out = out.replace(full, f"{m.group(1)}: <NAME>")
45
 
46
  return out, mapping
47
 
48
+
49
  def render_anonymized_pdf(text: str) -> bytes:
50
+ """
51
+ 匿名化済みテキストをシンプルなPDFにレンダリングして bytes を返す。
52
+ """
53
  from io import BytesIO
54
  buf = BytesIO()
55
  c = canvas.Canvas(buf, pagesize=A4)
 
61
  if y < 20 * mm:
62
  c.showPage()
63
  y = height - 20 * mm
64
+ # ReportLabは日本語フォント未設定でも ASCII は描画される。
65
+ # 日本語を含む場合はカスタムフォント登録を追加実装。
66
  c.drawString(x, y, line[:2000])
67
  y -= line_h
68
  c.save()