Corin1998 committed on
Commit
94af959
·
verified ·
1 Parent(s): 02c1f58

Update pipelines/anonymize.py

Browse files
Files changed (1) hide show
  1. pipelines/anonymize.py +38 -45
pipelines/anonymize.py CHANGED
@@ -1,66 +1,59 @@
1
  import re
2
- from typing import Tuple, Dict, List
3
- from reportlab.pdfgen import canvas
4
  from reportlab.lib.pagesizes import A4
 
5
  from reportlab.lib.units import mm
6
- import io
7
 
8
# Simple patterns (refine them as needed).
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
PHONE_RE = re.compile(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}")
# Name candidates: a line that starts with a "氏名" / "Name" label and a colon.
NAME_LINE_RE = re.compile(r"^(?:氏名|Name)\s*[::]\s*([^\s ].{0,40})$", re.MULTILINE)


def anonymize_text(text: str) -> Tuple[str, Dict[str, List[str]]]:
    """Redact e-mail addresses, phone numbers and labelled name lines.

    Args:
        text: The raw text to anonymize.

    Returns:
        A ``(redacted_text, found)`` tuple. ``found`` has the keys
        ``"EMAIL"``, ``"PHONE"`` and ``"NAME"``, each listing the values that
        were redacted. E-mails and phones are listed once per occurrence;
        names are listed once per distinct value.
    """
    found: Dict[str, List[str]] = {"EMAIL": [], "PHONE": [], "NAME": []}

    def _redact_email(match):
        found["EMAIL"].append(match.group(0))
        return "[EMAIL REDACTED]"

    def _redact_phone(match):
        candidate = match.group(0)
        # Matches with too few digits are treated as noise and left untouched.
        if len(re.findall(r"\d", candidate)) < 8:
            return candidate
        found["PHONE"].append(candidate)
        return "[PHONE REDACTED]"

    def _redact_name(match):
        name = match.group(1).strip()
        if name and name not in found["NAME"]:
            found["NAME"].append(name)
        # Keep whatever precedes the first colon (the label) and drop the rest.
        label = match.group(0).split(":")[0]
        return label + ": [NAME REDACTED]"

    # Order matters: e-mails first, then phones, then name lines, each pass
    # running on the output of the previous one.
    redacted = EMAIL_RE.sub(_redact_email, text)
    redacted = PHONE_RE.sub(_redact_phone, redacted)
    redacted = NAME_LINE_RE.sub(_redact_name, redacted)
    return redacted, found
 
 
 
 
 
 
 
 
42
 
43
 
44
def render_anonymized_pdf(text: str) -> bytes:
    """Render ``text`` line by line into an A4 PDF and return the PDF bytes.

    Each input line is drawn as one row in 10pt Helvetica, starting 20 mm
    below the top edge with a 15 mm left offset and 6 mm between rows. A new
    page is started once the cursor drops below 20 mm from the bottom. Lines
    are not wrapped; only their first 2000 characters are drawn.

    NOTE(review): Helvetica is requested explicitly; confirm that non-Latin
    text (e.g. Japanese) renders as intended with this font.
    """
    font_name, font_size = "Helvetica", 10
    left = 15 * mm
    bottom_limit = 20 * mm
    row_step = 6.0 * mm

    buf = io.BytesIO()
    c = canvas.Canvas(buf, pagesize=A4)
    _, page_height = A4
    top = page_height - 20 * mm
    c.setFont(font_name, font_size)

    cursor = top
    for row in text.splitlines():
        if cursor < bottom_limit:
            # Page break: the font is set again after starting the new page.
            c.showPage()
            c.setFont(font_name, font_size)
            cursor = top
        # Overly long lines are truncated rather than wrapped.
        c.drawString(left, cursor, row[:2000])
        cursor = cursor - row_step

    c.showPage()
    c.save()
    return buf.getvalue()
 
1
  import re
2
+ from typing import Tuple, Dict
 
3
  from reportlab.lib.pagesizes import A4
4
+ from reportlab.pdfgen import canvas
5
  from reportlab.lib.units import mm
 
6
 
 
7
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
PHONE_RE = re.compile(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}")
# A line that starts with a "氏名" / "Name" label; group 2 is the rest of it.
NAME_HINT = re.compile(r"^(氏名|Name)[::]?\s*(.+)$", re.MULTILINE)


def anonymize_text(text: str) -> Tuple[str, Dict[str, str]]:
    """Replace e-mails, phone numbers and labelled name lines with placeholders.

    E-mail addresses become ``<EMAIL1>``, ``<EMAIL2>``, ... and phone numbers
    ``<TEL1>``, ``<TEL2>``, ...; the same original value always maps to the
    same placeholder. Every line matched by ``NAME_HINT`` is rewritten to
    ``"<label>: <NAME>"``.

    Args:
        text: The raw text to anonymize.

    Returns:
        A ``(masked_text, mapping)`` tuple, where ``mapping`` maps each
        original value (as it appears in ``text``) to its placeholder.
    """
    mapping: Dict[str, str] = {}

    def _mask_all(pattern: re.Pattern, label: str, s: str) -> str:
        """Replace every match of ``pattern`` in ``s`` with a numbered tag."""
        idx = 1

        def _repl(m):
            nonlocal idx
            key = m.group(0)
            if key not in mapping:
                mapping[key] = f"<{label}{idx}>"
                idx += 1
            return mapping[key]

        return pattern.sub(_repl, s)

    out = _mask_all(EMAIL_RE, "EMAIL", text)
    out = _mask_all(PHONE_RE, "TEL", out)

    # Record names as they appear in the ORIGINAL text so the mapping keys
    # never contain placeholders inserted by the passes above.
    for m in NAME_HINT.finditer(text):
        name = m.group(2).strip()
        if name and name not in mapping:
            mapping[name] = "<NAME>"

    def _mask_name(m):
        # A label followed only by whitespace carries no name; leave it alone.
        if not m.group(2).strip():
            return m.group(0)
        return f"{m.group(1)}: <NAME>"

    # Mask on the already-masked text with the regex itself. Looking up the
    # original line literally would miss lines whose e-mail/phone was already
    # replaced, and gating on "name not yet recorded" would leave later
    # occurrences of the same name unmasked.
    out = NAME_HINT.sub(_mask_name, out)

    return out, mapping
39
 
40
 
41
def render_anonymized_pdf(text: str) -> bytes:
    """Render ``text`` line by line into an A4 PDF and return the PDF bytes.

    Rows start 20 mm below the top edge, 15 mm from the left, and are spaced
    6 mm apart; a new page begins once the cursor falls below 20 mm from the
    bottom. Lines are not wrapped; only their first 2000 characters are drawn.
    No font is set explicitly, so the canvas default is used.

    NOTE(review): no CJK-capable font is registered here; confirm that
    Japanese text renders as intended.
    """
    from io import BytesIO

    left_margin = 15 * mm
    bottom_limit = 20 * mm
    row_step = 6 * mm
    top_of_page = A4[1] - 20 * mm

    stream = BytesIO()
    pdf = canvas.Canvas(stream, pagesize=A4)

    cursor = top_of_page
    for row in text.splitlines():
        if cursor < bottom_limit:
            # Out of vertical space: start a fresh page at the top.
            pdf.showPage()
            cursor = top_of_page
        pdf.drawString(left_margin, cursor, row[:2000])
        cursor -= row_step

    pdf.save()
    return stream.getvalue()