Corin1998 committed on
Commit
088d472
·
verified ·
1 Parent(s): bc98150

Update pipelines/anonymize.py

Browse files
Files changed (1) hide show
  1. pipelines/anonymize.py +27 -43
pipelines/anonymize.py CHANGED
@@ -2,63 +2,47 @@ import re
2
  from typing import Tuple, Dict
3
  from reportlab.lib.pagesizes import A4
4
  from reportlab.pdfgen import canvas
5
- from reportlab.lib.units import mm
6
- import io
7
 
8
- # 簡易匿名化(メール・電話・氏名行のマスク)
9
  EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
10
  PHONE_RE = re.compile(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}")
11
- NAME_LINE_RE = re.compile(r"^(?:氏名|Name)[::]?\s*([^\n\r]+)$", re.MULTILINE)
12
 
13
  def anonymize_text(text: str) -> Tuple[str, Dict[str, str]]:
14
  mapping = {}
15
- # メール
16
  def _mask_email(m):
17
- key = f"EMAIL_{len(mapping)+1}"
18
- mapping[m.group(0)] = key
19
- return key + "@masked"
20
- text = EMAIL_RE.sub(_mask_email, text)
21
-
22
- # 電話
23
  def _mask_phone(m):
24
- ph = m.group(0)
25
- # 短すぎるノイズはスキップ
26
- if len(re.sub(r"\D", "", ph)) < 7:
27
- return ph
28
- key = f"TEL_{len(mapping)+1}"
29
- mapping[ph] = key
30
- return key
31
- text = PHONE_RE.sub(_mask_phone, text)
32
-
33
- # 氏名行
34
- def _mask_name(m):
35
- val = m.group(1).strip()
36
- key = f"NAME_{len(mapping)+1}"
37
- mapping[val] = key
38
- return m.group(0).replace(val, key)
39
- text = NAME_LINE_RE.sub(_mask_name, text)
40
 
41
- return text, mapping
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
  def render_anonymized_pdf(text: str) -> bytes:
44
- buf = io.BytesIO()
45
  c = canvas.Canvas(buf, pagesize=A4)
46
  width, height = A4
47
-
48
- # 素朴なテキスト描画(自動改頁)
49
- margin_x = 15 * mm
50
- margin_y = 15 * mm
51
- y = height - margin_y
52
- c.setFont("Helvetica", 10)
53
-
54
  for line in text.splitlines():
55
- if y < margin_y:
56
  c.showPage()
57
- c.setFont("Helvetica", 10)
58
- y = height - margin_y
59
- c.drawString(margin_x, y, line[:180]) # 超長行は素直に切る
60
- y -= 12
61
-
62
- c.showPage()
63
  c.save()
64
  return buf.getvalue()
 
2
  from typing import Tuple, Dict
3
  from reportlab.lib.pagesizes import A4
4
  from reportlab.pdfgen import canvas
5
+ from io import BytesIO
 
6
 
7
# Very simple anonymization: emails, phone numbers, and lines that look like
# a "name" field.
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
PHONE_RE = re.compile(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}")
NAME_HINT = re.compile(r"^\s*(氏名|Name)\s*[::]?\s*(.+)$")


def anonymize_text(text: str) -> Tuple[str, Dict[str, str]]:
    """Mask name lines, email addresses and phone numbers in *text*.

    Processing order:

    1. Every line matching ``NAME_HINT`` is replaced wholesale by
       ``"氏名: NAME_MASKED"``; the matched line is recorded in the mapping.
    2. Each ``EMAIL_RE`` match becomes ``EMAIL_<n>@masked``.
    3. Each ``PHONE_RE`` match becomes ``TEL_<n>``.

    Emails are masked before phones so digits inside an address are not
    picked up by the phone pattern.

    Args:
        text: Raw input text. Lines are split with ``str.splitlines`` and
            re-joined with ``"\\n"``, so a trailing newline is not preserved.

    Returns:
        A ``(masked_text, mapping)`` tuple where ``mapping`` maps each
        original value to its placeholder (``NAME_MASKED``, ``EMAIL_<n>``
        or ``TEL_<n>``).
    """
    mapping: Dict[str, str] = {}

    def _placeholder(value: str, prefix: str) -> str:
        # Reuse the placeholder already issued for this value.  Numbering is
        # derived from len(mapping); without the reuse, a repeated value
        # overwrote its entry without growing the dict, so the same value got
        # different placeholders in the text and the next new value was
        # handed a number that was already in use.
        token = mapping.get(value)
        if token is None:
            token = f"{prefix}_{len(mapping) + 1}"
            mapping[value] = token
        return token

    def _mask_email(m):
        return _placeholder(m.group(0), "EMAIL") + "@masked"

    def _mask_phone(m):
        return _placeholder(m.group(0), "TEL")

    # Mask name candidates line by line.
    lines = []
    for line in text.splitlines():
        nm = NAME_HINT.search(line)
        if nm:
            mapping[nm.group(0)] = "NAME_MASKED"
            lines.append("氏名: NAME_MASKED")
        else:
            lines.append(line)

    masked = "\n".join(lines)
    masked = EMAIL_RE.sub(_mask_email, masked)
    masked = PHONE_RE.sub(_mask_phone, masked)
    return masked, mapping
35
 
def render_anonymized_pdf(text: str) -> bytes:
    """Render *text* as a plain multi-page A4 PDF and return the PDF bytes.

    Each input line is drawn on its own row; a new page is started when the
    cursor drops below the bottom margin.  Lines are not wrapped, only
    truncated to 1000 characters.

    The text is drawn with ReportLab's built-in Japanese CID font instead of
    the canvas default (Helvetica), because the anonymized text contains
    Japanese (e.g. the ``氏名`` label) that a Latin-only font cannot render.
    NOTE(review): CID fonts are not embedded in the PDF; the viewer is
    expected to supply the glyphs — confirm this is acceptable for consumers.

    Args:
        text: The (already anonymized) text to render.

    Returns:
        The generated PDF document as bytes.
    """
    # Function-scope imports: only this function needs the CID font support.
    from reportlab.pdfbase import pdfmetrics
    from reportlab.pdfbase.cidfonts import UnicodeCIDFont

    font_name = "HeiseiKakuGo-W5"
    font_size = 12   # same size as the canvas default used previously
    margin = 40      # page margin in points (left, top and bottom)
    leading = 14     # vertical distance between rows in points

    pdfmetrics.registerFont(UnicodeCIDFont(font_name))

    buf = BytesIO()
    c = canvas.Canvas(buf, pagesize=A4)
    _, height = A4
    c.setFont(font_name, font_size)

    y = height - margin
    for line in text.splitlines():
        if y < margin:
            c.showPage()
            # showPage() resets the graphics state, including the font.
            c.setFont(font_name, font_size)
            y = height - margin
        c.drawString(margin, y, line[:1000])  # per-line cap (guard against extremely long lines)
        y -= leading

    c.save()
    return buf.getvalue()