Corin1998 committed on
Commit
e199664
·
verified ·
1 Parent(s): fa24e7c

Update pipelines/anonymize.py

Browse files
Files changed (1) hide show
  1. pipelines/anonymize.py +53 -32
pipelines/anonymize.py CHANGED
@@ -1,45 +1,66 @@
1
  import re
2
- from typing import Dict, Tuple
3
- from reportlab.platypus import SimpleDocTemplate, Preformatted
4
- from reportlab.lib.styles import getSampleStyleSheet
5
  from reportlab.lib.pagesizes import A4
 
6
  import io
7
 
8
_EMAIL = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
_PHONE = re.compile(
    r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}"
)

# Common label-prefixed name pattern (very rough heuristic): a "氏名" or
# "Name" label, an optional colon, optional whitespace, then one run of
# non-whitespace characters captured as the name.
# NOTE(review): the colon class previously listed the ASCII colon twice,
# which looks like a full-width colon lost in transcription; both widths are
# accepted here so "氏名:山田" captures "山田" rather than ":山田" -- confirm.
_NAME = re.compile(r"(?:氏名[::]?\s*|Name[::]?\s*)([^\s ]+)")


def anonymize_text(text: str) -> Tuple[str, Dict[str, str]]:
    """Replace e-mail addresses, phone numbers and labelled names with tokens.

    Each distinct original value is mapped to a token of the form
    ``[EMAIL_n]``, ``[PHONE_n]`` or ``[NAME_n]``. ``n`` is a single counter
    shared by all three kinds (it is the size of the mapping at the time the
    value is first seen), and a repeated value reuses its existing token.

    Args:
        text: The text to anonymize.

    Returns:
        A tuple ``(anonymized_text, mapping)`` where ``mapping`` maps every
        original value to the token that replaced it.
    """
    mapping: Dict[str, str] = {}

    def _token_for(value: str, prefix: str) -> str:
        # Allocate a token on first sight; reuse it afterwards.
        if value not in mapping:
            mapping[value] = f"[{prefix}_{len(mapping) + 1}]"
        return mapping[value]

    # E-mail first, then phone: an address may contain digit runs that the
    # phone pattern would otherwise pick up.
    text = _EMAIL.sub(lambda m: _token_for(m.group(0), "EMAIL"), text)
    text = _PHONE.sub(lambda m: _token_for(m.group(0), "PHONE"), text)

    def _name_mask(m):
        # The captured name is the tail of the match, so everything before it
        # is the label. Rebuild "label + token" from the spans instead of
        # str.replace(), which also rewrote the label whenever the name was a
        # substring of it (e.g. "Name am" -> "N[NAME_1]e [NAME_1]").
        label = m.group(0)[: m.start(1) - m.start(0)]
        return label + _token_for(m.group(1), "NAME")

    # Names (heuristic).
    text = _NAME.sub(_name_mask, text)

    return text, mapping
37
 
def render_anonymized_pdf(text: str) -> bytes:
    """Render *text* into an in-memory A4 PDF and return the PDF bytes.

    The text is laid out as a single ``Preformatted`` flowable using the
    sample stylesheet's ``"Code"`` style.
    """
    output = io.BytesIO()
    code_style = getSampleStyleSheet()["Code"]
    # Preformatted is used so the line breaks of the input are preserved.
    story = [Preformatted(text, code_style)]
    SimpleDocTemplate(output, pagesize=A4).build(story)
    return output.getvalue()
 
1
  import re
2
+ from typing import Tuple, Dict, List
3
+ from reportlab.pdfgen import canvas
 
4
  from reportlab.lib.pagesizes import A4
5
+ from reportlab.lib.units import mm
6
  import io
7
 
8
# Simple heuristic patterns (refine as needed).
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
PHONE_RE = re.compile(
    r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}"
)
# Name candidates: a whole line made of a "氏名" / "Name" label, a colon and
# a value of at most 41 characters (captured as group 1).
# NOTE(review): the colon class previously listed the ASCII colon twice,
# which looks like a full-width colon lost in transcription; both widths are
# accepted here so lines such as "氏名:山田太郎" are redacted -- confirm.
NAME_LINE_RE = re.compile(r"^(?:氏名|Name)\s*[::]\s*([^\s ].{0,40})$", re.MULTILINE)

# Either colon width; used to cut the label off a matched name line.
_NAME_COLON_RE = re.compile(r"[::]")
# Phone-pattern matches with fewer digits than this are treated as noise.
_MIN_PHONE_DIGITS = 8


def anonymize_text(text: str) -> Tuple[str, Dict[str, List[str]]]:
    """Redact e-mail addresses, phone numbers and labelled name lines.

    Substitutions run in the order e-mail, phone, name. Every redacted value
    is replaced by a fixed placeholder (``[EMAIL REDACTED]``,
    ``[PHONE REDACTED]`` or ``[NAME REDACTED]``).

    Args:
        text: The text to anonymize.

    Returns:
        A tuple ``(anonymized_text, replacements)``. ``replacements`` has the
        keys ``"EMAIL"``, ``"PHONE"`` and ``"NAME"``; each maps to the list
        of original values that were redacted, in order of appearance.
        E-mails and phone numbers are listed once per occurrence, names once
        per distinct value.
    """
    replacements: Dict[str, List[str]] = {"EMAIL": [], "PHONE": [], "NAME": []}

    # E-mail
    def _email_sub(m):
        replacements["EMAIL"].append(m.group(0))
        return "[EMAIL REDACTED]"

    text = EMAIL_RE.sub(_email_sub, text)

    # Phone
    def _phone_sub(m):
        val = m.group(0)
        # Matches with too few digits are noise: leave them untouched.
        if len(re.findall(r"\d", val)) < _MIN_PHONE_DIGITS:
            return val
        replacements["PHONE"].append(val)
        return "[PHONE REDACTED]"

    text = PHONE_RE.sub(_phone_sub, text)

    # Name lines: record and redact in a single pass.
    def _name_sub(m):
        name = m.group(1).strip()
        if name not in replacements["NAME"]:
            replacements["NAME"].append(name)
        # Keep everything before the first colon of either width. Splitting
        # on the ASCII colon only would leave a full-width-colon line intact
        # and append the placeholder after the un-redacted name.
        label = _NAME_COLON_RE.split(m.group(0), maxsplit=1)[0]
        return label + ": [NAME REDACTED]"

    text = NAME_LINE_RE.sub(_name_sub, text)

    return text, replacements
 
 
 
 
 
 
42
 
 
 
 
43
 
def _select_pdf_font(text: str) -> str:
    """Return the name of a font suitable for drawing *text*.

    Text that encodes as Windows-1252 keeps the original Helvetica output.
    Anything else (e.g. Japanese) switches to ReportLab's built-in
    ``HeiseiKakuGo-W5`` CID font, because Helvetica has no CJK glyphs.
    NOTE(review): CID fonts are referenced rather than embedded, so the PDF
    viewer presumably needs a Japanese font available -- confirm.
    """
    try:
        text.encode("cp1252")
    except UnicodeEncodeError:
        # Imported lazily so Latin-only documents never touch the CID code.
        from reportlab.pdfbase import pdfmetrics
        from reportlab.pdfbase.cidfonts import UnicodeCIDFont

        pdfmetrics.registerFont(UnicodeCIDFont("HeiseiKakuGo-W5"))
        return "HeiseiKakuGo-W5"
    return "Helvetica"


def render_anonymized_pdf(text: str) -> bytes:
    """Render *text* line by line into an in-memory A4 PDF.

    Each line of ``text.splitlines()`` is drawn at 10 pt with a 6 mm line
    pitch, starting 20 mm below the top edge and 15 mm from the left edge.
    A new page is started once the cursor drops below 20 mm from the bottom.
    Lines are not wrapped; each is only truncated to 2000 characters.

    Args:
        text: The (already anonymized) text to render.

    Returns:
        The bytes of the generated PDF document.
    """
    buf = io.BytesIO()
    c = canvas.Canvas(buf, pagesize=A4)
    _, height = A4

    font_name = _select_pdf_font(text)
    font_size = 10

    # Margins and line pitch.
    x0 = 15 * mm
    y_top = height - 20 * mm
    y_bottom = 20 * mm
    line_h = 6.0 * mm

    c.setFont(font_name, font_size)
    y = y_top
    for line in text.splitlines():
        # Page break.
        if y < y_bottom:
            c.showPage()
            # Set the font again on the fresh page, as the original does.
            c.setFont(font_name, font_size)
            y = y_top
        c.drawString(x0, y, line[:2000])  # truncate extremely long lines
        y -= line_h

    c.showPage()
    c.save()
    return buf.getvalue()