Corin1998 committed on
Commit
7fced16
·
verified ·
1 Parent(s): 893163e

Create merge.py

Browse files
Files changed (1) hide show
  1. pipelines/merge.py +45 -0
pipelines/merge.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict
2
+ import re
3
+
4
+ def _period_key(item_text: str) -> str:
5
+ m = re.search(r"(\d{4}[./年]\d{1,2})\s*[-〜~]\s*(\d{4}[./年]?\d{0,2}|現在|至今)?", item_text)
6
+ return m.group(0) if m else item_text[:50]
7
+
8
def merge_normalized_records(records: List[Dict]) -> Dict:
    """Merge several normalized records into a single combined record.

    Each record is a dict that may contain the keys ``work_experience``,
    ``education``, ``certifications`` (lists of dicts carrying a ``text``
    field), ``skills`` (a list of hashable values) and ``raw_sections``
    (a dict of section name to string). Missing keys and keys whose value is
    ``None`` are treated as empty.

    Merging rules:

    * ``work_experience`` entries are de-duplicated on the period key of
      their text plus the first 80 characters of the text, then sorted by
      the first ``YYYY<sep>MM`` date found in ``period`` (or ``text`` when
      ``period`` is empty), newest first. Entries without such a date are
      placed after all dated ones. The sort is stable, so ties keep their
      merge order.
    * ``education`` and ``certifications`` entries are de-duplicated on
      their exact ``text``; entries with empty text are dropped.
    * ``skills`` are collected into a set (falsy values skipped) and
      returned sorted.
    * ``raw_sections`` values with the same name are concatenated with a
      newline, in record order, and stripped.

    The entry dicts in the result are the same objects as in the input;
    they are not copied.

    Args:
        records: The records to merge. ``None`` or an empty list yields a
            result whose sections are all empty.

    Returns:
        A dict with the keys ``work_experience``, ``education``,
        ``certifications``, ``skills`` and ``raw_sections``.
    """
    merged = {
        "work_experience": [],
        "education": [],
        "certifications": [],
        "skills": [],
        "raw_sections": {},
    }
    seen_we, seen_edu, seen_cert, skill_set = set(), set(), set(), set()

    # ``dict.get(key, default)`` only falls back when the key is missing, so
    # ``or`` is used throughout to also cover keys explicitly set to None.
    for r in records or []:
        if not r:
            continue

        for w in r.get("work_experience") or []:
            text = w.get("text") or ""
            key = _period_key(text) + "|" + text[:80]
            if key not in seen_we:
                seen_we.add(key)
                merged["work_experience"].append(w)

        # Education and certifications share the same exact-text rule.
        for section, seen in (("education", seen_edu), ("certifications", seen_cert)):
            for item in r.get(section) or []:
                text = item.get("text") or ""
                if text and text not in seen:
                    seen.add(text)
                    merged[section].append(item)

        for s in r.get("skills") or []:
            if s:
                skill_set.add(s)

        for name, content in (r.get("raw_sections") or {}).items():
            previous = merged["raw_sections"].get(name, "")
            merged["raw_sections"][name] = (previous + "\n" + (content or "")).strip()

    start_date_re = re.compile(r"(\d{4})([./年])(\d{1,2})")

    def _sort_key(w):
        # Dated entries get (-(YYYYMM), 0): a larger date gives a smaller key,
        # so the newest sorts first. Undated entries get (0, 1), which
        # compares greater than every dated key.
        m = start_date_re.search(w.get("period") or w.get("text") or "")
        if m is None:
            return (0, 1)
        try:
            return (-(int(m.group(1)) * 100 + int(m.group(3))), 0)
        except ValueError:
            # Defensive only: the matched groups are digits and are expected
            # to parse; if they ever do not, treat the entry as undated.
            return (0, 1)

    merged["work_experience"].sort(key=_sort_key)
    merged["skills"] = sorted(skill_set)
    return merged