Corin1998 committed on
Commit
893163e
·
verified ·
1 Parent(s): d4d986b

Create parsing.py

Browse files
Files changed (1) hide show
  1. pipelines/parsing.py +31 -0
pipelines/parsing.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import Dict, List
3
+
4
# Employment period such as "2020.04 - 2022.03", "2020/4~現在" or
# "2020年4月〜2022年3月".  The optional "月" lets Japanese-style dates reach the
# range separator; the separator class covers ASCII hyphen/tilde, en dash,
# em dash, wave dash (U+301C) and full-width tilde (U+FF5E).  The end date is
# optional so open-ended ranges ("2021/01 - ") still match.
_PERIOD_RE = re.compile(
    r"(\d{4}[./年]\d{1,2}月?)"
    r"\s*[-~\u2013\u2014\u301c\uff5e]\s*"
    r"(\d{4}[./年]?\d{0,2}月?|現在|至今)?"
)

# Skill delimiters: ideographic comma, ASCII comma, full-width comma, newline.
_SKILL_SPLIT_RE = re.compile(r"[、,，\n]\s*")


def _section_text(sections_dict, key):
    """Return the value under *key*, else under ``<key>_raw``, else ``""``."""
    # The trailing ``or ""`` turns an explicit None into an empty string so
    # callers can safely call ``.splitlines()`` on the result.
    return sections_dict.get(key) or sections_dict.get(key + "_raw") or ""


def _non_empty_lines(text):
    """Wrap every non-blank line of *text* as ``{"text": <stripped line>}``."""
    return [{"text": line.strip()} for line in text.splitlines() if line.strip()]


def normalize_resume(sections_dict: Dict[str, str]) -> Dict[str, List[Dict]]:
    """Turn raw resume section texts into lists of structured items.

    Args:
        sections_dict: Mapping of section name to section text.  For
            ``work_experience``, ``education`` and ``certifications`` the
            ``<name>_raw`` key is used when ``<name>`` is missing or empty.
            ``skills`` may be a delimited string or an already-split list.
            Missing keys and ``None`` values are treated as empty.

    Returns:
        A dict with these keys:

        * ``work_experience``: one ``{"period", "text"}`` dict per line that
          contains a date range; lines without a date range are skipped.
        * ``education`` / ``certifications``: one ``{"text"}`` dict per
          non-blank line.
        * ``skills``: list of skill strings (a copy when a list was given).
        * ``raw_sections``: the unprocessed text of the three text sections.
    """
    raw_work = _section_text(sections_dict, "work_experience")
    raw_edu = _section_text(sections_dict, "education")
    raw_cert = _section_text(sections_dict, "certifications")

    work_items = []
    for line in raw_work.splitlines():
        m = _PERIOD_RE.search(line)
        if m:
            # strip(): with no end date the match ends in the separator's
            # trailing whitespace, which is not part of the period.
            work_items.append({"period": m.group(0).strip(), "text": line.strip()})

    skills_raw = sections_dict.get("skills") or ""
    if isinstance(skills_raw, list):
        # Copy so later mutation of the result cannot alter the caller's list.
        skill_items = list(skills_raw)
    else:
        skill_items = [s.strip() for s in _SKILL_SPLIT_RE.split(skills_raw) if s.strip()]

    return {
        "work_experience": work_items,
        "education": _non_empty_lines(raw_edu),
        "certifications": _non_empty_lines(raw_cert),
        "skills": skill_items,
        "raw_sections": {
            "work_experience": raw_work,
            "education": raw_edu,
            "certifications": raw_cert,
        },
    }