Corin1998 committed on
Commit
e291dcc
·
verified ·
1 Parent(s): 7fced16

Create skills.py

Browse files
Files changed (1) hide show
  1. pipelines/skills.py +31 -0
pipelines/skills.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import Dict, List
3
+
4
# Skill names looked up in resume text, grouped by category.
# The displayed casing here is what gets reported back to callers.
SKILL_LEXICON = [
    # Programming and query languages
    "Python", "C++", "Java", "Go", "Rust", "JavaScript", "TypeScript", "SQL", "R",
    # Machine-learning libraries
    "PyTorch", "TensorFlow", "Keras", "scikit-learn", "Hugging Face", "Transformers",
    # Web frameworks and runtimes
    "FastAPI", "Django", "Flask", "React", "Vue", "Next.js", "Node.js",
    # Data engineering
    "Spark", "Hadoop", "Airflow", "dbt", "Kafka",
    # Cloud and containers
    "AWS", "GCP", "Azure", "Docker", "Kubernetes",
    # Business-intelligence tools
    "Tableau", "Power BI", "Looker",
    # General tooling
    "Git", "Linux", "Terraform", "OpenAPI",
]

# Substrings that mark a line as probably containing the person's name
# ("氏名" is the Japanese label for "full name").
NAME_HINTS = ["氏名", "Name"]
15
+
16
+ def _regex_ner_contacts(text: str) -> Dict[str, List[str]]:
17
+ emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)
18
+ phones = re.findall(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}", text)
19
+ return {"EMAIL": list(set(emails)), "PHONE": list(set(phones))}
20
+
21
def extract_skills(text: str, sections: Dict[str, str]) -> Dict:
    """Pull skills, contact details and name candidates out of resume text.

    Args:
        text: Full text to analyse.
        sections: Accepted for interface compatibility; not read by this
            function.

    Returns:
        A dict with three keys:

        * ``"skills"``: sorted list of ``SKILL_LEXICON`` entries found in
          *text* (case-insensitive, reported with the lexicon's casing).
        * ``"contacts"``: the result of ``_regex_ner_contacts(text)``.
        * ``"name_candidates"``: stripped lines containing one of
          ``NAME_HINTS`` (at most three per hint, duplicates removed).
    """
    contacts = _regex_ner_contacts(text)

    # Collect up to three candidate lines per hint.  A line that contains
    # several hints is only reported once.
    raw_lines = text.splitlines()
    name_lines: List[str] = []
    for hint in NAME_HINTS:
        hits = [line.strip() for line in raw_lines if hint in line][:3]
        for candidate in hits:
            if candidate not in name_lines:
                name_lines.append(candidate)

    # Match each skill as a whole token rather than a bare substring, so that
    # "R" does not match every word containing "r", "Go" does not match
    # "google", and "Java" does not match "JavaScript".  Only ASCII letters
    # count as token characters: digits may follow (e.g. "Python3") and
    # non-ASCII text such as Japanese may directly adjoin a skill name.
    found = []
    for skill in SKILL_LEXICON:
        pattern = r"(?<![A-Za-z])" + re.escape(skill) + r"(?![A-Za-z])"
        if re.search(pattern, text, flags=re.IGNORECASE):
            found.append(skill)

    return {
        "skills": sorted(set(found)),
        "contacts": contacts,
        "name_candidates": name_lines,
    }