Spaces:
Sleeping
Sleeping
Update pipelines/skills.py
Browse files- pipelines/skills.py +9 -13
pipelines/skills.py
CHANGED
|
@@ -2,17 +2,15 @@ import re
|
|
| 2 |
from typing import Dict, List
|
| 3 |
|
| 4 |
SKILL_LEXICON = [
|
| 5 |
-
"Python","C++","Java","Go","Rust","JavaScript","TypeScript","SQL","R",
|
| 6 |
-
"PyTorch","TensorFlow","
|
| 7 |
-
"FastAPI","Django","Flask","React","Vue","Next.js","Node.js",
|
| 8 |
-
"Spark","
|
| 9 |
-
"AWS","GCP","Azure","Docker","Kubernetes",
|
| 10 |
-
"Tableau","Power BI","Looker",
|
| 11 |
-
"Git","Linux","Terraform","OpenAPI",
|
| 12 |
]
|
| 13 |
|
| 14 |
-
NAME_HINTS = ["氏名", "Name"]
|
| 15 |
-
|
| 16 |
def _regex_ner_contacts(text: str) -> Dict[str, List[str]]:
|
| 17 |
emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)
|
| 18 |
phones = re.findall(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}", text)
|
|
@@ -20,12 +18,10 @@ def _regex_ner_contacts(text: str) -> Dict[str, List[str]]:
|
|
| 20 |
|
| 21 |
def extract_skills(text: str, sections: Dict[str, str]) -> Dict:
|
| 22 |
contacts = _regex_ner_contacts(text)
|
| 23 |
-
name_lines = []
|
| 24 |
-
for hint in NAME_HINTS:
|
| 25 |
-
name_lines += [l.strip() for l in text.splitlines() if hint in l][:3]
|
| 26 |
found = []
|
| 27 |
text_lower = text.lower()
|
| 28 |
for s in SKILL_LEXICON:
|
| 29 |
if s.lower() in text_lower:
|
| 30 |
found.append(s)
|
| 31 |
-
|
|
|
|
|
|
| 2 |
from typing import Dict, List
|
| 3 |
|
| 4 |
SKILL_LEXICON = [
|
| 5 |
+
"Python", "C++", "Java", "Go", "Rust", "JavaScript", "TypeScript", "SQL", "R",
|
| 6 |
+
"PyTorch", "TensorFlow", "scikit-learn", "Hugging Face", "Transformers",
|
| 7 |
+
"FastAPI", "Django", "Flask", "React", "Vue", "Next.js", "Node.js",
|
| 8 |
+
"Spark", "Airflow", "Kafka",
|
| 9 |
+
"AWS", "GCP", "Azure", "Docker", "Kubernetes",
|
| 10 |
+
"Tableau", "Power BI", "Looker",
|
| 11 |
+
"Git", "Linux", "Terraform", "OpenAPI",
|
| 12 |
]
|
| 13 |
|
|
|
|
|
|
|
| 14 |
def _regex_ner_contacts(text: str) -> Dict[str, List[str]]:
|
| 15 |
emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)
|
| 16 |
phones = re.findall(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}", text)
|
|
|
|
| 18 |
|
| 19 |
def extract_skills(text: str, sections: Dict[str, str]) -> Dict:
|
| 20 |
contacts = _regex_ner_contacts(text)
|
|
|
|
|
|
|
|
|
|
| 21 |
found = []
|
| 22 |
text_lower = text.lower()
|
| 23 |
for s in SKILL_LEXICON:
|
| 24 |
if s.lower() in text_lower:
|
| 25 |
found.append(s)
|
| 26 |
+
|
| 27 |
+
return {"skills": sorted(list(set(found))), "contacts": contacts}
|