Corin1998 committed on
Commit
63c77bb
·
verified ·
1 Parent(s): c198598

Update pipelines/skills.py

Browse files
Files changed (1) hide show
  1. pipelines/skills.py +9 -13
pipelines/skills.py CHANGED
@@ -2,17 +2,15 @@ import re
2
  from typing import Dict, List
3
 
4
  SKILL_LEXICON = [
5
- "Python","C++","Java","Go","Rust","JavaScript","TypeScript","SQL","R",
6
- "PyTorch","TensorFlow","Keras","scikit-learn","Hugging Face","Transformers",
7
- "FastAPI","Django","Flask","React","Vue","Next.js","Node.js",
8
- "Spark","Hadoop","Airflow","dbt","Kafka",
9
- "AWS","GCP","Azure","Docker","Kubernetes",
10
- "Tableau","Power BI","Looker",
11
- "Git","Linux","Terraform","OpenAPI",
12
  ]
13
 
14
- NAME_HINTS = ["氏名", "Name"]
15
-
16
  def _regex_ner_contacts(text: str) -> Dict[str, List[str]]:
17
  emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)
18
  phones = re.findall(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}", text)
@@ -20,12 +18,10 @@ def _regex_ner_contacts(text: str) -> Dict[str, List[str]]:
20
 
21
  def extract_skills(text: str, sections: Dict[str, str]) -> Dict:
22
  contacts = _regex_ner_contacts(text)
23
- name_lines = []
24
- for hint in NAME_HINTS:
25
- name_lines += [l.strip() for l in text.splitlines() if hint in l][:3]
26
  found = []
27
  text_lower = text.lower()
28
  for s in SKILL_LEXICON:
29
  if s.lower() in text_lower:
30
  found.append(s)
31
- return {"skills": sorted(list(set(found))), "contacts": contacts, "name_candidates": name_lines}
 
 
2
  from typing import Dict, List
3
 
4
  SKILL_LEXICON = [
5
+ "Python", "C++", "Java", "Go", "Rust", "JavaScript", "TypeScript", "SQL", "R",
6
+ "PyTorch", "TensorFlow", "scikit-learn", "Hugging Face", "Transformers",
7
+ "FastAPI", "Django", "Flask", "React", "Vue", "Next.js", "Node.js",
8
+ "Spark", "Airflow", "Kafka",
9
+ "AWS", "GCP", "Azure", "Docker", "Kubernetes",
10
+ "Tableau", "Power BI", "Looker",
11
+ "Git", "Linux", "Terraform", "OpenAPI",
12
  ]
13
 
 
 
14
  def _regex_ner_contacts(text: str) -> Dict[str, List[str]]:
15
  emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)
16
  phones = re.findall(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}", text)
 
18
 
19
  def extract_skills(text: str, sections: Dict[str, str]) -> Dict:
20
  contacts = _regex_ner_contacts(text)
 
 
 
21
  found = []
22
  text_lower = text.lower()
23
  for s in SKILL_LEXICON:
24
  if s.lower() in text_lower:
25
  found.append(s)
26
+
27
+ return {"skills": sorted(list(set(found))), "contacts": contacts}