File size: 1,090 Bytes
e291dcc
 
 
 
63c77bb
 
 
 
 
 
 
e291dcc
 
 
 
 
 
 
 
 
 
 
 
 
 
63c77bb
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import re
from typing import Dict, List

# Canonical spellings of the skills the extractor looks for. Order is not
# significant to the values themselves; the grouping comments are only a
# reading aid.
SKILL_LEXICON = [
    # Programming / query languages
    "Python",
    "C++",
    "Java",
    "Go",
    "Rust",
    "JavaScript",
    "TypeScript",
    "SQL",
    "R",
    # Machine-learning libraries
    "PyTorch",
    "TensorFlow",
    "scikit-learn",
    "Hugging Face",
    "Transformers",
    # Web frameworks and runtimes
    "FastAPI",
    "Django",
    "Flask",
    "React",
    "Vue",
    "Next.js",
    "Node.js",
    # Data processing / orchestration / streaming
    "Spark",
    "Airflow",
    "Kafka",
    # Cloud platforms and containers
    "AWS",
    "GCP",
    "Azure",
    "Docker",
    "Kubernetes",
    # Business-intelligence tools
    "Tableau",
    "Power BI",
    "Looker",
    # General tooling
    "Git",
    "Linux",
    "Terraform",
    "OpenAPI",
]

def _regex_ner_contacts(text: str) -> Dict[str, List[str]]:
    """Pull e-mail addresses and phone-like numbers out of *text* with regexes.

    Args:
        text: Free text to scan.

    Returns:
        A dict with two keys, "EMAIL" and "PHONE". Each value is a list of
        the distinct matched strings, ordered by first appearance in *text*.
        Both lists are empty when nothing matches.
    """
    emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)
    # Phone shape: optional country code ("+" optional, 1-3 digits), optional
    # parenthesised 2-4 digit group, then three digit groups (2-4, 2-4, 3-4
    # digits), each optionally separated by one space or hyphen.
    # NOTE: the pattern is deliberately loose and also accepts other digit
    # runs of that shape (e.g. a year range such as "2019-2021").
    phones = re.findall(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}", text)
    # dict.fromkeys removes duplicates while keeping first-seen order;
    # list(set(...)) would return the strings in an order that can change
    # from one interpreter run to the next (string hash randomization).
    return {
        "EMAIL": list(dict.fromkeys(emails)),
        "PHONE": list(dict.fromkeys(phones)),
    }

def extract_skills(text: str, sections: Dict[str, str]) -> Dict:
    """Find known skills and contact details in *text*.

    Every entry of ``SKILL_LEXICON`` is searched for case-insensitively as a
    whole token: a hit only counts when the skill is not directly preceded
    or followed by an ASCII letter or digit. A plain substring test would
    report "R" for any text containing the letter r, "Go" for "Google",
    "Java" for "JavaScript" and "Git" for "digital".

    Matching remains case-insensitive, so an ordinary word that equals a
    skill name (e.g. the verb "go") is still reported.

    Args:
        text: Full text to scan.
        sections: Accepted for interface compatibility; not used by this
            function.

    Returns:
        A dict with two keys: "skills", the sorted list of matched lexicon
        entries (in their lexicon spelling), and "contacts", the mapping
        returned by ``_regex_ner_contacts(text)``.
    """
    contacts = _regex_ner_contacts(text)
    found = {
        skill
        for skill in SKILL_LEXICON
        # re.escape is required because entries such as "C++" and "Next.js"
        # contain regex metacharacters. Explicit lookarounds are used instead
        # of \b because \b cannot anchor after a non-word character like "+".
        if re.search(
            r"(?<![A-Za-z0-9])" + re.escape(skill) + r"(?![A-Za-z0-9])",
            text,
            re.IGNORECASE,
        )
    }
    return {"skills": sorted(found), "contacts": contacts}