Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -616,22 +616,61 @@ def _extract_json(raw: str) -> str:
|
|
| 616 |
json_text = json_text.strip()
|
| 617 |
return json_text
|
| 618 |
|
|
|
|
|
|
|
| 619 |
def process_pdf(file):
|
| 620 |
if file is None:
|
| 621 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 622 |
try:
|
| 623 |
extracted_text = extract_text_from_pdf(file.name)
|
| 624 |
responsibilities = extract_section_from_pdf(extracted_text, section_title="Responsibilities and Accountabilities")
|
| 625 |
if not responsibilities:
|
| 626 |
log_debug(f"Skipping {os.path.basename(file.name)} - no responsibilities section found")
|
| 627 |
-
return
|
| 628 |
-
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 634 |
skill_map = map_proficiency_and_assessment(skills, responsibilities)
|
|
|
|
| 635 |
has_esco = esco_occ.get("Level_5_ESCO_code") is not None
|
| 636 |
skill_esco_extract = []
|
| 637 |
skill_esco_map = []
|
|
@@ -641,6 +680,7 @@ def process_pdf(file):
|
|
| 641 |
skill_esco_map = map_proficiency_and_assessment(skill_esco_extract, responsibilities)
|
| 642 |
else:
|
| 643 |
log_debug(f"No Level 5 ESCO code found for {os.path.basename(file.name)}, skipping ESCO skills mapping")
|
|
|
|
| 644 |
time.sleep(6)
|
| 645 |
assessment_lookup = {item['skill_name']: item for item in skill_map}
|
| 646 |
joined_skills = [
|
|
@@ -657,6 +697,7 @@ def process_pdf(file):
|
|
| 657 |
}
|
| 658 |
for skill in skills
|
| 659 |
]
|
|
|
|
| 660 |
joined_skills_esco = []
|
| 661 |
if has_esco and skill_esco_extract:
|
| 662 |
assessment_esco_lookup = {item['skill_name']: item for item in skill_esco_map}
|
|
@@ -669,44 +710,54 @@ def process_pdf(file):
|
|
| 669 |
}
|
| 670 |
for skill in skill_esco_extract
|
| 671 |
]
|
|
|
|
| 672 |
interview = build_interview(responsibilities, skills)
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
| 676 |
-
|
| 677 |
-
|
| 678 |
-
|
| 679 |
-
"
|
| 680 |
-
|
| 681 |
-
|
| 682 |
-
"file": os.path.basename(file.name),
|
| 683 |
"classified_job_family": job_family,
|
| 684 |
-
"skills":
|
| 685 |
}
|
| 686 |
-
}
|
| 687 |
-
if has_esco:
|
| 688 |
-
result.update({
|
| 689 |
-
**{f"Level_{i}_ESCO_{field}": esco_occ.get(f"Level_{i}_ESCO_{field}")
|
| 690 |
-
for i in range(1, 6) for field in ["code", "name", "desc"]},
|
| 691 |
-
"skills_esco": {
|
| 692 |
-
"file": os.path.basename(file.name),
|
| 693 |
-
"classified_job_family": job_family,
|
| 694 |
-
"skills": joined_skills_esco
|
| 695 |
-
}
|
| 696 |
-
})
|
| 697 |
else:
|
| 698 |
-
|
| 699 |
-
|
| 700 |
-
|
| 701 |
-
|
| 702 |
-
})
|
| 703 |
-
|
| 704 |
debug_message = "Processing completed successfully."
|
| 705 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 706 |
|
| 707 |
except Exception as e:
|
| 708 |
error_message = f"Error processing PDF: {str(e)}"
|
| 709 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 710 |
|
| 711 |
from docx import Document
|
| 712 |
|
|
|
|
| 616 |
json_text = json_text.strip()
|
| 617 |
return json_text
|
| 618 |
|
| 619 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 620 |
+
|
| 621 |
def process_pdf(file):
|
| 622 |
if file is None:
|
| 623 |
+
return (
|
| 624 |
+
"Please upload a PDF file.",
|
| 625 |
+
"",
|
| 626 |
+
"",
|
| 627 |
+
"",
|
| 628 |
+
{},
|
| 629 |
+
"",
|
| 630 |
+
[],
|
| 631 |
+
{},
|
| 632 |
+
{},
|
| 633 |
+
"No file uploaded."
|
| 634 |
+
)
|
| 635 |
+
|
| 636 |
try:
|
| 637 |
extracted_text = extract_text_from_pdf(file.name)
|
| 638 |
responsibilities = extract_section_from_pdf(extracted_text, section_title="Responsibilities and Accountabilities")
|
| 639 |
if not responsibilities:
|
| 640 |
log_debug(f"Skipping {os.path.basename(file.name)} - no responsibilities section found")
|
| 641 |
+
return (
|
| 642 |
+
os.path.basename(file.name),
|
| 643 |
+
"",
|
| 644 |
+
"",
|
| 645 |
+
"",
|
| 646 |
+
{},
|
| 647 |
+
"",
|
| 648 |
+
[],
|
| 649 |
+
{},
|
| 650 |
+
{},
|
| 651 |
+
"No responsibilities section found."
|
| 652 |
+
)
|
| 653 |
+
|
| 654 |
+
# Use ThreadPoolExecutor to parallelize independent tasks
|
| 655 |
+
with ThreadPoolExecutor() as executor:
|
| 656 |
+
# Submit tasks to the executor
|
| 657 |
+
job_family_future = executor.submit(classify_job_family, responsibilities)
|
| 658 |
+
occ_group_future = executor.submit(classify_occupational_group_by_level, responsibilities)
|
| 659 |
+
esco_occ_future = executor.submit(classify_esco_by_hierarchical_level, responsibilities)
|
| 660 |
+
qualification_future = executor.submit(extract_qualification, responsibilities)
|
| 661 |
+
skills_future = executor.submit(extract_skills, responsibilities)
|
| 662 |
+
|
| 663 |
+
# Retrieve results from futures
|
| 664 |
+
job_family = job_family_future.result()
|
| 665 |
+
occ_group = occ_group_future.result()
|
| 666 |
+
esco_occ = esco_occ_future.result()
|
| 667 |
+
qualification = qualification_future.result()
|
| 668 |
+
skills = skills_future.result()
|
| 669 |
+
|
| 670 |
+
log_debug(f"Identified {job_family}")
|
| 671 |
+
|
| 672 |
skill_map = map_proficiency_and_assessment(skills, responsibilities)
|
| 673 |
+
|
| 674 |
has_esco = esco_occ.get("Level_5_ESCO_code") is not None
|
| 675 |
skill_esco_extract = []
|
| 676 |
skill_esco_map = []
|
|
|
|
| 680 |
skill_esco_map = map_proficiency_and_assessment(skill_esco_extract, responsibilities)
|
| 681 |
else:
|
| 682 |
log_debug(f"No Level 5 ESCO code found for {os.path.basename(file.name)}, skipping ESCO skills mapping")
|
| 683 |
+
|
| 684 |
time.sleep(6)
|
| 685 |
assessment_lookup = {item['skill_name']: item for item in skill_map}
|
| 686 |
joined_skills = [
|
|
|
|
| 697 |
}
|
| 698 |
for skill in skills
|
| 699 |
]
|
| 700 |
+
|
| 701 |
joined_skills_esco = []
|
| 702 |
if has_esco and skill_esco_extract:
|
| 703 |
assessment_esco_lookup = {item['skill_name']: item for item in skill_esco_map}
|
|
|
|
| 710 |
}
|
| 711 |
for skill in skill_esco_extract
|
| 712 |
]
|
| 713 |
+
|
| 714 |
interview = build_interview(responsibilities, skills)
|
| 715 |
+
|
| 716 |
+
# Prepare the results for each output component
|
| 717 |
+
ccoq_levels = {f"Level_{i}_CCOG_{field}": occ_group.get(f"Level_{i}_CCOG_{field}")
|
| 718 |
+
for i in range(1, 5) for field in ["code", "name", "desc"]}
|
| 719 |
+
|
| 720 |
+
if has_esco:
|
| 721 |
+
esco_levels = {f"Level_{i}_ESCO_{field}": esco_occ.get(f"Level_{i}_ESCO_{field}")
|
| 722 |
+
for i in range(1, 6) for field in ["code", "name", "desc"]}
|
| 723 |
+
esco_skills = {
|
| 724 |
+
"file": os.path.basename(file.name),
|
| 725 |
"classified_job_family": job_family,
|
| 726 |
+
"skills": joined_skills_esco
|
| 727 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 728 |
else:
|
| 729 |
+
esco_levels = {f"Level_{i}_ESCO_{field}": None
|
| 730 |
+
for i in range(1, 6) for field in ["code", "name", "desc"]}
|
| 731 |
+
esco_skills = None
|
| 732 |
+
|
|
|
|
|
|
|
| 733 |
debug_message = "Processing completed successfully."
|
| 734 |
+
return (
|
| 735 |
+
os.path.basename(file.name),
|
| 736 |
+
responsibilities,
|
| 737 |
+
job_family,
|
| 738 |
+
"\n".join(qualification),
|
| 739 |
+
ccoq_levels,
|
| 740 |
+
"\n".join(interview),
|
| 741 |
+
joined_skills,
|
| 742 |
+
esco_levels,
|
| 743 |
+
esco_skills,
|
| 744 |
+
debug_message if DEBUG else None
|
| 745 |
+
)
|
| 746 |
|
| 747 |
except Exception as e:
|
| 748 |
error_message = f"Error processing PDF: {str(e)}"
|
| 749 |
+
return (
|
| 750 |
+
error_message,
|
| 751 |
+
"",
|
| 752 |
+
"",
|
| 753 |
+
"",
|
| 754 |
+
{},
|
| 755 |
+
"",
|
| 756 |
+
[],
|
| 757 |
+
{},
|
| 758 |
+
{},
|
| 759 |
+
error_message
|
| 760 |
+
)
|
| 761 |
|
| 762 |
from docx import Document
|
| 763 |
|