edouardlgp committed on
Commit
60b7707
·
verified ·
1 Parent(s): f2ffdef

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +89 -38
app.py CHANGED
@@ -616,22 +616,61 @@ def _extract_json(raw: str) -> str:
616
  json_text = json_text.strip()
617
  return json_text
618
 
 
 
619
  def process_pdf(file):
620
  if file is None:
621
- return "Please upload a PDF file."
 
 
 
 
 
 
 
 
 
 
 
 
622
  try:
623
  extracted_text = extract_text_from_pdf(file.name)
624
  responsibilities = extract_section_from_pdf(extracted_text, section_title="Responsibilities and Accountabilities")
625
  if not responsibilities:
626
  log_debug(f"Skipping {os.path.basename(file.name)} - no responsibilities section found")
627
- return None
628
- job_family = classify_job_family(responsibilities)
629
- log_debug(f"Identified {job_family} ")
630
- occ_group = classify_occupational_group_by_level(responsibilities)
631
- esco_occ = classify_esco_by_hierarchical_level(responsibilities)
632
- qualification = extract_qualification(responsibilities)
633
- skills = extract_skills(responsibilities)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
634
  skill_map = map_proficiency_and_assessment(skills, responsibilities)
 
635
  has_esco = esco_occ.get("Level_5_ESCO_code") is not None
636
  skill_esco_extract = []
637
  skill_esco_map = []
@@ -641,6 +680,7 @@ def process_pdf(file):
641
  skill_esco_map = map_proficiency_and_assessment(skill_esco_extract, responsibilities)
642
  else:
643
  log_debug(f"No Level 5 ESCO code found for {os.path.basename(file.name)}, skipping ESCO skills mapping")
 
644
  time.sleep(6)
645
  assessment_lookup = {item['skill_name']: item for item in skill_map}
646
  joined_skills = [
@@ -657,6 +697,7 @@ def process_pdf(file):
657
  }
658
  for skill in skills
659
  ]
 
660
  joined_skills_esco = []
661
  if has_esco and skill_esco_extract:
662
  assessment_esco_lookup = {item['skill_name']: item for item in skill_esco_map}
@@ -669,44 +710,54 @@ def process_pdf(file):
669
  }
670
  for skill in skill_esco_extract
671
  ]
 
672
  interview = build_interview(responsibilities, skills)
673
- result = {
674
- "file": os.path.basename(file.name),
675
- "responsibilities": responsibilities,
676
- "classified_job_family": job_family,
677
- **{f"Level_{i}_CCOG_{field}": occ_group.get(f"Level_{i}_CCOG_{field}")
678
- for i in range(1, 5) for field in ["code", "name", "desc"]},
679
- "qualification": qualification,
680
- "interview": interview,
681
- "skills": {
682
- "file": os.path.basename(file.name),
683
  "classified_job_family": job_family,
684
- "skills": joined_skills
685
  }
686
- }
687
- if has_esco:
688
- result.update({
689
- **{f"Level_{i}_ESCO_{field}": esco_occ.get(f"Level_{i}_ESCO_{field}")
690
- for i in range(1, 6) for field in ["code", "name", "desc"]},
691
- "skills_esco": {
692
- "file": os.path.basename(file.name),
693
- "classified_job_family": job_family,
694
- "skills": joined_skills_esco
695
- }
696
- })
697
  else:
698
- result.update({
699
- **{f"Level_{i}_ESCO_{field}": None
700
- for i in range(1, 6) for field in ["code", "name", "desc"]},
701
- "skills_esco": None
702
- })
703
-
704
  debug_message = "Processing completed successfully."
705
- return result, debug_message
 
 
 
 
 
 
 
 
 
 
 
706
 
707
  except Exception as e:
708
  error_message = f"Error processing PDF: {str(e)}"
709
- return error_message, error_message
 
 
 
 
 
 
 
 
 
 
 
710
 
711
  from docx import Document
712
 
 
616
  json_text = json_text.strip()
617
  return json_text
618
 
619
+ from concurrent.futures import ThreadPoolExecutor
620
+
621
  def process_pdf(file):
622
  if file is None:
623
+ return (
624
+ "Please upload a PDF file.",
625
+ "",
626
+ "",
627
+ "",
628
+ {},
629
+ "",
630
+ [],
631
+ {},
632
+ {},
633
+ "No file uploaded."
634
+ )
635
+
636
  try:
637
  extracted_text = extract_text_from_pdf(file.name)
638
  responsibilities = extract_section_from_pdf(extracted_text, section_title="Responsibilities and Accountabilities")
639
  if not responsibilities:
640
  log_debug(f"Skipping {os.path.basename(file.name)} - no responsibilities section found")
641
+ return (
642
+ os.path.basename(file.name),
643
+ "",
644
+ "",
645
+ "",
646
+ {},
647
+ "",
648
+ [],
649
+ {},
650
+ {},
651
+ "No responsibilities section found."
652
+ )
653
+
654
+ # Use ThreadPoolExecutor to parallelize independent tasks
655
+ with ThreadPoolExecutor() as executor:
656
+ # Submit tasks to the executor
657
+ job_family_future = executor.submit(classify_job_family, responsibilities)
658
+ occ_group_future = executor.submit(classify_occupational_group_by_level, responsibilities)
659
+ esco_occ_future = executor.submit(classify_esco_by_hierarchical_level, responsibilities)
660
+ qualification_future = executor.submit(extract_qualification, responsibilities)
661
+ skills_future = executor.submit(extract_skills, responsibilities)
662
+
663
+ # Retrieve results from futures
664
+ job_family = job_family_future.result()
665
+ occ_group = occ_group_future.result()
666
+ esco_occ = esco_occ_future.result()
667
+ qualification = qualification_future.result()
668
+ skills = skills_future.result()
669
+
670
+ log_debug(f"Identified {job_family}")
671
+
672
  skill_map = map_proficiency_and_assessment(skills, responsibilities)
673
+
674
  has_esco = esco_occ.get("Level_5_ESCO_code") is not None
675
  skill_esco_extract = []
676
  skill_esco_map = []
 
680
  skill_esco_map = map_proficiency_and_assessment(skill_esco_extract, responsibilities)
681
  else:
682
  log_debug(f"No Level 5 ESCO code found for {os.path.basename(file.name)}, skipping ESCO skills mapping")
683
+
684
  time.sleep(6)
685
  assessment_lookup = {item['skill_name']: item for item in skill_map}
686
  joined_skills = [
 
697
  }
698
  for skill in skills
699
  ]
700
+
701
  joined_skills_esco = []
702
  if has_esco and skill_esco_extract:
703
  assessment_esco_lookup = {item['skill_name']: item for item in skill_esco_map}
 
710
  }
711
  for skill in skill_esco_extract
712
  ]
713
+
714
  interview = build_interview(responsibilities, skills)
715
+
716
+ # Prepare the results for each output component
717
+ ccoq_levels = {f"Level_{i}_CCOG_{field}": occ_group.get(f"Level_{i}_CCOG_{field}")
718
+ for i in range(1, 5) for field in ["code", "name", "desc"]}
719
+
720
+ if has_esco:
721
+ esco_levels = {f"Level_{i}_ESCO_{field}": esco_occ.get(f"Level_{i}_ESCO_{field}")
722
+ for i in range(1, 6) for field in ["code", "name", "desc"]}
723
+ esco_skills = {
724
+ "file": os.path.basename(file.name),
725
  "classified_job_family": job_family,
726
+ "skills": joined_skills_esco
727
  }
 
 
 
 
 
 
 
 
 
 
 
728
  else:
729
+ esco_levels = {f"Level_{i}_ESCO_{field}": None
730
+ for i in range(1, 6) for field in ["code", "name", "desc"]}
731
+ esco_skills = None
732
+
 
 
733
  debug_message = "Processing completed successfully."
734
+ return (
735
+ os.path.basename(file.name),
736
+ responsibilities,
737
+ job_family,
738
+ "\n".join(qualification),
739
+ ccoq_levels,
740
+ "\n".join(interview),
741
+ joined_skills,
742
+ esco_levels,
743
+ esco_skills,
744
+ debug_message if DEBUG else None
745
+ )
746
 
747
  except Exception as e:
748
  error_message = f"Error processing PDF: {str(e)}"
749
+ return (
750
+ error_message,
751
+ "",
752
+ "",
753
+ "",
754
+ {},
755
+ "",
756
+ [],
757
+ {},
758
+ {},
759
+ error_message
760
+ )
761
 
762
  from docx import Document
763