import os
import io
import json
import hashlib
import tempfile
import gradio as gr
from pipelines.openai_ingest import (
extract_text_with_openai,
structure_with_openai,
summarize_with_openai,
)
from pipelines.parsing import normalize_resume
from pipelines.merge import merge_normalized_records
from pipelines.skills import extract_skills
from pipelines.anonymize import anonymize_text, render_anonymized_pdf
from pipelines.scoring import compute_quality_score
from pipelines.storage import persist_to_hf
from pipelines.utils import detect_filetype, load_doc_text
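
# Configuration note (an assumption about how the helpers are wired):
# pipelines.openai_ingest is expected to read an OpenAI API key from the
# environment (e.g. OPENAI_API_KEY), and DATASET_REPO, read further below,
# enables the optional persistence step when set.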
APP_TITLE = "Candidate Intake & Resume Standardization (OpenAI edition)"

def process_resumes(filepaths, candidate_id: str, additional_notes: str = ""):
    if not filepaths:
        raise gr.Error("Please upload at least one file.")
    partial_records = []
    raw_texts = []
    # gr.Files(type="filepath") passes plain string paths
    for path in filepaths:
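        # Some Gradio builds hand back upload objects that expose the path
        # via a .name attribute rather than a plain string; normalize
        # defensively (an assumption to tolerate version drift; a no-op for
        # plain string paths).
        path = getattr(path, "name", path)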
        with open(path, "rb") as f:
            raw_bytes = f.read()
        fname = os.path.basename(path)
        filetype = detect_filetype(fname, raw_bytes)
        # 1) Text extraction: PDFs and images go to OpenAI as-is; other
        #    formats are decoded locally first, then passed through as txt
        if filetype in {"pdf", "image"}:
            text = extract_text_with_openai(raw_bytes, filename=fname, filetype=filetype)
        else:
            base_text = load_doc_text(filetype, raw_bytes)
            text = extract_text_with_openai(base_text.encode("utf-8"), filename=fname, filetype="txt")
        raw_texts.append({"filename": fname, "text": text})
        # 2) Structuring → 3) Normalization
        structured = structure_with_openai(text)
        normalized = normalize_resume({
            "work_experience": structured.get("work_experience_raw", ""),
            "education": structured.get("education_raw", ""),
            "certifications": structured.get("certifications_raw", ""),
            "skills": ", ".join(structured.get("skills_list", [])),
        })
        partial_records.append({
            "source": fname,
            "text": text,
            "structured": structured,
            "normalized": normalized,
        })
    # 4) Merge the per-file normalized records into one candidate record
    merged = merge_normalized_records([r["normalized"] for r in partial_records])
    # 5) Skill extraction over the concatenated extracted text
    merged_text = "\n\n".join([r["text"] for r in partial_records])
    skills = extract_skills(merged_text, {
        "work_experience": merged.get("raw_sections", {}).get("work_experience", ""),
        "education": merged.get("raw_sections", {}).get("education", ""),
        "certifications": merged.get("raw_sections", {}).get("certifications", ""),
        "skills": ", ".join(merged.get("skills", [])),
    })
    # 6) Anonymization: redact PII, then render a shareable PDF
    anonymized_text, anon_map = anonymize_text(merged_text)
    anon_pdf_bytes = render_anonymized_pdf(anonymized_text)
    # 7) Quality score
    score = compute_quality_score(merged_text, merged)
    # 8) Summaries
    summaries = summarize_with_openai(merged_text)
    # 9) Output assembly. Without a supplied candidate ID, fall back to a
    #    deterministic one: the first 16 hex chars of the SHA-256 of the
    #    merged text, so identical input yields the same ID across runs.
    cid = candidate_id or hashlib.sha256(merged_text.encode("utf-8")).hexdigest()[:16]
    result_json = {
        "candidate_id": cid,
        "files": [os.path.basename(p) for p in filepaths],
        "merged": merged,
        "skills": skills,
        "quality_score": score,
        "summaries": summaries,
        "anonymization_map": anon_map,
        "notes": additional_notes,
    }
    # 10) Persist to HF Datasets (optional; skipped when DATASET_REPO is unset)
    dataset_repo = os.environ.get("DATASET_REPO")
    commit_info = None
    if dataset_repo:
        commit_info = persist_to_hf(
            dataset_repo=dataset_repo,
            record=result_json,
            anon_pdf_bytes=anon_pdf_bytes,
            parquet_path=f"candidates/{cid}.parquet",
            json_path=f"candidates/{cid}.json",
            pdf_path=f"candidates/{cid}.anon.pdf",
        )
    # gr.File outputs expect a filesystem path, not a (name, bytes) tuple;
    # write the anonymized PDF to a temp file and return its path
    anon_pdf_path = os.path.join(tempfile.gettempdir(), f"{cid}.anon.pdf")
    with open(anon_pdf_path, "wb") as f:
        f.write(anon_pdf_bytes)
    return (
        json.dumps(result_json, ensure_ascii=False, indent=2),
        json.dumps(skills, ensure_ascii=False, indent=2),  # JSON shown via Code for safe display
        json.dumps(score, ensure_ascii=False, indent=2),
        summaries.get("300chars", ""),
        summaries.get("100chars", ""),
        summaries.get("onesent", ""),
        anon_pdf_path,
        json.dumps(commit_info or {"status": "skipped (DATASET_REPO not set)"}, ensure_ascii=False, indent=2),
    )
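
# Minimal sketch of exercising the pipeline without the UI. The sample path
# and candidate ID below are assumptions for illustration; configured
# pipelines.* backends and OpenAI credentials are required:
#
#     merged_json, skills_json, score_json, s300, s100, s1, pdf_path, log = \
#         process_resumes(["samples/resume.pdf"], candidate_id="demo-001")
#     print(merged_json)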

with gr.Blocks(title=APP_TITLE) as demo:
    gr.Markdown(f"# {APP_TITLE}\nMerge multiple files → ingest/structure/summarize with OpenAI → anonymize → save to Datasets")
    with gr.Row():
        # Gradio v4: type="file" is no longer valid; use type="filepath"
        in_files = gr.Files(
            label="Resumes (PDF/image/Word/text), multiple allowed",
            file_count="multiple",
            file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".docx", ".txt"],
            type="filepath",
        )
        candidate_id = gr.Textbox(label="Candidate ID (optional; auto-generated if blank)")
        notes = gr.Textbox(label="Additional notes (optional)", lines=3)
    run_btn = gr.Button("Run")
    with gr.Tab("Structured JSON"):
        out_json = gr.Code(label="Merged output (JSON)")
    with gr.Tab("Extracted skills"):
        # Rendered via Code to work around a schema-inference bug in the JSON widget
        out_skills = gr.Code(label="Skill list (JSON)")
    with gr.Tab("Quality score"):
        out_score = gr.Code(label="Quality assessment (JSON)")
    with gr.Tab("Summaries (300/100/one sentence)"):
        out_sum_300 = gr.Textbox(label="300-character summary")
        out_sum_100 = gr.Textbox(label="100-character summary")
        out_sum_1 = gr.Textbox(label="One-sentence summary")
    with gr.Tab("Anonymized PDF"):
        out_pdf = gr.File(label="Download anonymized PDF")
    with gr.Tab("Datasets save log"):
        out_commit = gr.Code(label="Commit info")
    run_btn.click(
        process_resumes,
        inputs=[in_files, candidate_id, notes],
        outputs=[out_json, out_skills, out_score, out_sum_300, out_sum_100, out_sum_1, out_pdf, out_commit],
    )

if __name__ == "__main__":
    # share=True is set explicitly so the app can come up even in
    # environments where localhost is unreachable
    demo.launch(share=True, server_name="0.0.0.0", server_port=7860)
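    # For a purely local run without a public tunnel, an alternative
    # (a sketch, not what this Space uses) would be:
    #     demo.launch(server_name="127.0.0.1", server_port=7860)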