import os
import io
import json
import hashlib
import tempfile
import gradio as gr
from pipelines.openai_ingest import (
extract_text_with_openai,
structure_with_openai,
summarize_with_openai,
)
from pipelines.parsing import normalize_resume
from pipelines.merge import merge_normalized_records
from pipelines.skills import extract_skills
from pipelines.anonymize import anonymize_text, render_anonymized_pdf
from pipelines.scoring import compute_quality_score
from pipelines.storage import persist_to_hf
from pipelines.utils import detect_filetype, load_doc_text
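
# Configuration note (an assumption about how the helpers are wired):
# pipelines.openai_ingest is expected to read an OpenAI API key from the
# environment (e.g. OPENAI_API_KEY), and DATASET_REPO, read further below,
# enables the optional persistence step when set.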
APP_TITLE = "Candidate Intake & Resume Standardization (OpenAI edition)"

def process_resumes(filepaths, candidate_id: str, additional_notes: str = ""):
    if not filepaths:
        raise gr.Error("Please upload at least one file.")
    partial_records = []
    raw_texts = []
    # gr.Files(type="filepath") passes plain string paths
    for path in filepaths:
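        # Some Gradio builds hand back upload objects that expose the path
        # via a .name attribute rather than a plain string; normalize
        # defensively (an assumption to tolerate version drift; a no-op for
        # plain string paths).
        path = getattr(path, "name", path)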
        with open(path, "rb") as f:
            raw_bytes = f.read()
        fname = os.path.basename(path)
        filetype = detect_filetype(fname, raw_bytes)
        # 1) Text extraction: PDFs and images go to OpenAI as-is; other
        #    formats are decoded locally first, then passed through as txt
        if filetype in {"pdf", "image"}:
            text = extract_text_with_openai(raw_bytes, filename=fname, filetype=filetype)
        else:
            base_text = load_doc_text(filetype, raw_bytes)
            text = extract_text_with_openai(base_text.encode("utf-8"), filename=fname, filetype="txt")
        raw_texts.append({"filename": fname, "text": text})
        # 2) Structuring → 3) Normalization
        structured = structure_with_openai(text)
        normalized = normalize_resume({
            "work_experience": structured.get("work_experience_raw", ""),
            "education": structured.get("education_raw", ""),
            "certifications": structured.get("certifications_raw", ""),
            "skills": ", ".join(structured.get("skills_list", [])),
        })
        partial_records.append({
            "source": fname,
            "text": text,
            "structured": structured,
            "normalized": normalized,
        })
    # 4) Merge the per-file normalized records into one candidate record
    merged = merge_normalized_records([r["normalized"] for r in partial_records])
    # 5) Skill extraction over the concatenated extracted text
    merged_text = "\n\n".join([r["text"] for r in partial_records])
    skills = extract_skills(merged_text, {
        "work_experience": merged.get("raw_sections", {}).get("work_experience", ""),
        "education": merged.get("raw_sections", {}).get("education", ""),
        "certifications": merged.get("raw_sections", {}).get("certifications", ""),
        "skills": ", ".join(merged.get("skills", [])),
    })
    # 6) Anonymization: redact PII, then render a shareable PDF
    anonymized_text, anon_map = anonymize_text(merged_text)
    anon_pdf_bytes = render_anonymized_pdf(anonymized_text)
    # 7) Quality score
    score = compute_quality_score(merged_text, merged)
    # 8) Summaries
    summaries = summarize_with_openai(merged_text)
    # 9) Output assembly. Without a supplied candidate ID, fall back to a
    #    deterministic one: the first 16 hex chars of the SHA-256 of the
    #    merged text, so identical input yields the same ID across runs.
    cid = candidate_id or hashlib.sha256(merged_text.encode("utf-8")).hexdigest()[:16]
    result_json = {
        "candidate_id": cid,
        "files": [os.path.basename(p) for p in filepaths],
        "merged": merged,
        "skills": skills,
        "quality_score": score,
        "summaries": summaries,
        "anonymization_map": anon_map,
        "notes": additional_notes,
    }
    # 10) Persist to HF Datasets (optional; skipped when DATASET_REPO is unset)
    dataset_repo = os.environ.get("DATASET_REPO")
    commit_info = None
    if dataset_repo:
        commit_info = persist_to_hf(
            dataset_repo=dataset_repo,
            record=result_json,
            anon_pdf_bytes=anon_pdf_bytes,
            parquet_path=f"candidates/{cid}.parquet",
            json_path=f"candidates/{cid}.json",
            pdf_path=f"candidates/{cid}.anon.pdf",
        )
    # gr.File outputs expect a filesystem path, not a (name, bytes) tuple;
    # write the anonymized PDF to a temp file and return its path
    anon_pdf_path = os.path.join(tempfile.gettempdir(), f"{cid}.anon.pdf")
    with open(anon_pdf_path, "wb") as f:
        f.write(anon_pdf_bytes)
    return (
        json.dumps(result_json, ensure_ascii=False, indent=2),
        json.dumps(skills, ensure_ascii=False, indent=2),  # JSON shown via Code for safe display
        json.dumps(score, ensure_ascii=False, indent=2),
        summaries.get("300chars", ""),
        summaries.get("100chars", ""),
        summaries.get("onesent", ""),
        anon_pdf_path,
        json.dumps(commit_info or {"status": "skipped (DATASET_REPO not set)"}, ensure_ascii=False, indent=2),
    )
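
# Minimal sketch of exercising the pipeline without the UI. The sample path
# and candidate ID below are assumptions for illustration; configured
# pipelines.* backends and OpenAI credentials are required:
#
#     merged_json, skills_json, score_json, s300, s100, s1, pdf_path, log = \
#         process_resumes(["samples/resume.pdf"], candidate_id="demo-001")
#     print(merged_json)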

with gr.Blocks(title=APP_TITLE) as demo:
    gr.Markdown(f"# {APP_TITLE}\nMerge multiple files → ingest/structure/summarize with OpenAI → anonymize → save to Datasets")
    with gr.Row():
        # Gradio v4: type="file" is no longer valid; use type="filepath"
        in_files = gr.Files(
            label="Resumes (PDF/image/Word/text), multiple allowed",
            file_count="multiple",
            file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".docx", ".txt"],
            type="filepath",
        )
        candidate_id = gr.Textbox(label="Candidate ID (optional; auto-generated if blank)")
        notes = gr.Textbox(label="Additional notes (optional)", lines=3)
    run_btn = gr.Button("Run")
    with gr.Tab("Structured JSON"):
        out_json = gr.Code(label="Merged output (JSON)")
    with gr.Tab("Extracted skills"):
        # Rendered via Code to work around a schema-inference bug in the JSON widget
        out_skills = gr.Code(label="Skill list (JSON)")
    with gr.Tab("Quality score"):
        out_score = gr.Code(label="Quality assessment (JSON)")
    with gr.Tab("Summaries (300/100/one sentence)"):
        out_sum_300 = gr.Textbox(label="300-character summary")
        out_sum_100 = gr.Textbox(label="100-character summary")
        out_sum_1 = gr.Textbox(label="One-sentence summary")
    with gr.Tab("Anonymized PDF"):
        out_pdf = gr.File(label="Download anonymized PDF")
    with gr.Tab("Datasets save log"):
        out_commit = gr.Code(label="Commit info")
    run_btn.click(
        process_resumes,
        inputs=[in_files, candidate_id, notes],
        outputs=[out_json, out_skills, out_score, out_sum_300, out_sum_100, out_sum_1, out_pdf, out_commit],
    )

if __name__ == "__main__":
    # share=True is set explicitly so the app can come up even in
    # environments where localhost is unreachable
    demo.launch(share=True, server_name="0.0.0.0", server_port=7860)
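    # For a purely local run without a public tunnel, an alternative
    # (a sketch, not what this Space uses) would be:
    #     demo.launch(server_name="127.0.0.1", server_port=7860)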