Corin1998 committed on
Commit
d9468cf
·
verified ·
1 Parent(s): 36a7531

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +173 -0
app.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import hashlib
import io
import json
import os
import tempfile

import gradio as gr

from pipelines.anonymize import anonymize_text, render_anonymized_pdf
from pipelines.merge import merge_normalized_records
from pipelines.openai_ingest import (
    extract_text_with_openai,
    structure_with_openai,
    summarize_with_openai,
)
from pipelines.parsing import normalize_resume
from pipelines.scoring import compute_quality_score
from pipelines.skills import extract_skills
from pipelines.storage import persist_to_hf
from pipelines.utils import detect_filetype, load_doc_text

# Title used for the Gradio Blocks page and for the Markdown heading.
APP_TITLE = "候補者インテーク & レジュメ標準化(OpenAI版)"


def _read_file_bytes(path: str) -> bytes:
    """Return the complete contents of the file at ``path`` as bytes.

    The file is opened in binary mode, so no text decoding is applied.
    """
    with open(path, "rb") as f:
        return f.read()


def _safe_file_stem(candidate_id: str) -> str:
    """Reduce ``candidate_id`` to characters that are safe inside a file name.

    Everything except letters, digits, ``-``, ``_`` and ``.`` (notably path
    separators) is replaced with ``_`` so that a user-supplied ID cannot
    change the directory a file is written to.
    """
    stem = "".join(
        ch if ch.isalnum() or ch in "-_." else "_" for ch in candidate_id
    )
    # Strip dots at the edges so "." / ".." never survive as a file stem.
    return stem.strip(".") or "candidate"


def _join_as_text(items) -> str:
    """Join a possibly missing list of entries into a ", "-separated string."""
    if isinstance(items, str):
        return items
    return ", ".join(str(item) for item in (items or []))


def _ingest_resume_file(path):
    """Extract, structure and normalize a single uploaded file.

    Returns a dict with the keys ``source`` (base file name), ``text``,
    ``structured`` and ``normalized``.
    """
    raw_bytes = _read_file_bytes(path)
    fname = os.path.basename(path)
    filetype = detect_filetype(fname, raw_bytes)

    # 1) Text extraction: images/PDFs go through OpenAI Vision OCR; docx/txt
    #    use the raw document text, then OpenAI cleanup.
    if filetype in {"pdf", "image"}:
        text = extract_text_with_openai(
            raw_bytes, filename=fname, filetype=filetype
        )
    else:
        base_text = load_doc_text(filetype, raw_bytes)
        text = extract_text_with_openai(
            base_text.encode("utf-8"), filename=fname, filetype="txt"
        )

    # 2) Section structuring with OpenAI, then rule-based normalization.
    structured = structure_with_openai(text)
    normalized = normalize_resume({
        "work_experience": structured.get("work_experience_raw", ""),
        "education": structured.get("education_raw", ""),
        "certifications": structured.get("certifications_raw", ""),
        # The list comes from model output, so tolerate None / non-strings.
        "skills": _join_as_text(structured.get("skills_list")),
    })
    return {
        "source": fname,
        "text": text,
        "structured": structured,
        "normalized": normalized,
    }


def _write_temp_pdf(file_stem: str, pdf_bytes) -> str:
    """Write ``pdf_bytes`` to ``<tmpdir>/<file_stem>.anon.pdf``; return the path."""
    # assumes render_anonymized_pdf returns bytes — TODO confirm
    out_dir = tempfile.mkdtemp(prefix="anon_pdf_")
    out_path = os.path.join(out_dir, f"{file_stem}.anon.pdf")
    with open(out_path, "wb") as fh:
        fh.write(pdf_bytes)
    return out_path


def process_resumes(filepaths, candidate_id: str, additional_notes: str = ""):
    """Run the full intake pipeline over the uploaded resume files.

    Each file is text-extracted, structured and normalized; the per-file
    records are merged into one candidate, then skills, an anonymized PDF,
    a quality score and summaries are produced. When the ``DATASET_REPO``
    environment variable is set, the result is also passed to
    ``persist_to_hf``.

    Args:
        filepaths: Paths of the uploaded files.
        candidate_id: Optional candidate ID. When empty or whitespace-only,
            the first 16 hex digits of the SHA-256 of the merged text are
            used instead.
        additional_notes: Free-form notes stored under ``"notes"``.

    Returns:
        An 8-tuple matching the Gradio outputs: result JSON, skills JSON,
        score JSON, the three summaries (``300chars``, ``100chars``,
        ``onesent``), the path of the anonymized PDF file, and the commit
        info JSON (or a "skipped" status when ``DATASET_REPO`` is unset).

    Raises:
        gr.Error: If no file was uploaded.
    """
    if not filepaths:
        raise gr.Error("少なくとも1ファイルをアップロードしてください。")

    partial_records = [_ingest_resume_file(path) for path in filepaths]

    # 3) Merge (multiple files -> one candidate).
    merged = merge_normalized_records(
        [record["normalized"] for record in partial_records]
    )

    # 4) Skill extraction.
    merged_text = "\n\n".join(record["text"] for record in partial_records)
    raw_sections = merged.get("raw_sections", {})
    skills = extract_skills(merged_text, {
        "work_experience": raw_sections.get("work_experience", ""),
        "education": raw_sections.get("education", ""),
        "certifications": raw_sections.get("certifications", ""),
        "skills": _join_as_text(merged.get("skills")),
    })

    # 5) Anonymization.
    anonymized_text, anon_map = anonymize_text(merged_text)
    anon_pdf_bytes = render_anonymized_pdf(anonymized_text)

    # 6) Quality score.
    score = compute_quality_score(merged_text, merged)

    # 7) Summaries (300 chars / 100 chars / one sentence).
    summaries = summarize_with_openai(merged_text)

    # 8) Structured output.
    resolved_id = (candidate_id or "").strip()
    if not resolved_id:
        digest = hashlib.sha256(merged_text.encode("utf-8")).hexdigest()
        resolved_id = digest[:16]
    result_json = {
        "candidate_id": resolved_id,
        "files": [os.path.basename(path) for path in filepaths],
        "merged": merged,
        "skills": skills,
        "quality_score": score,
        "summaries": summaries,
        # NOTE(review): presumably maps placeholders back to the original
        # values; confirm it is meant to be shown and persisted next to the
        # anonymized output.
        "anonymization_map": anon_map,
        "notes": additional_notes,
    }

    # The ID may be user input, so never interpolate it raw into a path.
    file_stem = _safe_file_stem(resolved_id)

    # 9) Persist to HF Datasets.
    dataset_repo = os.environ.get("DATASET_REPO")
    commit_info = None
    if dataset_repo:
        commit_info = persist_to_hf(
            dataset_repo=dataset_repo,
            record=result_json,
            anon_pdf_bytes=anon_pdf_bytes,
            parquet_path=f"candidates/{file_stem}.parquet",
            json_path=f"candidates/{file_stem}.json",
            pdf_path=f"candidates/{file_stem}.anon.pdf",
        )

    # gr.File outputs take a file path, not a (name, bytes) pair, so the PDF
    # is written to a temporary file whose name carries the candidate ID.
    anon_pdf = _write_temp_pdf(file_stem, anon_pdf_bytes)

    # All outputs are strings or files (safe for Gradio's API info
    # generation).
    return (
        json.dumps(result_json, ensure_ascii=False, indent=2),
        json.dumps(skills, ensure_ascii=False, indent=2),
        json.dumps(score, ensure_ascii=False, indent=2),
        summaries.get("300chars", ""),
        summaries.get("100chars", ""),
        summaries.get("onesent", ""),
        anon_pdf,
        json.dumps(
            commit_info or {"status": "skipped (DATASET_REPO not set)"},
            ensure_ascii=False,
            indent=2,
        ),
    )


# Gradio UI: inputs, a run button, one tab per output, and the click wiring.
# NOTE(review): the original indentation was lost, so the nesting below is
# reconstructed; in particular, whether the two textboxes belong inside the
# Row is an assumption that only affects layout — confirm.
with gr.Blocks(title=APP_TITLE) as demo:
    gr.Markdown(f"# {APP_TITLE}\n複数ファイルを統合→OpenAIで読み込み/構造化/要約→匿名化→Datasets保存")

    with gr.Row():
        in_files = gr.Files(
            label="レジュメ類 (PDF/画像/Word/テキスト) 複数可",
            file_count="multiple",
            file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".docx", ".txt"],
            type="filepath",  # Important: always receive uploads as file paths.
        )
        candidate_id = gr.Textbox(label="候補者ID(任意。未入力なら自動生成)")
        notes = gr.Textbox(label="補足メモ(任意)", lines=3)

    run_btn = gr.Button("実行")

    with gr.Tab("構造化JSON"):
        out_json = gr.Code(label="統合出力 (JSON)")

    with gr.Tab("抽出スキル"):
        out_skills = gr.Code(label="スキル一覧 (JSON)")

    with gr.Tab("品質スコア"):
        out_score = gr.Code(label="品質評価 (JSON)")

    with gr.Tab("要約 (300/100/1文)"):
        out_sum_300 = gr.Textbox(label="300字要約")
        out_sum_100 = gr.Textbox(label="100字要約")
        out_sum_1 = gr.Textbox(label="1文要約")

    with gr.Tab("匿名PDF"):
        out_pdf = gr.File(label="匿名PDFダウンロード")

    with gr.Tab("Datasets 保存ログ"):
        out_commit = gr.Code(label="コミット情報")

    # The outputs list follows the order of the tuple that process_resumes
    # returns. The listener is registered inside the Blocks context.
    run_btn.click(
        process_resumes,
        inputs=[in_files, candidate_id, notes],
        outputs=[
            out_json,
            out_skills,
            out_score,
            out_sum_300,
            out_sum_100,
            out_sum_1,
            out_pdf,
            out_commit,
        ],
    )


if __name__ == "__main__":
    # Works both on Spaces and locally. Host and port are read from
    # GRADIO_SERVER_NAME / GRADIO_SERVER_PORT, defaulting to 0.0.0.0:7860.
    demo.launch(
        server_name=os.getenv("GRADIO_SERVER_NAME", "0.0.0.0"),
        server_port=int(os.getenv("GRADIO_SERVER_PORT", "7860")),
        share=False,
    )