Corin1998 committed on
Commit
a7f8bc2
·
verified ·
1 Parent(s): ccf8168

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -71
app.py CHANGED
@@ -2,9 +2,7 @@ import os
2
  import io
3
  import json
4
  import hashlib
5
- import gradio # 一部の前方参照バグ回避用
6
  import gradio as gr
7
- from typing import List
8
 
9
  from pipelines.openai_ingest import (
10
  extract_text_with_openai,
@@ -14,43 +12,7 @@ from pipelines.openai_ingest import (
14
  from pipelines.parsing import normalize_resume
15
  from pipelines.merge import merge_normalized_records
16
  from pipelines.skills import extract_skills
17
-
18
- # --- 匿名化のフォールバック(pipelines/anonymize.py が空/未実装でも動く) ---
19
- try:
20
- from pipelines.anonymize import anonymize_text, render_anonymized_pdf # type: ignore
21
- except Exception:
22
- import re
23
- try:
24
- from reportlab.pdfgen import canvas
25
- from reportlab.lib.pagesizes import A4
26
- except Exception:
27
- canvas = None
28
- A4 = None
29
-
30
- def anonymize_text(text: str):
31
- masked = re.sub(r"([A-Za-z0-9._%+-]+)@([A-Za-z0-9.-]+\.[A-Za-z]{2,})", r"***@\2", text)
32
- masked = re.sub(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}", "***-****-****", masked)
33
- masked = re.sub(r"(氏名[::]?\s*)(\S+)", r"\1***", masked)
34
- return masked, {"fallback": True}
35
-
36
- def render_anonymized_pdf(text: str) -> bytes:
37
- if canvas is None:
38
- return text.encode("utf-8")
39
- buf = io.BytesIO()
40
- c = canvas.Canvas(buf, pagesize=A4)
41
- width, height = A4
42
- m = 40
43
- y = height - m
44
- for line in text.splitlines() or ["(no content)"]:
45
- if y < m:
46
- c.showPage()
47
- y = height - m
48
- c.drawString(m, y, line[:95])
49
- y -= 14
50
- c.save()
51
- return buf.getvalue()
52
- # ----------------------------------------------------------------------
53
-
54
  from pipelines.scoring import compute_quality_score
55
  from pipelines.storage import persist_to_hf
56
  from pipelines.utils import detect_filetype, load_doc_text
@@ -58,34 +20,26 @@ from pipelines.utils import detect_filetype, load_doc_text
58
  APP_TITLE = "候補者インテーク & レジュメ標準化(OpenAI版)"
59
 
60
 
61
- def _read_bytes(path: str) -> bytes:
62
- with open(path, "rb") as f:
63
- return f.read()
64
-
65
-
66
- def process_resumes(files: List[str], candidate_id: str = "", additional_notes: str = ""):
67
- """
68
- files: gr.Files(type='filepath') から渡るファイルパスのリスト
69
- """
70
  if not files:
71
  raise gr.Error("少なくとも1ファイルをアップロードしてください。")
72
 
73
  partial_records = []
74
  raw_texts = []
75
 
76
- for path in files:
77
- filename = os.path.basename(path)
78
- raw_bytes = _read_bytes(path)
79
- filetype = detect_filetype(filename, raw_bytes)
80
 
81
- # 1) 抽出
82
  if filetype in {"pdf", "image"}:
83
- text = extract_text_with_openai(raw_bytes, filename=filename, filetype=filetype)
84
  else:
85
  base_text = load_doc_text(filetype, raw_bytes)
86
- text = extract_text_with_openai(base_text.encode("utf-8"), filename=filename, filetype="txt")
87
 
88
- raw_texts.append({"filename": filename, "text": text})
89
 
90
  # 2) 構造化→正規化
91
  structured = structure_with_openai(text)
@@ -96,7 +50,7 @@ def process_resumes(files: List[str], candidate_id: str = "", additional_notes:
96
  "skills": ", ".join(structured.get("skills_list", [])),
97
  })
98
  partial_records.append({
99
- "source": filename,
100
  "text": text,
101
  "structured": structured,
102
  "normalized": normalized,
@@ -124,7 +78,7 @@ def process_resumes(files: List[str], candidate_id: str = "", additional_notes:
124
  # 7) 要約
125
  summaries = summarize_with_openai(merged_text)
126
 
127
- # 8) 出力まとめ
128
  cid = candidate_id or hashlib.sha256(merged_text.encode("utf-8")).hexdigest()[:16]
129
  result_json = {
130
  "candidate_id": cid,
@@ -152,21 +106,16 @@ def process_resumes(files: List[str], candidate_id: str = "", additional_notes:
152
 
153
  anon_pdf = (f"{cid}.anon.pdf", anon_pdf_bytes)
154
 
155
- # UI用:すべて文字列化して返す(gr.JSON を使わない)
156
- out_json_str = json.dumps(result_json, ensure_ascii=False, indent=2)
157
- out_skills_str = json.dumps(skills, ensure_ascii=False, indent=2)
158
- out_score_str = json.dumps(score, ensure_ascii=False, indent=2)
159
- out_commit_str = json.dumps(commit_info or {"status": "skipped (DATASET_REPO not set)"}, ensure_ascii=False, indent=2)
160
-
161
  return (
162
- out_json_str,
163
- out_skills_str,
164
- out_score_str,
165
  summaries.get("300chars", ""),
166
  summaries.get("100chars", ""),
167
  summaries.get("onesent", ""),
168
  anon_pdf,
169
- out_commit_str,
170
  )
171
 
172
 
@@ -178,7 +127,7 @@ with gr.Blocks(title=APP_TITLE) as demo:
178
  label="レジュメ類 (PDF/画像/Word/テキスト) 複数可",
179
  file_count="multiple",
180
  file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".docx", ".txt"],
181
- type="filepath", # ← 重要:'file' ではなく 'filepath'
182
  )
183
  candidate_id = gr.Textbox(label="候補者ID(任意。未入力なら自動生成)")
184
  notes = gr.Textbox(label="補足メモ(任意)", lines=3)
@@ -189,7 +138,7 @@ with gr.Blocks(title=APP_TITLE) as demo:
189
  out_json = gr.Code(label="統合出力 (JSON)")
190
 
191
  with gr.Tab("抽出スキル"):
192
- out_skills = gr.Code(label="スキル一覧 (JSON)") # ← gr.JSON を避ける
193
 
194
  with gr.Tab("品質スコア"):
195
  out_score = gr.Code(label="品質評価 (JSON)")
@@ -212,6 +161,6 @@ with gr.Blocks(title=APP_TITLE) as demo:
212
  api_name="run",
213
  )
214
 
 
215
  if __name__ == "__main__":
216
- # 到達性のため share=True 推奨
217
  demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
 
2
  import io
3
  import json
4
  import hashlib
 
5
  import gradio as gr
 
6
 
7
  from pipelines.openai_ingest import (
8
  extract_text_with_openai,
 
12
  from pipelines.parsing import normalize_resume
13
  from pipelines.merge import merge_normalized_records
14
  from pipelines.skills import extract_skills
15
+ from pipelines.anonymize import anonymize_text, render_anonymized_pdf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  from pipelines.scoring import compute_quality_score
17
  from pipelines.storage import persist_to_hf
18
  from pipelines.utils import detect_filetype, load_doc_text
 
20
  APP_TITLE = "候補者インテーク & レジュメ標準化(OpenAI版)"
21
 
22
 
23
+ def process_resumes(files, candidate_id: str, additional_notes: str = ""):
 
 
 
 
 
 
 
 
24
  if not files:
25
  raise gr.Error("少なくとも1ファイルをアップロードしてください。")
26
 
27
  partial_records = []
28
  raw_texts = []
29
 
30
+ for p in files: # gr.Files(type="filepath") でパスが来る
31
+ raw_bytes = open(p, "rb").read()
32
+ fname = os.path.basename(p)
33
+ filetype = detect_filetype(fname, raw_bytes)
34
 
35
+ # 1) テキスト抽出
36
  if filetype in {"pdf", "image"}:
37
+ text = extract_text_with_openai(raw_bytes, filename=fname, filetype=filetype)
38
  else:
39
  base_text = load_doc_text(filetype, raw_bytes)
40
+ text = extract_text_with_openai(base_text.encode("utf-8"), filename=fname, filetype="txt")
41
 
42
+ raw_texts.append({"filename": fname, "text": text})
43
 
44
  # 2) 構造化→正規化
45
  structured = structure_with_openai(text)
 
50
  "skills": ", ".join(structured.get("skills_list", [])),
51
  })
52
  partial_records.append({
53
+ "source": fname,
54
  "text": text,
55
  "structured": structured,
56
  "normalized": normalized,
 
78
  # 7) 要約
79
  summaries = summarize_with_openai(merged_text)
80
 
81
+ # 8) 構造化出力
82
  cid = candidate_id or hashlib.sha256(merged_text.encode("utf-8")).hexdigest()[:16]
83
  result_json = {
84
  "candidate_id": cid,
 
106
 
107
  anon_pdf = (f"{cid}.anon.pdf", anon_pdf_bytes)
108
 
109
+ # UI には全て文字列(JSONダンプ)で返す
 
 
 
 
 
110
  return (
111
+ json.dumps(result_json, ensure_ascii=False, indent=2),
112
+ json.dumps(skills, ensure_ascii=False, indent=2),
113
+ json.dumps(score, ensure_ascii=False, indent=2),
114
  summaries.get("300chars", ""),
115
  summaries.get("100chars", ""),
116
  summaries.get("onesent", ""),
117
  anon_pdf,
118
+ json.dumps(commit_info or {"status": "skipped (DATASET_REPO not set)"}, ensure_ascii=False, indent=2),
119
  )
120
 
121
 
 
127
  label="レジュメ類 (PDF/画像/Word/テキスト) 複数可",
128
  file_count="multiple",
129
  file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".docx", ".txt"],
130
+ type="filepath", # ←重要
131
  )
132
  candidate_id = gr.Textbox(label="候補者ID(任意。未入力なら自動生成)")
133
  notes = gr.Textbox(label="補足メモ(任意)", lines=3)
 
138
  out_json = gr.Code(label="統合出力 (JSON)")
139
 
140
  with gr.Tab("抽出スキル"):
141
+ out_skills = gr.Code(label="スキル一覧 (JSON)") # ← gr.JSON を使わない
142
 
143
  with gr.Tab("品質スコア"):
144
  out_score = gr.Code(label="品質評価 (JSON)")
 
161
  api_name="run",
162
  )
163
 
164
+
165
  if __name__ == "__main__":
 
166
  demo.launch(server_name="0.0.0.0", server_port=7860, share=True)