# NOTE(review): removed non-code residue scraped from a file-viewer page
# (status lines, git hashes, and a line-number gutter) that preceded the module.
import os
import io
import base64
from typing import List
from pdf2image import convert_from_bytes
from PIL import Image
from openai import OpenAI
# Model names are overridable via environment variables; both default to the
# low-cost gpt-4o-mini model.
MODEL_VISION = os.environ.get("OPENAI_VISION_MODEL", "gpt-4o-mini")
MODEL_TEXT = os.environ.get("OPENAI_TEXT_MODEL", "gpt-4o-mini")
# Module-level cache for the lazily constructed OpenAI client (see _client_lazy).
_client = None
def _client_lazy():
    """Return the shared OpenAI client, creating it on first use.

    The instance is cached in the module-level ``_client`` so repeated
    calls reuse one client. The API key is read from ``OPENAI_API_KEY``.
    """
    global _client
    if _client is not None:
        return _client
    _client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    return _client
def _img_to_base64(img: Image.Image) -> str:
buf = io.BytesIO()
img.save(buf, format="PNG")
return base64.b64encode(buf.getvalue()).decode("utf-8")
def _pdf_to_images(pdf_bytes: bytes, dpi: int = 200, max_pages: int = 8) -> List[Image.Image]:
    """Rasterize a PDF into at most ``max_pages`` PIL images at the given DPI."""
    return convert_from_bytes(pdf_bytes, dpi=dpi)[:max_pages]
def extract_text_with_openai(payload: bytes, filename: str, filetype: str) -> str:
    """Extract plain text from an uploaded resume file.

    PDFs and images are rasterized and OCR'd via the vision model;
    every other filetype is decoded as UTF-8 and reformatted by the
    text model (stable and low-cost).

    Args:
        payload: Raw file bytes.
        filename: Original file name (unused here; kept for API compatibility).
        filetype: ``"pdf"``, ``"image"``, or anything else for plain text.

    Returns:
        The extracted/cleaned text; empty string if the model returns nothing.
    """
    client = _client_lazy()

    # Plain-text route: no OCR needed, just ask the text model to tidy it up.
    if filetype not in {"pdf", "image"}:
        text = payload.decode("utf-8", errors="ignore")
        sys_prompt = "You clean up Japanese resumes, preserving headings and bullet structure and removing layout noise."
        user_prompt = (
            "以下の本文を、見出し・箇条書きを保ちつつ整形してください。不要な罫線/番号/改ページは除去:\n\n" + text
        )
        resp = client.chat.completions.create(
            model=MODEL_TEXT,
            messages=[
                {"role": "system", "content": sys_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.2,
        )
        # message.content may be None (refusal / empty completion);
        # guard before calling .strip() to avoid AttributeError.
        return (resp.choices[0].message.content or "").strip()

    # OCR route: render the payload into a list of PIL images.
    if filetype == "pdf":
        images = _pdf_to_images(payload)
    else:
        images = [Image.open(io.BytesIO(payload)).convert("RGB")]

    image_parts = [
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{_img_to_base64(img)}"},
        }
        for img in images
    ]
    vision_msgs = [
        {"role": "system", "content": "You are an accurate Japanese OCR assistant for resumes."},
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "日本語の履歴書/職務経歴書画像です。OCRして本文を日本語テキストで忠実に返してください。"
                },
                *image_parts,
            ],
        },
    ]
    resp = client.chat.completions.create(
        model=MODEL_VISION,
        messages=vision_msgs,
        temperature=0.0,  # deterministic output for OCR
    )
    return (resp.choices[0].message.content or "").strip()
def structure_with_openai(text: str) -> dict:
    """Split Japanese resume text into structured sections via the text model.

    Args:
        text: The raw resume text to analyze.

    Returns:
        A dict guaranteed to contain the string keys ``work_experience_raw``,
        ``education_raw``, ``certifications_raw`` and the list key
        ``skills_list`` (de-duplicated, original order preserved). On any
        parse failure the full input text is kept under
        ``work_experience_raw`` so no content is lost.
    """
    import json as _json

    client = _client_lazy()
    sys = (
        "あなたは日本語レジュメの構造化アシスタントです。入力テキストからセクションを抽出し、"
        "JSONで返してください。JSONキー: work_experience_raw, education_raw, certifications_raw, skills_list。"
        "skills_list は重複除去済み配列。各 *_raw は原文抜粋で構いません。"
    )
    user = "以下のテキストを解析し、指定のJSONキーで返してください。\n\n" + text
    resp = client.chat.completions.create(
        model=MODEL_TEXT,
        messages=[
            {"role": "system", "content": sys},
            {"role": "user", "content": user},
        ],
        temperature=0.2,
        response_format={"type": "json_object"},
    )
    fallback = {
        "work_experience_raw": text,
        "education_raw": "",
        "certifications_raw": "",
        "skills_list": [],
    }
    try:
        # content may be None; `or ""` turns that into a clean parse failure
        # instead of a TypeError swallowed by a broad except.
        data = _json.loads(resp.choices[0].message.content or "")
    except ValueError:
        data = fallback
    # json_object mode should yield an object, but defend against a bare
    # array/scalar, which would break setdefault below.
    if not isinstance(data, dict):
        data = fallback
    for k in ("work_experience_raw", "education_raw", "certifications_raw"):
        data.setdefault(k, "")
    data.setdefault("skills_list", [])
    # Enforce the de-duplication the prompt promises (order-preserving).
    if isinstance(data["skills_list"], list):
        try:
            data["skills_list"] = list(dict.fromkeys(data["skills_list"]))
        except TypeError:
            pass  # unhashable entries: leave the list untouched
    return data
def summarize_with_openai(text: str) -> dict:
    """Summarize resume text at three granularities via the text model.

    Args:
        text: The resume body to summarize.

    Returns:
        Dict with keys ``"300chars"`` (truncated to 600 chars for margin),
        ``"100chars"`` (truncated to 120 chars), and ``"onesent"`` (the
        first Japanese sentence of the model output).
    """
    client = _client_lazy()
    sys = "You write crisp, factual Japanese executive summaries."
    user = (
        "以下の候補者レジュメ本文を、(1)300字、(2)100字、(3)1文 の3粒度で日本語要約してください。"
        "不要な記号は避け、事実を簡潔に述べてください。\n\n" + text
    )
    resp = client.chat.completions.create(
        model=MODEL_TEXT,
        messages=[{"role": "system", "content": sys}, {"role": "user", "content": user}],
        temperature=0.2,
    )
    full = resp.choices[0].message.content.strip()
    # Rule-based extraction of the first sentence — robust even when the
    # model's formatting drifts from the requested three-part layout.
    if "。" in full:
        one_sent = full.split("。")[0] + "。"
    else:
        one_sent = full
    return {
        "300chars": full[:600],  # roughly 300 Japanese chars with margin
        "100chars": full[:120],
        "onesent": one_sent,
    }