Spaces:
Sleeping
Sleeping
| import os | |
| import io | |
| import base64 | |
| from typing import List | |
| from pdf2image import convert_from_bytes | |
| from PIL import Image | |
| from openai import OpenAI | |
# Model identifiers can be overridden through environment variables; both
# fall back to "gpt-4o-mini" when the variable is not set.
MODEL_VISION = os.getenv("OPENAI_VISION_MODEL", "gpt-4o-mini")
MODEL_TEXT = os.getenv("OPENAI_TEXT_MODEL", "gpt-4o-mini")

# Module-wide OpenAI client instance; stays None until _client_lazy() builds it.
_client = None
def _client_lazy():
    """Return the module-wide OpenAI client, constructing it on first use.

    The API key is read from the ``OPENAI_API_KEY`` environment variable at
    the moment the client is first built; later calls reuse the same object.
    """
    global _client
    if _client is not None:
        return _client
    api_key = os.environ.get("OPENAI_API_KEY")
    _client = OpenAI(api_key=api_key)
    return _client
def _img_to_base64(img: Image.Image) -> str:
    """Encode *img* as PNG and return the PNG bytes as a base64 text string."""
    with io.BytesIO() as png_buffer:
        img.save(png_buffer, format="PNG")
        png_bytes = png_buffer.getvalue()
    encoded = base64.b64encode(png_bytes)
    return encoded.decode("utf-8")
def _pdf_to_images(pdf_bytes: bytes, dpi: int = 200, max_pages: int = 8) -> List[Image.Image]:
    """Rasterise the leading pages of a PDF into PIL images.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        dpi: Rendering resolution passed to pdf2image.
        max_pages: Maximum number of pages to render, counted from page 1.
            A value of zero or less yields an empty list.

    Returns:
        At most ``max_pages`` images, in page order.
    """
    if max_pages <= 0:
        return []
    # Ask the renderer to stop at max_pages instead of rasterising the whole
    # document and discarding the tail: a long PDF would otherwise cost
    # render time and memory for pages that are never used.
    pages = convert_from_bytes(pdf_bytes, dpi=dpi, first_page=1, last_page=max_pages)
    # The slice keeps the cap guaranteed regardless of what the renderer returns.
    return pages[:max_pages]
def extract_text_with_openai(payload: bytes, filename: str, filetype: str) -> str:
    """Extract resume text from an uploaded file via OpenAI chat completions.

    Image/PDF: the payload is converted to images and sent to the vision
    model (chat.completions) for OCR.
    txt/docx: the payload is only cleaned up as text (stable, low cost).

    NOTE(review): the text path decodes ``payload`` as UTF-8 directly. Raw
    .docx bytes are a zip archive, so presumably callers pass already
    extracted text for docx -- confirm against the caller.

    Args:
        payload: Raw bytes of the uploaded file.
        filename: Original file name; currently not used by this function.
        filetype: ``"pdf"`` or ``"image"`` select the OCR path; any other
            value selects the text clean-up path.

    Returns:
        The model's reply with surrounding whitespace removed. An empty
        string is returned, without calling the API, when the text payload
        is blank or when a PDF yields no page images. An empty string is
        also returned when the model reply carries no text content.
    """
    # Text input: clean it up as-is.
    if filetype not in {"pdf", "image"}:
        text = payload.decode("utf-8", errors="ignore")
        if not text.strip():
            # Nothing to clean up; skip the API call entirely.
            return ""
        system_prompt = "You clean up Japanese resumes, preserving headings and bullet structure and removing layout noise."
        user_prompt = (
            "以下の本文を、見出し・箇条書きを保ちつつ整形してください。不要な罫線/番号/改ページは除去:\n\n" + text
        )
        resp = _client_lazy().chat.completions.create(
            model=MODEL_TEXT,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.2,
        )
        # message.content is optional in the API response; treat None as "".
        return (resp.choices[0].message.content or "").strip()

    # Image/PDF: turn the payload into a list of images.
    if filetype == "pdf":
        images = _pdf_to_images(payload)
    else:
        images = [Image.open(io.BytesIO(payload)).convert("RGB")]
    if not images:
        # A PDF that produced no pages gives the model nothing to read.
        return ""

    image_parts = [
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{_img_to_base64(img)}"},
        }
        for img in images
    ]
    vision_msgs = [
        {"role": "system", "content": "You are an accurate Japanese OCR assistant for resumes."},
        {"role": "user", "content": [
            {
                "type": "text",
                "text": "日本語の履歴書/職務経歴書画像です。OCRして本文を日本語テキストで忠実に返してください。",
            },
            *image_parts,
        ]},
    ]
    resp = _client_lazy().chat.completions.create(
        model=MODEL_VISION,
        messages=vision_msgs,
        temperature=0.0,
    )
    # message.content is optional in the API response; treat None as "".
    return (resp.choices[0].message.content or "").strip()
def structure_with_openai(text: str) -> dict:
    """Split resume text into sections using the text model's JSON mode.

    Args:
        text: Plain resume text.

    Returns:
        A dict that always contains ``work_experience_raw``,
        ``education_raw``, ``certifications_raw`` (defaulting to ``""``) and
        ``skills_list`` (defaulting to ``[]``). Any additional keys the model
        returns are kept. When the input is blank, or the reply cannot be
        parsed into a JSON object, the whole input text is placed in
        ``work_experience_raw`` and the other fields are left empty.
    """
    import json as _json

    raw_keys = ("work_experience_raw", "education_raw", "certifications_raw")
    fallback = {"work_experience_raw": text, "education_raw": "", "certifications_raw": "", "skills_list": []}

    if not text.strip():
        # Nothing to analyse; skip the API call.
        return fallback

    client = _client_lazy()
    system_prompt = (
        "あなたは日本語レジュメの構造化アシスタントです。入力テキストからセクションを抽出し、"
        "JSONで返してください。JSONキー: work_experience_raw, education_raw, certifications_raw, skills_list。"
        "skills_list は重複除去済み配列。各 *_raw は原文抜粋で構いません。"
    )
    user_prompt = "以下のテキストを解析し、指定のJSONキーで返してください。\n\n" + text
    resp = client.chat.completions.create(
        model=MODEL_TEXT,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.2,
        response_format={"type": "json_object"},
    )

    try:
        data = _json.loads(resp.choices[0].message.content)
    except (TypeError, ValueError, IndexError):
        # TypeError: content is None; ValueError: malformed JSON
        # (JSONDecodeError subclasses it); IndexError: no choices returned.
        return fallback
    if not isinstance(data, dict):
        # Valid JSON that is not an object cannot carry the expected keys.
        return fallback

    # Fill in keys that are missing or explicitly null.
    for key in raw_keys:
        if data.get(key) is None:
            data[key] = ""
    if data.get("skills_list") is None:
        data["skills_list"] = []
    return data
def summarize_with_openai(text: str) -> dict:
    """Summarise resume text at three granularities.

    The model is asked to return a JSON object with one entry per
    granularity so each field holds its own summary. If the reply is not
    usable JSON, or a field is missing, the affected fields are derived by
    simple slicing of the available text instead of failing.

    Args:
        text: Plain resume text.

    Returns:
        A dict with the string keys ``"300chars"``, ``"100chars"`` and
        ``"onesent"``. All three are empty strings when ``text`` is blank
        (no API call is made in that case).
    """
    # Function-scope import keeps the module's import block untouched.
    import json as _json

    keys = ("300chars", "100chars", "onesent")
    if not text.strip():
        return dict.fromkeys(keys, "")

    client = _client_lazy()
    system_prompt = "You write crisp, factual Japanese executive summaries."
    user_prompt = (
        "以下の候補者レジュメ本文を、(1)300字、(2)100字、(3)1文 の3粒度で日本語要約してください。"
        "不要な記号は避け、事実を簡潔に述べてください。"
        "結果はJSONオブジェクトで返してください。JSONキー: 300chars, 100chars, onesent。\n\n" + text
    )
    resp = client.chat.completions.create(
        model=MODEL_TEXT,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.2,
        response_format={"type": "json_object"},
    )
    # message.content is optional in the API response; treat None as "".
    full = (resp.choices[0].message.content or "").strip()

    try:
        parsed = _json.loads(full)
    except ValueError:
        parsed = None

    found = {}
    if isinstance(parsed, dict):
        for key in keys:
            value = parsed.get(key)
            if isinstance(value, str) and value.strip():
                found[key] = value.strip()
    if len(found) == len(keys):
        return {key: found[key] for key in keys}

    # Rule-based fallback so a malformed reply still yields a result. It
    # works on the longest-granularity summary that was returned, or on the
    # raw reply when none was.
    base = next((found[key] for key in keys if key in found), full)
    one_sent = base.split("。")[0] + "。" if "。" in base else base
    derived = {
        # Slices count characters, so these are generous upper bounds rather
        # than exact 300/100-character cuts.
        "300chars": base[:600],
        "100chars": base[:120],
        "onesent": one_sent,
    }
    return {key: found.get(key, derived[key]) for key in keys}