File size: 4,957 Bytes
d4d986b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fb38314
d4d986b
 
 
 
 
fb38314
 
 
 
d4d986b
 
fb38314
 
d4d986b
fb38314
 
 
 
 
d4d986b
fb38314
 
 
d4d986b
fb38314
d4d986b
fb38314
d4d986b
fb38314
 
 
 
 
d4d986b
fb38314
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d4d986b
 
 
 
 
fb38314
 
 
d4d986b
 
fb38314
d4d986b
fb38314
 
 
d4d986b
fb38314
d4d986b
 
 
 
fb38314
d4d986b
 
 
 
 
 
 
 
 
 
fb38314
 
 
 
 
 
d4d986b
fb38314
 
d4d986b
fb38314
 
 
 
d4d986b
fb38314
 
 
d4d986b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import os
import io
import base64
from typing import List
from pdf2image import convert_from_bytes
from PIL import Image
from openai import OpenAI

# Model names can be overridden through the environment; both fall back to
# "gpt-4o-mini" when the corresponding variable is unset.
MODEL_VISION = os.getenv("OPENAI_VISION_MODEL", "gpt-4o-mini")
MODEL_TEXT = os.getenv("OPENAI_TEXT_MODEL", "gpt-4o-mini")

# Shared OpenAI client instance; remains None until _client_lazy() creates it.
_client = None

def _client_lazy():
    """Return the module-wide OpenAI client, constructing it on first call.

    The API key is read from the ``OPENAI_API_KEY`` environment variable when
    the client is first built; subsequent calls reuse the cached instance.
    """
    global _client
    if _client is not None:
        return _client
    api_key = os.environ.get("OPENAI_API_KEY")
    _client = OpenAI(api_key=api_key)
    return _client


def _img_to_base64(img: Image.Image) -> str:
    """Serialize *img* as PNG and return those bytes base64-encoded as text."""
    with io.BytesIO() as png_buffer:
        img.save(png_buffer, format="PNG")
        png_bytes = png_buffer.getvalue()
    encoded = base64.b64encode(png_bytes)
    return encoded.decode("utf-8")


def _pdf_to_images(pdf_bytes: bytes, dpi: int = 200, max_pages: int = 8) -> List[Image.Image]:
    """Render the leading pages of a PDF into PIL images.

    Args:
        pdf_bytes: Raw contents of the PDF file.
        dpi: Rendering resolution handed to pdf2image.
        max_pages: Upper bound on the number of pages to render.

    Returns:
        At most ``max_pages`` images in page order. An empty list is returned
        when ``max_pages`` is not positive.
    """
    if max_pages <= 0:
        return []
    # Ask pdf2image to stop at max_pages instead of rasterizing the whole
    # document and discarding the tail; long PDFs would otherwise cost time
    # and memory for pages that are never used.
    pages = convert_from_bytes(pdf_bytes, dpi=dpi, last_page=max_pages)
    # Defensive cap in case the backend returns more than requested.
    return pages[:max_pages]


def extract_text_with_openai(payload: bytes, filename: str, filetype: str) -> str:
    """Extract resume text from an uploaded file using OpenAI chat completions.

    * ``filetype`` of ``"pdf"`` or ``"image"``: the payload is turned into one
      or more images and sent to the vision model (``MODEL_VISION``) for OCR.
    * Any other ``filetype``: the payload is decoded as UTF-8 (undecodable
      bytes are dropped) and sent to the text model (``MODEL_TEXT``) for
      clean-up only.

    Args:
        payload: Raw bytes of the uploaded file.
        filename: Name of the uploaded file; not used by this function.
        filetype: Kind of the payload; ``"pdf"`` and ``"image"`` select the
            vision path, everything else the text path.

    Returns:
        The model's reply with surrounding whitespace removed, or an empty
        string when the reply carries no text content.
    """
    client = _client_lazy()

    # Text input: only ask the model to tidy it up.
    if filetype not in {"pdf", "image"}:
        text = payload.decode("utf-8", errors="ignore")
        system_prompt = "You clean up Japanese resumes, preserving headings and bullet structure and removing layout noise."
        user_prompt = (
            "以下の本文を、見出し・箇条書きを保ちつつ整形してください。不要な罫線/番号/改ページは除去:\n\n" + text
        )
        resp = client.chat.completions.create(
            model=MODEL_TEXT,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.2,
        )
        # message.content is nullable in the API; avoid AttributeError on None.
        return (resp.choices[0].message.content or "").strip()

    # Image / PDF input: build the list of page images.
    if filetype == "pdf":
        images = _pdf_to_images(payload)
    else:
        # convert() returns an independent copy, so the source can be closed.
        with Image.open(io.BytesIO(payload)) as source_image:
            images = [source_image.convert("RGB")]

    image_parts = [
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{_img_to_base64(img)}"},
        }
        for img in images
    ]
    vision_msgs = [
        {"role": "system", "content": "You are an accurate Japanese OCR assistant for resumes."},
        {"role": "user", "content": [
            {
                "type": "text",
                "text": "日本語の履歴書/職務経歴書画像です。OCRして本文を日本語テキストで忠実に返してください。"
            },
            *image_parts,
        ]},
    ]
    resp = client.chat.completions.create(
        model=MODEL_VISION,
        messages=vision_msgs,
        temperature=0.0,
    )
    # message.content is nullable in the API; avoid AttributeError on None.
    return (resp.choices[0].message.content or "").strip()


def structure_with_openai(text: str) -> dict:
    """Ask the text model to split resume text into sections and return them.

    The model is requested to answer with a JSON object containing the keys
    ``work_experience_raw``, ``education_raw``, ``certifications_raw`` and
    ``skills_list``.

    Args:
        text: Resume text to analyse.

    Returns:
        A dict that always contains the four keys above. Missing or null
        ``*_raw`` values become ``""`` and a missing or null ``skills_list``
        becomes ``[]``. If the reply cannot be parsed into a JSON object, the
        whole input is returned under ``work_experience_raw`` with the other
        fields empty.
    """
    import json  # function-scope import: json is not imported at module level

    client = _client_lazy()
    system_prompt = (
        "あなたは日本語レジュメの構造化アシスタントです。入力テキストからセクションを抽出し、"
        "JSONで返してください。JSONキー: work_experience_raw, education_raw, certifications_raw, skills_list。"
        "skills_list は重複除去済み配列。各 *_raw は原文抜粋で構いません。"
    )
    user_prompt = "以下のテキストを解析し、指定のJSONキーで返してください。\n\n" + text
    resp = client.chat.completions.create(
        model=MODEL_TEXT,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.2,
        response_format={"type": "json_object"},
    )

    try:
        data = json.loads(resp.choices[0].message.content)
    except (AttributeError, IndexError, TypeError, ValueError):
        # Covers malformed JSON (ValueError), a null content (TypeError) and
        # an unexpectedly shaped response; all fall back to the raw text.
        data = None
    if not isinstance(data, dict):
        # Valid JSON that is not an object (e.g. a list) cannot carry the keys.
        data = {"work_experience_raw": text, "education_raw": "", "certifications_raw": "", "skills_list": []}

    # Fill in keys that are absent or explicitly null.
    for key in ("work_experience_raw", "education_raw", "certifications_raw"):
        if data.get(key) is None:
            data[key] = ""
    if data.get("skills_list") is None:
        data["skills_list"] = []
    return data


def summarize_with_openai(text: str) -> dict:
    """Summarise resume text in Japanese at three levels of detail.

    The text model is asked for a 300-character, a 100-character and a
    one-sentence summary in a single reply. The reply is not parsed into its
    three parts; the returned values are derived from the reply by simple
    rules so that an unexpected reply format cannot break the function.

    Args:
        text: Resume text to summarise.

    Returns:
        A dict with keys ``"300chars"`` (first 600 characters of the reply),
        ``"100chars"`` (first 120 characters of the reply) and ``"onesent"``
        (the reply up to and including the first ``。``, or the whole reply if
        it contains none). All values are empty strings when the reply has no
        text content.
    """
    client = _client_lazy()
    system_prompt = "You write crisp, factual Japanese executive summaries."
    user_prompt = (
        "以下の候補者レジュメ本文を、(1)300字、(2)100字、(3)1文 の3粒度で日本語要約してください。"
        "不要な記号は避け、事実を簡潔に述べてください。\n\n" + text
    )
    resp = client.chat.completions.create(
        model=MODEL_TEXT,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.2,
    )
    # message.content is nullable in the API; avoid AttributeError on None.
    full = (resp.choices[0].message.content or "").strip()

    # Rule-based extraction that keeps working even if the format is off.
    head, stop, _ = full.partition("。")
    one_sent = head + stop if stop else full
    return {
        "300chars": full[:600],   # roughly 300 characters, with some margin
        "100chars": full[:120],
        "onesent": one_sent,
    }