Spaces:
Sleeping
Sleeping
Create pipelines/openai_ingest.py
Browse files- pipelines/openai_ingest.py +113 -0
pipelines/openai_ingest.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import io
|
| 3 |
+
import base64
|
| 4 |
+
from typing import List
|
| 5 |
+
from pdf2image import convert_from_bytes
|
| 6 |
+
from PIL import Image
|
| 7 |
+
from openai import OpenAI
|
| 8 |
+
|
| 9 |
+
# Model identifiers are resolved once at import time. Each one can be
# overridden through its environment variable; both default to "gpt-4o-mini".
MODEL_VISION = os.getenv("OPENAI_VISION_MODEL", "gpt-4o-mini")
MODEL_TEXT = os.getenv("OPENAI_TEXT_MODEL", "gpt-4o-mini")

# Shared OpenAI client; remains None until _client_lazy() constructs it.
_client = None
|
| 13 |
+
|
| 14 |
+
def _client_lazy():
    """Return the module-wide OpenAI client, creating it on first use.

    The client is built with the ``OPENAI_API_KEY`` environment variable and
    cached in the module-level ``_client`` so later calls reuse it.
    """
    global _client
    if _client is not None:
        return _client
    api_key = os.environ.get("OPENAI_API_KEY")
    _client = OpenAI(api_key=api_key)
    return _client
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def _img_to_base64(img: Image.Image) -> str:
    """Serialise *img* as PNG and return the bytes base64-encoded as text."""
    with io.BytesIO() as png_buffer:
        img.save(png_buffer, format="PNG")
        png_bytes = png_buffer.getvalue()
    encoded = base64.b64encode(png_bytes)
    return encoded.decode("utf-8")
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _pdf_to_images(pdf_bytes: bytes, dpi: int = 220, max_pages: int = 10) -> List[Image.Image]:
    """Render the leading pages of a PDF to PIL images.

    Args:
        pdf_bytes: Raw PDF file content.
        dpi: Rendering resolution handed to pdf2image.
        max_pages: Upper bound on the number of pages rendered.

    Returns:
        At most ``max_pages`` page images, in document order. A non-positive
        ``max_pages`` yields an empty list.
    """
    if max_pages <= 0:
        return []
    # Tell pdf2image where to stop so a long document is not fully rasterised
    # only to have everything past max_pages thrown away.
    pages = convert_from_bytes(
        pdf_bytes,
        dpi=dpi,
        first_page=1,
        last_page=max_pages,
    )
    # Defensive: keep the contract even if the backend returns extra pages.
    return pages[:max_pages]
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def extract_text_with_openai(payload: bytes, filename: str, filetype: str) -> str:
    """Extract resume text from an uploaded file via OpenAI.

    PDFs and images are converted to images and sent to the vision model for
    OCR. Any other file type is decoded as UTF-8 (undecodable bytes dropped)
    and sent to the text model, which is asked to return a cleaned-up body.

    Args:
        payload: Raw file content.
        filename: Original file name. Not used by this function; kept so the
            signature stays stable for callers.
        filetype: ``"pdf"``, ``"image"``, or anything else for plain text.

    Returns:
        The ``output_text`` of the model response.
    """
    client = _client_lazy()

    if filetype == "pdf":
        images = _pdf_to_images(payload)
    elif filetype == "image":
        images = [Image.open(io.BytesIO(payload)).convert("RGB")]
    else:
        text = payload.decode("utf-8", errors="ignore")
        prompt = (
            "以下は履歴書/職務経歴書の本文です。レイアウトノイズを除去し、見出しや箇条書きを維持しつつ読みやすいテキストに整形して返してください。"
        )
        resp = client.responses.create(
            model=MODEL_TEXT,
            input=[
                {"role": "system", "content": "You are a meticulous document cleaner for Japanese resumes."},
                {"role": "user", "content": [{"type": "input_text", "text": prompt + "\n\n" + text}]},
            ],
        )
        return resp.output_text

    content = [
        {"type": "input_text", "text": "日本語の履歴書/職務経歴書の画像です。OCRして本文を日本語テキストで忠実に返してください。"}
    ]
    # Responses API image parts carry the picture in `image_url`; inline data
    # must be a data URL. _img_to_base64 always encodes PNG, hence image/png.
    content.extend(
        {
            "type": "input_image",
            "image_url": f"data:image/png;base64,{_img_to_base64(img)}",
        }
        for img in images
    )

    resp = client.responses.create(
        model=MODEL_VISION,
        input=[{"role": "user", "content": content}],
    )
    return resp.output_text
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def structure_with_openai(text: str) -> dict:
    """Ask the text model to split resume text into sections and return them.

    Args:
        text: Plain resume text.

    Returns:
        A dict that always contains ``work_experience_raw``,
        ``education_raw``, ``certifications_raw`` (default ``""``) and
        ``skills_list`` (default ``[]``). If the model output is not a JSON
        object, the whole input text is returned as ``work_experience_raw``
        and the other fields are empty.
    """
    import json as _json

    client = _client_lazy()
    system_prompt = (
        "あなたは日本語レジュメの構造化アシスタントです。入力テキストからセクションを抽出し、JSONで返してください。"
        " JSONキー: work_experience_raw, education_raw, certifications_raw, skills_list。"
        " skills_list は重複除去済み配列。work_experience_raw等は原文抜粋で良い。"
    )
    user = "以下のテキストを解析し、指定のJSONキーで返してください。\n\n" + text
    resp = client.responses.create(
        model=MODEL_TEXT,
        input=[
            {"role": "system", "content": [{"type": "input_text", "text": system_prompt}]},
            {"role": "user", "content": [{"type": "input_text", "text": user}]},
        ],
        # The Responses API selects JSON mode through `text.format`; it does
        # not accept the Chat Completions `response_format` keyword.
        text={"format": {"type": "json_object"}},
    )

    try:
        data = _json.loads(resp.output_text)
    except (TypeError, ValueError):
        # ValueError covers json.JSONDecodeError; TypeError covers non-string output.
        data = None
    if not isinstance(data, dict):
        # Valid JSON that is not an object (list, string, ...) is unusable
        # here, so treat it the same as a parse failure.
        data = {"work_experience_raw": text, "education_raw": "", "certifications_raw": "", "skills_list": []}

    for k in ("work_experience_raw", "education_raw", "certifications_raw"):
        data.setdefault(k, "")
    data.setdefault("skills_list", [])
    return data
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def summarize_with_openai(text: str) -> dict:
    """Request a Japanese summary of resume text and slice it three ways.

    The model is asked for summaries at three granularities in one response.
    The response is not parsed into parts; the returned values are all taken
    from the start of the same output text.

    Args:
        text: Plain resume text.

    Returns:
        A dict with ``"300chars"`` (first 600 characters of the output),
        ``"100chars"`` (first 120 characters) and ``"onesent"`` (everything
        up to and including the first ``。``, or the whole output if there is
        none).
    """
    client = _client_lazy()
    instruction = "以下の候補者レジュメ本文を、(1)300字、(2)100字、(3)1文 の3粒度で日本語要約してください。"
    system_message = {
        "role": "system",
        "content": [{"type": "input_text", "text": "You write crisp Japanese executive summaries."}],
    }
    user_message = {
        "role": "user",
        "content": [{"type": "input_text", "text": instruction + "\n\n" + text}],
    }
    response = client.responses.create(
        model=MODEL_TEXT,
        input=[system_message, user_message],
    )
    summary = response.output_text

    # partition() leaves the separator empty when no "。" is present, so
    # head + stop is the first sentence if one exists and the full text otherwise.
    head, stop, _ = summary.partition("。")
    return {
        "300chars": summary[:600],
        "100chars": summary[:120],
        "onesent": head + stop,
    }
|