# pipelines/openai_ingest.py
# Last update: commit fb38314 (Corin1998, verified)
import os
import io
import base64
from typing import List
from pdf2image import convert_from_bytes
from PIL import Image
from openai import OpenAI
# Vision-capable model used for OCR of image/PDF resumes (overridable via env).
MODEL_VISION = os.environ.get("OPENAI_VISION_MODEL", "gpt-4o-mini")
# Text-only model used for cleanup / structuring / summarization (overridable via env).
MODEL_TEXT = os.environ.get("OPENAI_TEXT_MODEL", "gpt-4o-mini")
# Lazily-initialized module-level OpenAI client singleton; see _client_lazy().
_client = None
def _client_lazy():
    """Return the shared OpenAI client, constructing it on first use.

    The API key is read from the OPENAI_API_KEY environment variable.
    """
    global _client
    if _client is not None:
        return _client
    _client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    return _client
def _img_to_base64(img: Image.Image) -> str:
    """Encode a PIL image as a base64 PNG string (no data-URI prefix)."""
    with io.BytesIO() as buffer:
        img.save(buffer, format="PNG")
        raw = buffer.getvalue()
    return base64.b64encode(raw).decode("utf-8")
def _pdf_to_images(pdf_bytes: bytes, dpi: int = 200, max_pages: int = 8) -> List[Image.Image]:
    """Rasterize at most the first `max_pages` pages of a PDF at `dpi` DPI."""
    rendered = convert_from_bytes(pdf_bytes, dpi=dpi)
    # Drop any pages beyond the cap instead of slicing a copy.
    del rendered[max_pages:]
    return rendered
def extract_text_with_openai(payload: bytes, filename: str, filetype: str) -> str:
    """Extract (or clean up) resume body text from an uploaded file.

    Images and PDFs are rasterized and sent to the vision model for OCR;
    every other filetype is decoded as UTF-8 text and merely reformatted
    by the text model (stable and low-cost).

    Args:
        payload: Raw file bytes.
        filename: Original file name (currently unused; kept for interface
            stability).
        filetype: "pdf", "image", or anything else for plain text.

    Returns:
        The extracted/cleaned body text (may be "" if the model returns
        no content).
    """
    client = _client_lazy()

    # Plain text: reformat only, no OCR needed.
    if filetype not in {"pdf", "image"}:
        return _cleanup_plain_text(client, payload)

    # Image/PDF: turn the payload into a list of page images.
    if filetype == "pdf":
        images = _pdf_to_images(payload)
    else:
        images = [Image.open(io.BytesIO(payload)).convert("RGB")]
    return _ocr_images(client, images)

def _cleanup_plain_text(client, payload: bytes) -> str:
    """Decode `payload` as UTF-8 and reformat it with the text model."""
    text = payload.decode("utf-8", errors="ignore")
    sys = "You clean up Japanese resumes, preserving headings and bullet structure and removing layout noise."
    user = (
        "以下の本文を、見出し・箇条書きを保ちつつ整形してください。不要な罫線/番号/改ページは除去:\n\n" + text
    )
    resp = client.chat.completions.create(
        model=MODEL_TEXT,
        messages=[
            {"role": "system", "content": sys},
            {"role": "user", "content": user},
        ],
        temperature=0.2,
    )
    # message.content can be None (e.g. refusals); avoid AttributeError.
    return (resp.choices[0].message.content or "").strip()

def _ocr_images(client, images: List[Image.Image]) -> str:
    """OCR a list of page images with the vision model and return the text."""
    vision_msgs = [
        {"role": "system", "content": "You are an accurate Japanese OCR assistant for resumes."},
        {"role": "user", "content": [
            {
                "type": "text",
                "text": "日本語の履歴書/職務経歴書画像です。OCRして本文を日本語テキストで忠実に返してください。"
            },
            *[
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{_img_to_base64(img)}"}
                } for img in images
            ]
        ]},
    ]
    resp = client.chat.completions.create(
        model=MODEL_VISION,
        messages=vision_msgs,
        temperature=0.0,
    )
    # message.content can be None (e.g. refusals); avoid AttributeError.
    return (resp.choices[0].message.content or "").strip()
def structure_with_openai(text: str) -> dict:
    """Split resume text into structured sections via the text model.

    Args:
        text: The extracted resume body text.

    Returns:
        A dict guaranteed to contain the keys ``work_experience_raw``,
        ``education_raw``, ``certifications_raw`` (strings) and
        ``skills_list`` (list). If the model's response cannot be parsed
        as a JSON object, the full input text is returned under
        ``work_experience_raw`` so downstream code never loses content.
    """
    import json

    client = _client_lazy()
    sys = (
        "あなたは日本語レジュメの構造化アシスタントです。入力テキストからセクションを抽出し、"
        "JSONで返してください。JSONキー: work_experience_raw, education_raw, certifications_raw, skills_list。"
        "skills_list は重複除去済み配列。各 *_raw は原文抜粋で構いません。"
    )
    user = "以下のテキストを解析し、指定のJSONキーで返してください。\n\n" + text
    resp = client.chat.completions.create(
        model=MODEL_TEXT,
        messages=[
            {"role": "system", "content": sys},
            {"role": "user", "content": user},
        ],
        temperature=0.2,
        response_format={"type": "json_object"},
    )

    fallback = {
        "work_experience_raw": text,
        "education_raw": "",
        "certifications_raw": "",
        "skills_list": [],
    }
    try:
        # TypeError covers content being None; ValueError covers bad JSON
        # (json.JSONDecodeError subclasses ValueError).
        data = json.loads(resp.choices[0].message.content)
    except (TypeError, ValueError):
        return fallback
    # response_format should guarantee an object, but guard against a
    # non-dict top-level value (the original would crash on .setdefault).
    if not isinstance(data, dict):
        return fallback

    for key in ("work_experience_raw", "education_raw", "certifications_raw"):
        data.setdefault(key, "")
    data.setdefault("skills_list", [])
    return data
def summarize_with_openai(text: str) -> dict:
    """Summarize resume text in Japanese at three granularities.

    Returns a dict with keys "300chars", "100chars" and "onesent",
    derived from a single model response by simple slicing.
    """
    client = _client_lazy()
    sys = "You write crisp, factual Japanese executive summaries."
    user = (
        "以下の候補者レジュメ本文を、(1)300字、(2)100字、(3)1文 の3粒度で日本語要約してください。"
        "不要な記号は避け、事実を簡潔に述べてください。\n\n" + text
    )
    resp = client.chat.completions.create(
        model=MODEL_TEXT,
        messages=[{"role": "system", "content": sys}, {"role": "user", "content": user}],
        temperature=0.2,
    )
    full = resp.choices[0].message.content.strip()

    # Rule-based extraction so a drifting response format never breaks us:
    # the "one sentence" summary is everything up to the first 。, if any.
    if "。" in full:
        one_sentence = full.split("。")[0] + "。"
    else:
        one_sentence = full

    return {
        "300chars": full[:600],  # roughly 300 chars' worth, with margin
        "100chars": full[:120],
        "onesent": one_sentence,
    }