Spaces:

intelli-zen
/

OpenGeminiAPI

Sleeping

App Files Files Community

OpenGeminiAPI / examples /api_eval /eval_openai.py

HoneyTian

update

601ac30 6 months ago

raw

history blame contribute delete

4.61 kB

	#!/usr/bin/python3
	# -*- coding: utf-8 -*-
	import argparse
	import json
	import os
	import sys
	import time

	# Put the directory two levels above this script on sys.path so that the
	# `project_settings` module imported below can be resolved when the script
	# is launched directly from its own folder.
	pwd = os.path.abspath(os.path.dirname(__file__))
	sys.path.append(os.path.join(pwd, "../../"))

	import openai
	from openai import AzureOpenAI

	from project_settings import environment, project_path


	def get_args():
	    """
	    Parse the command-line options of the evaluation script.

	    Usage examples:
	        python3 eval_openai.py --model_name gpt-4o --eval_result eval_math_result_gpt-4o.jsonl
	        python3 eval_openai.py --model_name gpt-4o-mini --eval_result eval_math_result_gpt-4o-mini.jsonl

	    Options (all parsed as ``str``):
	        --api_key      defaults to ``environment.get(key="OPENAI_API_KEY")``.
	        --model_name   defaults to ``"gpt-4o"``.
	        --eval_data    defaults to ``data/arc-easy.jsonl`` under ``project_path``.
	        --eval_result  defaults to ``data/eval_math_result.jsonl`` under ``project_path``.

	    :return: the ``argparse.Namespace`` holding the parsed options.
	    """
	    # (flag, default) pairs; every option shares the same `type=str`.
	    option_table = (
	        ("--api_key", environment.get(key="OPENAI_API_KEY")),
	        ("--model_name", "gpt-4o"),
	        ("--eval_data", (project_path / "data/arc-easy.jsonl").as_posix()),
	        ("--eval_result", (project_path / "data/eval_math_result.jsonl").as_posix()),
	    )

	    arg_parser = argparse.ArgumentParser()
	    for flag, default_value in option_table:
	        arg_parser.add_argument(flag, default=default_value, type=str)
	    return arg_parser.parse_args()


	def _load_progress(result_path):
	    """
	    Restore the state of a previous run from an existing result file.

	    :param result_path: path of the JSONL result file; it may not exist yet.
	    :return: ``(finished_ids, total, total_correct)`` where ``finished_ids`` is
	        the set of ``"id"`` values already written, and the two counters are
	        taken from the last row of the file (both ``0`` when there is no row).
	    """
	    finished_ids = set()
	    total = 0
	    total_correct = 0
	    if not os.path.exists(result_path):
	        return finished_ids, total, total_correct

	    with open(result_path, "r", encoding="utf-8") as f:
	        for line in f:
	            # Tolerate blank lines (e.g. a trailing newline) instead of
	            # crashing in json.loads.
	            if not line.strip():
	                continue
	            record = json.loads(line)
	            finished_ids.add(record["id"])
	            # Rows carry running counters, so the last row wins.
	            total = record["total"]
	            total_correct = record["total_correct"]
	    return finished_ids, total, total_correct


	def _build_prompt(question, choices):
	    """
	    Build the single-choice prompt sent to the model.

	    :param question: question text of the sample.
	    :param choices: iterable of dicts with ``"label"`` and ``"text"`` keys.
	    :return: the prompt string.
	    """
	    instruct = "Complete this single-choice question."
	    choices_str = "".join(
	        f"If you think the answer is `{choice['text']}` output: `{choice['label']}`\n"
	        for choice in choices
	    )
	    # NOTE(review): assumes the original triple-quoted template had its lines
	    # at column 0; `choices_str` already ends with a newline, which is why
	    # two blank lines follow the choice list.
	    return "\n".join([
	        instruct,
	        "",
	        "Question:",
	        f"{question}",
	        "",
	        "Choices:",
	        choices_str,
	        "",
	        "Remember to output ONLY the corresponding letter.",
	        "Your output is:",
	    ])


	def _normalize_prediction(prediction):
	    """
	    Reduce a raw completion to the bare label used for scoring.

	    The prompt shows labels wrapped in backticks, so replies such as
	    "`A`" or "A\\n" are accepted; ``None`` content becomes ``""``.

	    :param prediction: raw ``message.content`` (``str`` or ``None``).
	    :return: the text with surrounding whitespace and backticks removed.
	    """
	    if prediction is None:
	        return ""
	    return prediction.strip().strip("`").strip()


	def main():
	    """
	    Evaluate a chat model on a JSONL multiple-choice data set.

	    Reads samples from ``args.eval_data`` (fields ``id``, ``question``,
	    ``choices``, ``answerkey``), asks the model for the answer label and
	    appends one JSON row per scored sample to ``args.eval_result``. Samples
	    whose ``id`` is already present in the result file are skipped, so an
	    interrupted run can be resumed.

	    :return: None
	    """
	    args = get_args()

	    # Reference numbers noted by the original author:
	    # gpt-4o: 82
	    # gemini: 89
	    #
	    # SECURITY: the credential comes from --api_key (default: OPENAI_API_KEY
	    # from the project environment). A key must never be committed in source;
	    # the key that used to be hard-coded here should be revoked.
	    client = AzureOpenAI(
	        api_key=args.api_key,
	        api_version="2025-01-01-preview",
	        azure_endpoint="https://west-us-chatgpt.openai.azure.com"
	    )

	    finished_idx_set, total, total_correct = _load_progress(args.eval_result)
	    print(f"finished count: {len(finished_idx_set)}")

	    with open(args.eval_data, "r", encoding="utf-8") as fin, \
	            open(args.eval_result, "a+", encoding="utf-8") as fout:
	        for line in fin:
	            # Stop once more than 20 samples have been scored; `total` also
	            # counts the rows restored from a previous run.
	            if total > 20:
	                break
	            if not line.strip():
	                continue

	            sample = json.loads(line)
	            idx = sample["id"]
	            question = sample["question"]
	            choices = sample["choices"]
	            answer_key = sample["answerkey"]

	            if idx in finished_idx_set:
	                continue
	            finished_idx_set.add(idx)

	            prompt = _build_prompt(question, choices)

	            time_begin = time.time()
	            try:
	                response = client.chat.completions.create(
	                    model=args.model_name,
	                    messages=[{"role": "user", "content": prompt}],
	                    stream=False,
	                    temperature=0.0,
	                )
	            except openai.BadRequestError as e:
	                # The sample is skipped and not written to the result file,
	                # so it will be retried on the next run.
	                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
	                continue
	            time_cost = time.time() - time_begin
	            print(time_cost)

	            prediction = response.choices[0].message.content

	            # Score on the normalised label but keep the raw text in the row.
	            correct = 1 if _normalize_prediction(prediction) == answer_key else 0

	            total += 1
	            total_correct += correct
	            score = total_correct / total

	            result_row = {
	                "id": idx,
	                "question": question,
	                "choices": choices,
	                "ground_true": answer_key,
	                "prediction": prediction,
	                "correct": correct,
	                "total": total,
	                "total_correct": total_correct,
	                "score": score,
	                "time_cost": time_cost,
	            }
	            fout.write(f"{json.dumps(result_row, ensure_ascii=False)}\n")
	            # Flush per row so progress survives an abrupt termination.
	            fout.flush()

	    return


	# Run the evaluation only when the file is executed as a script.
	if __name__ == "__main__":
	main()