Spaces:

jbilcke-hf
/

SNIPED_grasp-any-region

Running on Zero

File size: 6,550 Bytes

46861c5

"""
Reference: https://github.com/haotian-liu/LLaVA/blob/main/llava/eval/eval_gpt_review.py
"""

import argparse
import json
import os
import time

import cv2
import numpy as np
import openai
import requests
from paint_util import encode_image, paint_text_box, paint_text_point
from tqdm import tqdm

# Azure OpenAI settings.
# `model_name` is sent as the `model` argument of every chat-completion request
# made by get_eval().
model_name = "gpt-4o-2024-11-20"
# NOTE(review): get_eval() takes its own `max_tokens` parameter and main()
# passes the --max-tokens CLI value, so this module-level value does not appear
# to be read by the code in this file — confirm before relying on it.
max_tokens = 1000  # range: [1, 4095]

# Build the shared Azure client once, at import time. The endpoint and API key
# are read from the AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_KEY environment
# variables (os.getenv yields None when a variable is unset).
client = openai.AzureOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_KEY"),
    api_version="2024-03-01-preview",
)


def get_eval(
    content: "str | list",
    max_tokens: int,
    max_retries: "int | None" = None,
    retry_delay: float = 1.0,
):
    """Send one review request to the chat model and return the reply text.

    Args:
        content: Payload of the user message. ``main`` passes a list of
            content parts (one ``text`` part and one ``image_url`` part); the
            value is forwarded to the API unchanged.
        max_tokens: Upper bound on the number of tokens generated.
        max_retries: How many times a failed request may be retried. ``None``
            (the default) keeps retrying indefinitely, which matches the
            historical behaviour of this function.
        retry_delay: Seconds to sleep between a failure and the next attempt.

    Returns:
        The ``content`` field of the first choice's message.

    Raises:
        Exception: The error from the last attempt, re-raised once
            ``max_retries`` retries have been used up. Never raised when
            ``max_retries`` is ``None``.
    """
    # The request payload does not change between attempts, so build it once.
    messages = [
        {
            "role": "system",
            "content": "You are a helpful and precise assistant for checking the quality of the answer.",
        },
        {
            "role": "user",
            "content": content,
        },
    ]

    retries_used = 0
    while True:
        try:
            completion = client.chat.completions.create(
                model=model_name,
                messages=messages,
                max_tokens=max_tokens,
                temperature=0,
            )
            # Kept inside the try so a malformed response (e.g. no choices)
            # is retried like any other failure.
            return completion.choices[0].message.content
        except Exception as e:
            print(e)
            if max_retries is not None and retries_used >= max_retries:
                # Retry budget exhausted: surface the real error instead of
                # looping forever on a permanent failure.
                raise
            retries_used += 1
        time.sleep(retry_delay)


def parse_score(review):
    """Extract the two leading scores from a review reply.

    The first line of ``review`` (after trimming surrounding whitespace) is
    expected to start with two numbers separated by commas and/or whitespace,
    for example ``"8 9"``, ``"8,9"`` or ``"8, 9"``. Anything after the second
    number on that line is ignored.

    Args:
        review: Reply text returned by the model. Values that are not strings
            (for instance ``None``) are reported as a parse failure.

    Returns:
        ``[score1, score2]`` as floats, or ``[-1, -1]`` when the first line
        does not start with two parseable numbers.
    """
    try:
        # Trim first so a leading blank line does not hide the score line.
        first_line = review.strip().splitlines()[0]
        score_pair = first_line.replace(",", " ")
        # Whitespace split (no argument) collapses runs of separators, so
        # "8, 9" yields ["8", "9"] rather than ["8", "", "9"].
        sp = score_pair.split()
        print("score_pair:", score_pair, sp)
        return [float(sp[0]), float(sp[1])]
    except (AttributeError, IndexError, TypeError, ValueError) as e:
        print(e)
        print("error", review)
        return [-1, -1]


# Ordered (keyword, context sentence) pairs. The first keyword contained in the
# phase name wins, so the order below is significant.
_PHASE_CONTEXTS = (
    ("natural", "The image is a natural image."),
    (
        "ocr",
        "The image contains text, and the user wishes to know the content of the text.",
    ),
    ("screen", "The image is a screenshot from a mobile phone or webpage."),
    ("panel", "The image is a multi-panel figure."),
    # NOTE: "andriod" is a misspelling; it is left untouched so the prompt
    # text sent to the model stays the same.
    ("android", "The image is an andriod screenshot."),
    ("web", "The image is a webpage screenshot."),
)


def _context_for_phase(phase):
    """Return the context sentence for the first known keyword found in *phase*."""
    for keyword, sentence in _PHASE_CONTEXTS:
        if keyword in phase:
            return sentence
    known = ", ".join(keyword for keyword, _ in _PHASE_CONTEXTS)
    raise ValueError(
        f"Unrecognised phase {phase!r}: expected a name containing one of: {known}"
    )


def _load_json(path):
    """Parse the JSON file at *path* (``~`` expanded) and close it afterwards."""
    with open(os.path.expanduser(path)) as handle:
        return json.load(handle)


def main(args):
    """Score every question of one phase with the chat model and log the reviews.

    For each (question, reference answer, predicted answer) triple the
    referenced region is painted on the image, a comparison prompt is built
    and sent through ``get_eval``, and the reply plus its parsed scores are
    appended to ``args.output`` as one JSON object per line. Records already
    present in the output file are counted and that many leading triples are
    skipped, so an interrupted run can be resumed.

    Args:
        args: Parsed command-line namespace. ``phase``, ``max_tokens`` and
            ``output`` are read; ``question``, ``answer_list`` and ``rule``
            are set on it to the input paths derived from ``phase``.

    Raises:
        ValueError: If ``phase`` contains none of the known keywords, or a
            question's annotation has neither ``"bbox"`` nor ``"points"``.
        FileNotFoundError: If an image cannot be read by OpenCV.
    """
    phase = args.phase  # e.g. android_QA_box
    # Resolve the context first so an unknown phase fails before any I/O,
    # instead of surfacing as an unbound variable inside the loop.
    context_str = _context_for_phase(phase)
    domain = phase.split("_box")[0]  # e.g. android_QA

    # Input locations are derived from the phase and mirrored onto ``args``.
    args.question = f"mdvp_for_gpt4v_eval/{phase}/question.json"
    args.answer_list = [
        f"mdvp_for_gpt4v_eval/{phase}/answer.json",
        f"mdvp_for_gpt4v_eval/{phase}/prediction.json",
    ]
    args.rule = "annotations/rule.json"

    f_q = _load_json(args.question)
    f_ans1 = _load_json(args.answer_list[0])
    f_ans2 = _load_json(args.answer_list[1])
    rule_dict = _load_json(args.rule)

    # Kept from the original flow; whether the painting helpers rely on this
    # directory is not visible from here.
    os.makedirs("./result", exist_ok=True)

    # Use the same expanded path for the resume check and for appending.
    output_path = os.path.expanduser(args.output)
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    if os.path.isfile(output_path):
        with open(output_path) as existing:
            cur_reviews = [json.loads(line) for line in existing if line.strip()]
    else:
        cur_reviews = []

    # zip() stops at the shortest input, so that is the real number of items.
    total = min(len(f_q), len(f_ans1), len(f_ans2))

    with open(output_path, "a") as review_file:
        triples = tqdm(zip(f_q, f_ans1, f_ans2), total=total)
        for idx, (ques, ans1, ans2) in enumerate(triples):
            if idx < len(cur_reviews):
                # Decide this before painting/encoding so resumed runs do not
                # redo image work for items that are already reviewed.
                print(f"Skipping {idx} as we already have it.")
                print(idx + 1)
                continue

            # Paint the region mark on the image.
            image_path = f"data/{domain}/images/" + ques["image"]
            if cv2.imread(image_path) is None:
                raise FileNotFoundError(f"Could not read image: {image_path}")

            annotation = ques["annotation"]
            if "bbox" in annotation:
                paint_image_path = paint_text_box(image_path, annotation["bbox"])
                rule = rule_dict["box"]
            elif "points" in annotation:
                paint_image_path = paint_text_point(image_path, annotation["points"])
                rule = rule_dict["point"]
            else:
                # Without this guard the previous item's rule and painted
                # image would be reused silently.
                raise ValueError(
                    f"Question {ques['question_id']!r} has neither 'bbox' nor "
                    "'points' in its annotation"
                )
            base64_image = encode_image(paint_image_path)

            prompt = rule["prompt"]
            role = rule["role"]
            # "[Context]" is followed by a newline; the earlier "\{" was an
            # invalid escape that put a literal backslash into the prompt.
            content_text = (
                f"[Context]\n{context_str}\n\n"
                f'[Question]\n{ques["text"]}\n\n'
                f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
                f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
                f"[System]\n{prompt}\n\n"
            )

            content = [
                {
                    "type": "text",
                    "text": content_text,
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}",
                        "detail": "high",
                    },
                },
            ]

            review = get_eval(content, args.max_tokens)
            cur_js = {
                "id": idx + 1,
                "question_id": ques["question_id"],
                "answer1_id": ans1.get("answer_id", ans1["question_id"]),
                "answer2_id": ans2.get("answer_id", ans2["question_id"]),
                "category": phase,
                "content": review,
                "tuple": parse_score(review),
                "answer1": ans1["text"],
                "answer2": ans2["text"],
            }
            review_file.write(json.dumps(cur_js) + "\n")
            # Flush per record so finished reviews survive an interruption.
            review_file.flush()

            print(idx + 1)


if __name__ == "__main__":
    # Script entry point: declare the options, parse them, run the evaluation.
    cli = argparse.ArgumentParser(description="ChatGPT-based QA evaluation.")

    # (flag, keyword options) pairs, registered in the listed order.
    option_table = [
        # Phase name, e.g. android_QA_box.
        ("--phase", {"help": "MDVP domain", "type": str, "required": True}),
        (
            "--max-tokens",
            {
                "type": int,
                "default": 1024,
                "help": "maximum number of tokens produced in the output",
            },
        ),
        # main() opens this value as a file and appends one JSON record per line.
        ("--output", {"default": "result/gpt_score.jsonl", "help": "output json dir"}),
    ]
    for flag, options in option_table:
        cli.add_argument(flag, **options)

    main(cli.parse_args())