| | |
| | |
| |
|
| | |
| | |
"""
Single-phase evaluator (DeepSeek API) — computes EM / F1 only.

Usage example
-------------
python eval_single_phase.py --input data/2wikimqa.jsonl
"""
| |
|
| | import argparse, time, jsonlines, os |
| | from pathlib import Path |
| | from tqdm import tqdm |
| | from openai import OpenAI |
| | from utils.metrics import qa_em_score, qa_f1_score |
| | from utils.llmjudge import judge_answer_with_api |
| |
|
| | |
# Command-line interface. Options are parsed once, at module level, and the
# resulting namespace is read by the rest of the script through ``args``.
p = argparse.ArgumentParser(prog="Single-phase evaluator")
p.add_argument(
    "--input",
    help="Path to the *.jsonl file to evaluate",
    required=True,
)
# Model name, sampling temperature and completion length forwarded to the API.
p.add_argument(
    "--model",
    default="deepseek-r1",
)
p.add_argument(
    "--temperature",
    default=0.5,
    type=float,
)
p.add_argument(
    "--max_tokens",
    default=30,
    type=int,
)
# Pause between samples, in seconds; 0.0 disables the pause.
p.add_argument(
    "--sleep",
    default=0.0,
    type=float,
)
args = p.parse_args()
| |
|
# Shared API client; endpoint and credentials are taken from the environment
# (either lookup yields None when the variable is unset).
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    base_url=os.getenv("OPENAI_BASE_URL"),
)
| |
|
| | |
def ask(context: str, question: str) -> str:
    """Ask the chat model for a short final answer to ``question``.

    Builds a system + user prompt embedding ``context`` and ``question`` and
    sends it through the module-level ``client``, using the model name,
    temperature and token limit from the parsed command-line ``args``.

    Args:
        context: Passage text inserted into the user prompt.
        question: Question inserted into the user prompt.

    Returns:
        The first choice's message content with surrounding whitespace
        removed, or the literal string ``"None"`` when the response has no
        choices or no non-blank content. Callers treat ``"None"`` as
        "no answer produced".
    """
    messages = [
        {"role": "system",
         "content": ("You are a QA assistant. "
                     "Answer strictly based on the passages; "
                     "output only the final answer.")},
        {"role": "user",
         "content": f"Answer the question and output only the final answer without extra words. Passages:\n{context}\n\nQuestion: {question}\nAnswer:"}
    ]
    resp = client.chat.completions.create(
        model=args.model,
        messages=messages,
        temperature=args.temperature,
        max_tokens=args.max_tokens
    )
    # Guard against a response that carries no choices at all; indexing it
    # would raise IndexError and abort the whole evaluation run.
    if not resp.choices:
        return "None"

    # Strip before testing so whitespace-only content is also reported through
    # the "None" sentinel instead of leaking out as an empty string.
    answer = (resp.choices[0].message.content or "").strip()
    return answer or "None"
| |
|
| |
|
| | |
def evaluate_file(path: Path):
    """Evaluate every record of a JSON Lines file and print an EM / F1 summary.

    Each record is read for its ``"input"`` (the question), ``"context"`` and
    ``"answers"`` fields. Records are keyed by ``"input"``, so records that
    share the same question collapse to the last one read. For every remaining
    record the model is queried through ``ask`` and the prediction is scored
    against each gold answer, keeping the best EM and F1.

    Samples for which ``ask`` returns the ``"None"`` sentinel contribute
    nothing to the sums but still count towards the total.

    Args:
        path: Location of the ``*.jsonl`` file; its stem is used as the
            dataset name in the progress bar and the summary.

    Returns:
        None. The summary is printed to standard output.
    """
    dataset = path.stem
    # Use the reader as a context manager so the file handle is closed as
    # soon as the records have been loaded.
    with jsonlines.open(path) as reader:
        data = {obj["input"]: obj for obj in reader}

    total = len(data)
    em_hits = 0
    f1_sum = 0.0

    for q, item in tqdm(data.items(), desc=f"{dataset}"):
        ctx = item["context"]
        golds = item["answers"] if isinstance(item["answers"], list) else [item["answers"]]

        # Only the text before the first period of the reply is scored.
        pred = ask(ctx, q).split('.', 1)[0]
        if pred != "None":
            # default=0 keeps a record with an empty gold list from raising.
            em = max((qa_em_score(pred, g) for g in golds), default=0)
            f1 = max((qa_f1_score(pred, g) for g in golds), default=0)

            em_hits += em
            f1_sum += f1

        # Throttle after every request, including ones that produced no
        # answer: the API call was made either way.
        if args.sleep:
            time.sleep(args.sleep)

    # An empty input file would otherwise divide by zero in the summary.
    em_rate = em_hits / total if total else 0.0
    f1_avg = f1_sum / total if total else 0.0

    print(f"\n=== {dataset.upper()} SUMMARY ===")
    print(f"Total samples : {total}")
    print(f"Exact Match : {em_hits}/{total} ({em_rate:.2%})")
    print(f"Average F1 : {f1_avg:.4f}")
    print("-" * 40 + "\n")
| |
|
| |
|
| | |
# Script entry: validate the --input path, then run the evaluation on it.
input_path = Path(args.input)
if not input_path.exists():
    raise SystemExit(f"File does not exist: {input_path}")
# exists() is also true for directories; reject them here with a clear
# message instead of failing later when the reader tries to open the path.
if not input_path.is_file():
    raise SystemExit(f"Not a regular file: {input_path}")

evaluate_file(input_path)