"""Evaluate a HotpotQA JSONL file with Transformers + Qwen3-8B."""
import argparse
import os

import jsonlines
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

from utils.metrics import qa_f1_score, qa_em_score

# Token id of Qwen3's "</think>" marker; generated tokens up to and including it are dropped.
THINK_END_ID = 151668
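
# Example invocation (the script name and data path are placeholders):
#   python eval_hotpotqa_qwen3.py -i hotpotqa.jsonl --model Qwen/Qwen3-8B -d 0 -t 0.0 -k 32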


def strip_think(token_ids):
    """Return the token ids that follow the last THINK_END_ID, i.e. the final answer."""
    try:
        cut = len(token_ids) - token_ids[::-1].index(THINK_END_ID)
        return token_ids[cut:]
    except ValueError:
        # No "</think>" token was generated, so keep everything.
        return token_ids
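
# For example, with THINK_END_ID = 9, strip_think([1, 2, 9, 5, 6]) returns [5, 6],
# while strip_think([1, 2, 5, 6]) returns the list unchanged.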


def main():
    parser = argparse.ArgumentParser(
        description="Evaluate HotpotQA JSONL with Transformers + Qwen3-8B"
    )
    parser.add_argument("-i", "--input", required=True,
                        help="Path to input JSONL file")
    parser.add_argument("--model", required=True,
                        help="HF model name, e.g. Qwen/Qwen3-8B")
    parser.add_argument("-d", "--devices", default="0",
                        help="CUDA_VISIBLE_DEVICES (comma-separated)")
    parser.add_argument("-t", "--temperature", type=float, default=0.5,
                        help="Sampling temperature")
    parser.add_argument("-k", "--max_tokens", type=int, default=40,
                        help="max_new_tokens")
    args = parser.parse_args()

    # Apply --devices before the first CUDA call so device_map="auto"
    # only sees the requested GPUs.
    os.environ["CUDA_VISIBLE_DEVICES"] = args.devices

    tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        args.model,
        torch_dtype="auto",
        device_map="auto",
        trust_remote_code=True
    )
    gen_cfg = GenerationConfig(
        temperature=args.temperature,
        max_new_tokens=args.max_tokens,
        do_sample=args.temperature > 0  # greedy decoding when temperature is 0
    )

    with jsonlines.open(args.input) as reader:
        data = list(reader)

    total_f1 = total_em = 0.0
    n_eval = 0  # items actually scored; skipped items are excluded from the averages

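    # Each JSONL record is expected to carry the fields read below, roughly:
    #   {"input": "<question>", "context": "<concatenated passages>", "answers": ["<gold>", ...]}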
    for idx, item in enumerate(data):
        question = item.get("input", "")
        context = item.get("context", "")
        answers = item.get("answers", [])
        if not answers:
            print(f"[{idx}] no gold answer, skip")
            continue
        gold = answers[0]

        prompt = (
            "Answer the question based on the given passages. "
            "Only give me your answer and do not output any other words.\n"
            "Passages:\n"
            f"{context}\n"
            f"Question: {question}\n"
            "Answer:"
        )
        messages = [{"role": "user", "content": prompt}]
        chat_text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False  # disable Qwen3's thinking mode for short-form QA
        )
        inputs = tokenizer([chat_text], return_tensors="pt").to(model.device)

        try:
            with torch.no_grad():
                outputs = model.generate(**inputs, generation_config=gen_cfg)
        except ValueError as e:
            if "position ids exceed" in str(e).lower() or "sequence length" in str(e).lower():
                print(f"[{idx}] prompt too long, skipped")
                continue
            raise

        # Keep only the newly generated tokens, drop any leftover thinking block,
        # and decode the remainder as the answer.
        new_ids = outputs[0][len(inputs.input_ids[0]):].tolist()
        answer = tokenizer.decode(strip_think(new_ids), skip_special_tokens=True).strip()

        f1 = qa_f1_score(answer, gold)
        em = qa_em_score(answer, gold)
        total_f1 += f1
        total_em += em
        n_eval += 1

        print(f"[{idx}] Q: {question}")
        print(f"  Resp: {answer!r} | Gold: {gold!r}")
        print(f"  F1={f1:.2f}, EM={em:.2f}")

    if n_eval == 0:
        print("\nNo items were evaluated.")
        return
    print(f"\nEvaluated {n_eval}/{len(data)} items")
    print(f"Overall F1: {total_f1/n_eval:.4f}")
    print(f"Overall EM: {total_em/n_eval:.4f}")


if __name__ == "__main__":
    main()