"""Benchmark small LLMs (HF transformers or GGUF via llama.cpp) for inference
speed, run-to-run stability, simple arithmetic reasoning accuracy, and a rough
electricity-cost estimate."""

import time
import os
import psutil
import shutil
import torch
import math
from transformers import AutoTokenizer, AutoModelForCausalLM
from llama_cpp import Llama
from statistics import mean, stdev

# ---------- simple reasoning dataset ----------
# Each entry is (prompt, substring expected somewhere in the model's answer).
TEST_PROMPTS = [
    ("27 * 14 = ?", "378"),
    ("102 + 589 = ?", "691"),
    ("If a train travels 60 km in 1 hour, how far in 5 hours?", "300"),
    ("15^2 = ?", "225"),
    ("1000 / 25 = ?", "40"),
]

CPU_TDP_WATTS = 65            # typical desktop CPU estimate
ELECTRICITY_PRICE_KWH = 0.90  # adjust for your region


def load_model(model_type, repo_id, file_name, n_ctx):
    """Load a model for benchmarking.

    Args:
        model_type: "transformers" for a HF model, anything else for GGUF.
        repo_id: Hugging Face repo id.
        file_name: local GGUF file name (GGUF path only).
        n_ctx: llama.cpp context length (GGUF path only).

    Returns:
        ``(tokenizer, model)`` tuple for transformers, else a ``Llama`` object.
    """
    if model_type == "transformers":
        tokenizer = AutoTokenizer.from_pretrained(repo_id)
        model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
        return (tokenizer, model)
    if not os.path.exists(file_name):
        from huggingface_hub import hf_hub_download
        # NOTE: local_dir_use_symlinks is deprecated in recent huggingface_hub;
        # passing local_dir alone already yields a regular file copy.
        hf_hub_download(repo_id=repo_id, filename=file_name, local_dir="./")
    return Llama(model_path=file_name, n_ctx=n_ctx)


def generate(model_type, model_obj, prompt, max_new_tokens, temperature, top_p, top_k):
    """Run one generation.

    Returns:
        ``(text, elapsed_seconds, completion_tokens)`` — token count covers
        only newly generated tokens so both backends are comparable.
    """
    if model_type == "transformers":
        tokenizer, model = model_obj
        inputs = tokenizer(prompt, return_tensors="pt")
        prompt_len = inputs["input_ids"].shape[-1]
        start = time.time()
        with torch.no_grad():
            # do_sample=True so temperature/top_p/top_k are actually honored;
            # without it transformers decodes greedily and ignores them.
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=temperature,
                top_p=top_p,
                top_k=top_k,
            )
        elapsed = time.time() - start
        text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # BUG FIX: previously counted prompt + completion tokens, which
        # inflated tokens/sec for the transformers backend.
        tokens = outputs.shape[-1] - prompt_len
        return text, elapsed, tokens

    start = time.time()
    out = model_obj(
        prompt,
        max_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
    )
    elapsed = time.time() - start
    text = out["choices"][0]["text"]
    # Prefer the backend's exact completion-token count over a crude
    # whitespace split; fall back if "usage" is absent.
    tokens = out.get("usage", {}).get("completion_tokens", len(text.split()))
    return text, elapsed, tokens


def reasoning_score(model_type, model_obj):
    """Fraction of TEST_PROMPTS whose expected answer appears in the output."""
    correct = 0
    for question, answer in TEST_PROMPTS:
        out, _, _ = generate(model_type, model_obj, question, 32, 0.2, 0.9, 40)
        if answer in out:
            correct += 1
    return correct / len(TEST_PROMPTS)


def energy_cost(seconds):
    """Estimated electricity cost of ``seconds`` of CPU time at full TDP."""
    kwh = (CPU_TDP_WATTS * seconds) / (1000 * 3600)
    return kwh * ELECTRICITY_PRICE_KWH


def benchmark(model_type, repo_id, file_name="", runs=3):
    """Load the model, run ``runs`` timed generations plus the reasoning set.

    Returns:
        dict with average latency, tokens/sec, latency std-dev, reasoning
        score in [0, 1], and an energy-cost estimate for one average run.
    """
    model_obj = load_model(model_type, repo_id, file_name, 2048)
    times = []
    toks = []
    for _ in range(runs):
        _, t, tok = generate(
            model_type, model_obj, "Explain gravity simply.", 128, 0.7, 0.9, 40
        )
        times.append(t)
        # Guard against a pathological near-zero elapsed time.
        toks.append(tok / t if t > 0 else 0.0)
    avg_time = mean(times)
    toksec = mean(toks)
    stability = stdev(times) if len(times) > 1 else 0
    score = reasoning_score(model_type, model_obj)
    energy = energy_cost(avg_time)
    return {
        "avg_inference_time": round(avg_time, 2),
        "tokens_per_sec": round(toksec, 2),
        "stability_std": round(stability, 3),
        "reasoning_score": round(score, 2),
        "energy_cost_estimate": round(energy, 5),
    }


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--type", choices=["transformers", "gguf"], required=True)
    parser.add_argument("--repo", required=True)
    parser.add_argument("--file", default="")
    args = parser.parse_args()
    result = benchmark(args.type, args.repo, args.file)
    print(result)