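# CPU benchmark for small language models: times generation, measures
# throughput and run-to-run stability, runs a tiny arithmetic reasoning
# check, and estimates electricity cost for transformers and GGUF backends.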
import os
import time
from statistics import mean, stdev

import psutil
import torch
from llama_cpp import Llama
from transformers import AutoTokenizer, AutoModelForCausalLM
# ---------- simple reasoning dataset ----------
TEST_PROMPTS = [
("27 * 14 = ?", "378"),
("102 + 589 = ?", "691"),
("If a train travels 60 km in 1 hour, how far in 5 hours?", "300"),
("15^2 = ?", "225"),
("1000 / 25 = ?", "40")
]
CPU_TDP_WATTS = 65  # typical desktop CPU power draw estimate
ELECTRICITY_PRICE_KWH = 0.90  # local currency per kWh; adjust for your region
def load_model(model_type, repo_id, file_name, n_ctx):
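    # "transformers" loads a full HF checkpoint into PyTorch; any other
    # type is treated as a GGUF file served through llama.cpp.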
if model_type == "transformers":
        tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
return (tokenizer, model)
else:
        if not os.path.exists(file_name):
            from huggingface_hub import hf_hub_download
            # local_dir_use_symlinks is deprecated in recent huggingface_hub
            # releases and no longer needs to be passed.
            hf_hub_download(repo_id=repo_id, filename=file_name, local_dir="./")
llm = Llama(model_path=file_name, n_ctx=n_ctx)
return llm
def generate(model_type, model_obj, prompt, max_new_tokens, temperature, top_p, top_k):
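    # Returns (generated_text, elapsed_seconds, generated_token_count)
    # with the same meaning for both backends.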
if model_type == "transformers":
tokenizer, model = model_obj
        inputs = tokenizer(prompt, return_tensors="pt")
        prompt_len = inputs["input_ids"].shape[-1]
        start = time.time()
        with torch.no_grad():
            # do_sample=True is required for temperature/top_p/top_k to take
            # effect; otherwise generate() falls back to greedy decoding.
            outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=True,
                                     temperature=temperature, top_p=top_p, top_k=top_k)
        end = time.time()
        # Decode and count only the newly generated tokens (not the prompt)
        # so both backends report comparable throughput.
        new_tokens = outputs[0][prompt_len:]
        text = tokenizer.decode(new_tokens, skip_special_tokens=True)
        return text, end - start, len(new_tokens)
else:
        start = time.time()
        out = model_obj(prompt, max_tokens=max_new_tokens, temperature=temperature,
                        top_p=top_p, top_k=top_k)
        end = time.time()
        text = out["choices"][0]["text"]
        # Use the backend's own completion token count; len(text.split())
        # counts words, not tokens, and skews tokens/sec.
        tokens = out["usage"]["completion_tokens"]
        return text, end - start, tokens
def reasoning_score(model_type, model_obj):
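    # Loose scoring: an answer counts as correct if the expected string
    # appears anywhere in the generated text.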
correct = 0
for q, ans in TEST_PROMPTS:
out, _, _ = generate(model_type, model_obj, q, 32, 0.2, 0.9, 40)
if ans in out:
correct += 1
return correct / len(TEST_PROMPTS)
def energy_cost(seconds):
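    # Rough estimate: assume the CPU draws its full TDP for the whole run;
    # watt-seconds / (1000 * 3600) gives kWh, then multiply by the local price.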
kwh = (CPU_TDP_WATTS * seconds) / (1000 * 3600)
return kwh * ELECTRICITY_PRICE_KWH
def benchmark(model_type, repo_id, file_name="", runs=3):
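    # Loads the model once, times `runs` generations of the same prompt,
    # then aggregates latency, throughput, stability, and reasoning metrics.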
process = psutil.Process(os.getpid())
model_obj = load_model(model_type, repo_id, file_name, 2048)
times = []
toks = []
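    # The first run typically carries one-off warm-up costs (allocations,
    # cache fills), which show up in stability_std.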
for _ in range(runs):
text, t, tok = generate(model_type, model_obj, "Explain gravity simply.", 128, 0.7, 0.9, 40)
times.append(t)
toks.append(tok/t)
avg_time = mean(times)
toksec = mean(toks)
stability = stdev(times) if len(times) > 1 else 0
score = reasoning_score(model_type, model_obj)
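    # Cost of one average-length generation, not the full benchmark run.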
energy = energy_cost(avg_time)
    return {
        "avg_inference_time": round(avg_time, 2),
        "tokens_per_sec": round(toksec, 2),
        "stability_std": round(stability, 3),
        "reasoning_score": round(score, 2),
        "energy_cost_estimate": round(energy, 5),
        # resident memory after all runs (model weights dominate)
        "ram_used_mb": round(process.memory_info().rss / 1e6, 1),
    }
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--type", choices=["transformers","gguf"], required=True)
parser.add_argument("--repo", required=True)
parser.add_argument("--file", default="")
args = parser.parse_args()
result = benchmark(args.type, args.repo, args.file)
print(result)