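# Evaluates dense and sparse OPT / Llama models on lm-eval-harness benchmarks
# (PIQA, Winogrande, COPA, RTE, OpenBookQA, ARC, MMLU, HellaSwag) and appends
# per-benchmark accuracies to a CSV file. Sparse modes load MLP/attention router
# checkpoints and apply per-layer activation thresholds.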
import torch
import argparse
import os
import json
import logging
import numpy as np
import csv
import warnings

from hf_models.opt.modeling_opt_routers_topk import (
    SparseOPTForCausalLM,
    create_hf_mha_router_state_dict,
    create_hf_mlp_router_state_dict
)
from hf_models.llama.modeling_sparse_llama_routers import (
    SparseLlamaForCausalLM,
    create_hf_attn_router_state_dict
)
from hf_models.opt.modeling_sparse_opt_topk import SparseOPTForCausalLM as SparseOPTTopKAttn
from hf_models.llama.modeling_sparse_llama_mha_topk import SparseLlamaForCausalLM as SparseLlamaTopKAttn
from HybridTensor.benchmarks.opt_attn_sparse_topk_perplexity import _update_model_attn_thresholds
from HybridTensor.benchmarks.model_perplexity import compute_attn_layer_sparsity, compute_average_activation
from HybridTensor.utils.activations import ActivationThresholds, build_mlp_topk_lookup, _update_hf_mlp_topk, CONFIGS, MODELS
from HybridTensor.routers.mlp.mlp_router_optim import load_router_dict_from_csv
from HybridTensor.utils.utils import extract_model_name

from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login

from lm_eval.models.huggingface import HFLM
from lm_eval.tasks import TaskManager
import lm_eval

import pandas as pd
from tabulate import tabulate

# Suppress verbose transformers weight-loading messages and FutureWarnings.
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)
warnings.simplefilter(action='ignore', category=FutureWarning)

def read_and_print_results(filepath='results.csv'):
    """
    Reads the CSV file containing evaluation results and prints them in a formatted table.
    """
    if not os.path.exists(filepath):
        print(f"File '{filepath}' not found.")
        return

    df = pd.read_csv(filepath)
    print(tabulate(df, headers='keys', tablefmt='psql', showindex=False))

def save_results_to_csv(results, attn_topk, filepath='eval_results.csv'):
    """
    Extracts benchmark accuracies from results and saves them along with the attn_topk config.

    Parameters:
        results: dict, evaluation results with structure results['results'][<benchmark>]['acc,none']
        attn_topk: float, the attention top-k value used for this run
        filepath: str, CSV file to write to (appends if it exists)
    """
    row = {'attn_topk': attn_topk}
    for benchmark, data in results['results'].items():
        row[benchmark] = data.get('acc,none', None)

    file_exists = os.path.isfile(filepath)
    with open(filepath, 'a', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=row.keys())
        if not file_exists:
            writer.writeheader()
        writer.writerow(row)

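# Note: save_results_to_csv appends one row per run; the resulting file looks roughly like
# (values below are illustrative, not measured):
#   attn_topk,piqa,winogrande,copa
#   0.5,0.79,0.70,0.86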
def _update_model_attn_sparsity(model, attn_th):
    num_layers = model.config.num_hidden_layers

    # OPT keeps its decoder layers under model.model.decoder; Llama keeps them under model.model.layers.
    layers = model.model.decoder.layers if hasattr(model.model, 'decoder') else model.model.layers
    # Note: relies on the module-level `model_name` defined in the __main__ block.
    attn_sparsity_map = compute_attn_layer_sparsity(model_name=model_name, min_th=0.2, critical_th=0.3, attn_sparsity=attn_th)

    for i in range(num_layers):
        layers[i].self_attn.sp_threshold = attn_sparsity_map[i]

    average_act = compute_average_activation(attn_sparsity_map)
    print(f"Attention sparsity {attn_th}: {attn_sparsity_map}")
    print(f"Average activation: {average_act:.2f}")

    return model

def _evaluate_model(model, tokenizer, benchmarks: list, device: str, batch_size: int = 8):
    logging.info("Evaluating on benchmarks: %s", benchmarks)
    lm_obj = HFLM(
        pretrained=model,
        tokenizer=tokenizer,
        device=device,
        batch_size=batch_size
    )
    task_manager = TaskManager()
    num_fewshot = 5
    print(f"Number of fewshot examples: {num_fewshot}")
    results = lm_eval.simple_evaluate(
        model=lm_obj,
        tasks=benchmarks,
        num_fewshot=num_fewshot,
        task_manager=task_manager
    )
    logging.info("Evaluation complete.")
    for benchmark, benchmark_results in results['results'].items():
        logging.info("Results for %s: %s", benchmark.upper(), benchmark_results)
    return results

def _load_model(model_name, num_layers, device, args):
    if args.mode == 'sparse':
        logging.info("Loading sparse model...")
        sp_thresholds = ActivationThresholds(num_layers=num_layers, attn_th=args.attn_topk, mlp_th=args.mlp_topk)

        if args.model_index <= 8:
            # Model indices up to 8 select OPT models.
            model = SparseOPTForCausalLM.from_pretrained(
                model_name,
                device_map=device,
                torch_dtype=torch.float16,
                sp_thresholds=sp_thresholds.activation_threshold,
                mlp_thresholds=sp_thresholds.mlp_threshold,
                attn_implementation="flash_attention_2"
            )
            logging.info("Loading router states...")
            mlp_router_state = create_hf_mlp_router_state_dict(args.mlp_ckpt_dir)
            mha_router_state = create_hf_mha_router_state_dict(args.attn_ckpt_dir)
            model_state = model.state_dict()
            model_state.update(mlp_router_state)
            model_state.update(mha_router_state)
            model.load_state_dict(model_state)
            logging.info("Sparse model loaded with routers!")

            # Per-layer MLP top-k values come from the batch-statistics CSVs.
            mlp_topk_lookup = load_router_dict_from_csv(args.batch_stats_dir, 1)
            _update_hf_mlp_topk(model, mlp_topk_lookup)
            logging.info("Using MLP topk values: %s", mlp_topk_lookup)

            # Keep the first attention layer at full activation.
            model.model.decoder.layers[0].self_attn.sp_threshold = 1.0
        else:
            # Larger model indices select Llama models.
            if not args.static_thresholds:
                attn_sparsity_map = compute_attn_layer_sparsity(model_name=model_name, min_th=0.2, critical_th=0.3, attn_sparsity=args.attn_topk)
                sp_thresholds.load_thresholds(attn_sparsity_map)
                average_act = compute_average_activation(attn_sparsity_map)
                print(f"Layer importance-weighted attention activations: {sp_thresholds.activation_threshold}")
                print(f"Average activation: {average_act:.2f}")

            model = SparseLlamaForCausalLM.from_pretrained(
                model_name,
                device_map=device,
                torch_dtype=torch.float16,
                sp_thresholds=sp_thresholds.activation_threshold,
                attn_implementation="flash_attention_2"
            )
            logging.info("Loading router states...")
            model_state = model.state_dict()
            attn_router_states = create_hf_attn_router_state_dict(args.attn_ckpt_dir)
            model_state.update(attn_router_states)
            model.load_state_dict(model_state)
            logging.info("Sparse model loaded with routers!")

            _update_model_attn_thresholds(model, args.attn_topk)

    elif args.mode == 'sparse_attn':
        logging.info("Loading model with sparse attention")
        sp_thresholds = ActivationThresholds(num_layers=num_layers, attn_th=args.attn_topk)

        if not args.static_thresholds:
            attn_sparsity_map = compute_attn_layer_sparsity(model_name=model_name, min_th=0.2, critical_th=0.3, attn_sparsity=args.attn_topk)
            sp_thresholds.load_thresholds(attn_sparsity_map)
            average_act = compute_average_activation(attn_sparsity_map)
            print(f"Layer importance-weighted attention activations: {sp_thresholds.activation_threshold}")
            print(f"Average activation: {average_act:.2f}")

        if args.model_index <= 8:
            model = SparseOPTTopKAttn.from_pretrained(
                model_name,
                device_map=device,
                torch_dtype=torch.float16,
                sp_thresholds=sp_thresholds.activation_threshold,
                attn_implementation="flash_attention_2"
            )
        else:
            model = SparseLlamaTopKAttn.from_pretrained(
                model_name,
                device_map=device,
                torch_dtype=torch.float16,
                sp_thresholds=sp_thresholds.activation_threshold,
                attn_implementation="flash_attention_2"
            )
    else:
        logging.info("Loading dense model...")
        model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device, torch_dtype=torch.float16)
    return model

def arg_parser():
    def str2bool(v):
        # argparse's type=bool treats any non-empty string as True, so parse booleans explicitly.
        return str(v).lower() in ('true', '1', 'yes')

    parser = argparse.ArgumentParser(description='Inference benchmarking')
    parser.add_argument('--batch_size', type=int, default=8)
    parser.add_argument('--model_index', type=int, default=5, help='1-based index into MODELS; indices <= 8 select OPT models, larger ones Llama')
    parser.add_argument('--print_results', type=str2bool, default=True)
    parser.add_argument('--results_dir', type=str, default='results/eval')
    parser.add_argument('--device', type=int, default=100, help="CUDA device index; 100 selects device_map='auto'")
    parser.add_argument('--mode', type=str, default='sparse', choices=['sparse', 'dense', 'sparse_attn'])
    parser.add_argument('--attn_topk', type=float, default=0.5, help='Attention topk for sparse model')
    parser.add_argument('--mlp_topk', type=int, default=2048, help='MLP topk for sparse model')
    parser.add_argument('--delta', type=int, default=128, help='Delta value for MLP topk calculation')
    parser.add_argument('--mlp_ckpt_dir', type=str, default='<PATH_TO_MLP_ROUTER_CHECKPOINTS>')
    parser.add_argument('--attn_ckpt_dir', type=str, default='<PATH_TO_ATTENTION_CHECKPOINTS>')
    parser.add_argument('--batch_stats_dir', type=str, default='configs/mlp_router')
    parser.add_argument('--data_collection', type=str2bool, default=False, help='Collect data for different activation thresholds')
    parser.add_argument('--benchmark', type=str, default='all', help='Options: all, or a single benchmark name')
    parser.add_argument('--note', type=str, default='', help='Note to add to the results filename')
    parser.add_argument('--static_thresholds', type=str2bool, default=True, help='Use static thresholds for attention layers')
    return parser.parse_args()

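# Example invocations (the script filename is illustrative; adjust checkpoint paths to your setup):
#   python eval_sparse.py --model_index 5 --mode dense
#   python eval_sparse.py --model_index 5 --mode sparse --attn_topk 0.5 \
#       --mlp_ckpt_dir <PATH_TO_MLP_ROUTER_CHECKPOINTS> --attn_ckpt_dir <PATH_TO_ATTENTION_CHECKPOINTS>
#   python eval_sparse.py --model_index 9 --mode sparse_attn --attn_topk 0.5 --data_collection True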
if __name__ == "__main__":
    args = arg_parser()

    # Provide a Hugging Face access token here before running (required for gated models).
    login_token = None
    assert login_token is not None, "Please provide a valid Hugging Face token."
    login(token=login_token)

    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    model_name = MODELS[args.model_index - 1]

    logging.info("Evaluating Model: %s", model_name)
    logging.info("Mode: %s", args.mode)

    num_layers = CONFIGS[model_name]['num_layer']
    device = 'auto' if args.device == 100 else f'cuda:{args.device}'

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    model = _load_model(model_name, num_layers, device, args)
    model.eval()

    if args.benchmark == 'all':
        benchmarks = ["piqa", "winogrande", "copa", "rte", "openbookqa", "arc_easy", "arc_challenge", "mmlu", "hellaswag"]
    else:
        benchmarks = [args.benchmark]

    model_name_clean = extract_model_name(model_name)

    if args.data_collection:
        # Sweep over several attention top-k values and append one CSV row per setting.
        assert args.mode != 'dense', "Data collection is only available for sparse models"
        logging.info("Data collection mode enabled.")
        if args.mode == 'sparse':
            filepath = f"{args.results_dir}/eval_results_{model_name_clean}_sparse_sweep_dpsd.csv"
        else:
            filepath = f"{args.results_dir}/eval_results_{model_name_clean}_attn_sweep_dpsd.csv"

        if args.note != '':
            filepath = filepath.replace('.csv', f"_{args.note}.csv")

        attn_topk_values = [0.9, 0.8, 0.7, 0.6, 0.4, 0.3, 0.2, 0.1]

        for attn_topk in attn_topk_values:
            logging.info("Evaluating with attention top-k value: %s", attn_topk)
            if args.static_thresholds:
                _update_model_attn_thresholds(model, attn_topk, mode=args.mode)
            else:
                _update_model_attn_sparsity(model, attn_topk)

            results = _evaluate_model(
                model=model,
                tokenizer=tokenizer,
                benchmarks=benchmarks,
                device=device,
                batch_size=args.batch_size
            )
            save_results_to_csv(results, attn_topk, filepath=filepath)
    else:
        # Single evaluation run at the configured attention top-k value.
        logging.info("Evaluating with attention top-k value: %s", args.attn_topk)
        if args.mode == 'dense':
            filepath = f"{args.results_dir}/eval_results_{model_name_clean}_dense.csv"
        elif args.mode == 'sparse_attn':
            filepath = f"{args.results_dir}/eval_results_{model_name_clean}_sparse_attn_{args.attn_topk}_dpsd.csv"
        else:
            filepath = f"{args.results_dir}/eval_results_{model_name_clean}_test_attn_{args.attn_topk}_dpsd.csv"
        if args.note != '':
            filepath = filepath.replace('.csv', f"_{args.note}.csv")
        results = _evaluate_model(
            model=model,
            tokenizer=tokenizer,
            benchmarks=benchmarks,
            device=device,
            batch_size=args.batch_size
        )
        save_results_to_csv(results, args.attn_topk, filepath=filepath)

    if args.print_results:
        read_and_print_results(filepath=filepath)