import gradio as gr
from transformers import AutoConfig  # Required for Hugging Face integration
from calc_params import calc_params  # Import calc_params from the new file
import math

# ---- Helper Functions ---- #
def get_hf_model_args(hf_model_name_or_path):
    try:
        config = AutoConfig.from_pretrained(hf_model_name_or_path, trust_remote_code=True).to_dict()
    except Exception as e:
        raise gr.Error(f"Error fetching Hugging Face model: {str(e)}")

    # Extract relevant values from the config
    num_layers = config.get("num_hidden_layers", None)
    hidden_size = config.get("hidden_size", None)
    num_attention_heads = config.get("num_attention_heads", None)
    vocab_size = config.get("vocab_size", None)
    sequence_length = config.get("max_position_embeddings", None)

    return {
        "num_layers": num_layers,
        "hidden_size": hidden_size,
        "num_attention_heads": num_attention_heads,
        "vocab_size": vocab_size,
        "sequence_length": sequence_length,
    }
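
# Config keys that a given architecture lacks (e.g. "max_position_embeddings") come back
# as None here; calc_mem() below falls back to the user-supplied values via its `or` chains.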

# ---- Update Gradio inputs with Hugging Face model config ---- #
def update_from_hf_model(hf_model_name_or_path):
    model_params = get_hf_model_args(hf_model_name_or_path)

    return (gr.update(value=model_params["num_layers"]),
            gr.update(value=model_params["hidden_size"]),
            gr.update(value=model_params["num_attention_heads"]),
            gr.update(value=model_params["vocab_size"]),
            gr.update(value=model_params["sequence_length"]),
            "")

# ---- Memory Calculation ---- #
def calc_mem(hf_model_name_or_path, num_gpus, tensor_parallel_size, pipeline_parallel_size, batch_size_per_gpu, sequence_length, vocab_size, hidden_size, num_attention_heads, num_layers, ffn_expansion_factor, is_mixed_precision, misc_mem_gib):
    model_params = get_hf_model_args(hf_model_name_or_path) if hf_model_name_or_path else None

    if model_params:
        num_layers = model_params["num_layers"] or num_layers
        hidden_size = model_params["hidden_size"] or hidden_size
        num_attention_heads = model_params["num_attention_heads"] or num_attention_heads
        vocab_size = model_params["vocab_size"] or vocab_size
        sequence_length = model_params["sequence_length"] or sequence_length

    dp_degree = num_gpus / (tensor_parallel_size * pipeline_parallel_size)
    embed_params = 2 * vocab_size * hidden_size
    positional_params = hidden_size * sequence_length
    ln_params = 8 * hidden_size * num_layers + (2 * hidden_size)
    attention_params = int(2 * (1 + ffn_expansion_factor) * num_layers * hidden_size * hidden_size)
    mlp_params = ffn_expansion_factor * num_layers * hidden_size * hidden_size
    total_params = embed_params + positional_params + ln_params + attention_params + mlp_params

    bytes_per_param = 2 if is_mixed_precision else 4
    model_mem = total_params * bytes_per_param
    per_gpu_mem_gib = (model_mem / (tensor_parallel_size * pipeline_parallel_size)) / 1024**3 + misc_mem_gib

    return f"Per-GPU Memory Required for Training: {per_gpu_mem_gib:.2f} GiB"

# ---- FLOP Calculation ---- #
def calc_flops(vocab_size, hidden_size, sequence_length, num_layers, kv_size_ratio, topk, moe, num_experts, expert_interval, batch_size, tokens, checkpoint_activations, ffn_expansion_factor, infer):
    # An A_(m x k) X B_(k x n) matrix multiplication requires 2m x k x n FLOPs (factor of 2 needed to account for multiplies and adds)
    tokens = 1e9 * tokens

    # determine the flops factor.
    iter_factor = 3
    if checkpoint_activations:
        iter_factor += 1
    if infer:
        iter_factor = 1
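    # iter_factor follows the usual training-cost heuristic: the backward pass costs
    # roughly twice the forward pass (3x total), Megatron-style activation checkpointing
    # adds roughly one extra forward pass (4x), and inference-only runs just the forward pass (1x).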

    qkv_flops = int(iter_factor * 2 * (1 + 2 * kv_size_ratio) * num_layers * tokens * hidden_size * hidden_size)
    attention_matrix_flops = iter_factor * 2 * num_layers * tokens * sequence_length * hidden_size
    attention_over_values_flops = iter_factor * 2 * num_layers * tokens * sequence_length * hidden_size
    linear_projection_flops = iter_factor * 2 * num_layers * tokens * hidden_size * hidden_size
    ffn_flops = int(iter_factor * 2 * ffn_expansion_factor) * num_layers * tokens * hidden_size * hidden_size
    embedding_flops = 6 * tokens * hidden_size * vocab_size

    if moe and topk > 1:
        ffn_flops += ffn_flops * topk / expert_interval

    if moe:
        gating_flops = 2 * num_experts * hidden_size / expert_interval

    total_flops = qkv_flops + attention_matrix_flops + attention_over_values_flops + linear_projection_flops + ffn_flops + embedding_flops

    if moe:
        total_flops += gating_flops
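
    # gating_flops is only defined (and added to the total) when moe is True, so the dense
    # path never references it; all totals are for the full token budget, not per step.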

    def convert_flops(params):
        if params == 0:
            return "0"
        size_name = ("", "KFLOPs", "MFLOPs", "GFLOPs", "TFLOPs", "PFLOPs", "EFLOPs", "ZFLOPs", "YFLOPs")
        i = int(math.floor(math.log(params, 1000)))
        p = math.pow(1000, i)
        s = round(params / p, 2)
        return f"{s} {size_name[i]}"

    return {
        'qkv_flops': convert_flops(qkv_flops),
        'attention_matrix_flops': convert_flops(attention_matrix_flops),
        'attention_over_values_flops': convert_flops(attention_over_values_flops),
        'linear_projection_flops': convert_flops(linear_projection_flops),
        'ffn_flops': convert_flops(ffn_flops),
        'embedding_flops': convert_flops(embedding_flops),
        'total_flops': convert_flops(total_flops)
    }

# ---- Gradio Interface ---- #
with gr.Blocks(theme='ysharma/TransformerCalculatorNew') as demo:
    with gr.Accordion("Credits and General Idea", open=False):
        gr.Markdown("""
        This app is a re-creation of [this calculator](https://github.com/EleutherAI/cookbook/tree/main/calc) from EleutherAI.
        Before training or inference even begins, common practical questions about a potential model must be answered, such as:
        1. How many parameters are we targeting? How should those parameters be allocated within the model?
        1. How many FLOPs does the model from step 1 take to train on t tokens? How about inference?
        1. How much memory does the model from step 1 take to train/infer on d devices? What memory-saving strategies (e.g. parallelism, quantization, etc.) are necessary to fit the model in device memory?
        """)
    with gr.Tab("Memory Calculation"):
    #with gr.TabItem("Memory Calculation"):
        gr.Markdown("""
        ## Memory Calculation
        Memory Calculation estimates the amount of device memory required to train or infer a model. See [Transformers Math 101](https://blog.eleuther.ai/transformer-math/) for more details on how memory overhead is calculated.
        Take this estimation with a grain of salt, because every implementation is different and these calculations were written to match the GPT-NeoX library as closely as possible.
        Even for other training and inference libraries, however, we expect our script to give approximate memory estimations within acceptable error.
        (Please see [LLM finetuning memory requirements](https://blog.scottlogic.com/2023/11/24/llm-mem.html) for a treatment of how specific memory costs may vary framework-to-framework). Other good resources that we consulted are the [ZeRO Paper](https://arxiv.org/abs/1910.02054) and [Reducing Activation Recomputation in Large Transformer Models](https://arxiv.org/pdf/2205.05198.pdf).
        """)
        with gr.Accordion("How to use it?", open=False):
            gr.Markdown("""
            ## To Use
            Fill in the required details below and click 'Calculate Memory' to get a result.
            """)
        with gr.Row():
            with gr.Column("Generatable"):
                gr.Markdown("## Generatable")
                with gr.Group():
                    hf_model_name_or_path = gr.Textbox(
                        label="HuggingFace Model Name or Path",
                        info="Name of the HuggingFace Hub repository or the local file path for it"
                    )
                    sequence_length = gr.Number(
                        label="Sequence Length",
                        value=2048,
                        info="Sequence length used for training"
                    )
                    vocab_size = gr.Number(
                        label="Vocab Size",
                        value=51200,
                        info="How many tokens are in the embedding layer"
                    )
                    hidden_size = gr.Number(
                        label="Hidden Size",
                        value=6144,
                        info="Dimension of the model's hidden size"
                    )
                    num_attention_heads = gr.Number(
                        label="Number of Attention Heads",
                        value=64,
                        info="Number of attention heads used in the model"
                    )
                    num_layers = gr.Number(
                        label="Number of Layers",
                        value=44,
                        info="Number of transformer layers used in the model"
                    )
            with gr.Column("User Defined"):
                gr.Markdown("## User Defined")
                num_gpus = gr.Number(
                    label="Number of GPUs",
                    value=1,
                    info="Number of GPUs used for training"
                )
                tensor_parallel_size = gr.Number(
                    label="Tensor Parallel Size",
                    value=1,
                    info="Tensor parallel degree (1 if not used)"
                )
                pipeline_parallel_size = gr.Number(
                    label="Pipeline Parallel Size",
                    value=1,
                    info="Pipeline parallel degree (1 if not used)"
                )
                batch_size_per_gpu = gr.Number(
                    label="Batch Size per GPU",
                    value=8,
                    info="Batch size per GPU"
                )
                ffn_expansion_factor = gr.Number(
                    label="FFN Expansion Factor",
                    value=4,
                    info="How much the MLP hidden size expands"
                )
                is_mixed_precision = gr.Checkbox(
                    label="Mixed Precision",
                    value=True,
                    info="Whether mixed precision is enabled"
                )
                misc_mem_gib = gr.Number(
                    label="Miscellaneous Memory Overhead (GiB)",
                    value=5,
                    info="Miscellaneous memory overhead per GPU by DL frameworks, communication libraries, etc."
                )
        calc_memory_button = gr.Button("Calculate Memory")
        memory_result = gr.Textbox(label="Memory Calculation Result", interactive=False)
        calc_memory_button.click(
            calc_mem,
            inputs=[
                hf_model_name_or_path, num_gpus, tensor_parallel_size, pipeline_parallel_size, batch_size_per_gpu, sequence_length, vocab_size, hidden_size, num_attention_heads, num_layers, ffn_expansion_factor, is_mixed_precision, misc_mem_gib
            ],
            outputs=memory_result
        )
        hf_model_name_or_path.change(
            fn=update_from_hf_model,
            inputs=[hf_model_name_or_path],
            outputs=[num_layers, hidden_size, num_attention_heads, vocab_size, sequence_length, memory_result]
        )
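        # Typing a Hub repo id (or local path) autofills the architecture fields above and,
        # via the trailing "" returned by update_from_hf_model, clears the previous result.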

    # Parameter Calculation Tab
    with gr.TabItem("Parameter Calculation"):
        gr.Markdown("""
        ## Parameter Calculation
        Parameter Calculation estimates the number of parameters in a given model based on its hyperparameters.
        Such calculations are important for determining memory overheads and FLOPs, or for estimating the size of an unknown transformer model.
        We also found the following resources helpful:
        [How does GPT-3 spend its 175B parameters?](https://www.lesswrong.com/posts/3duR8CrvcHywrnhLo/how-does-gpt-3-spend-its-175b-parameters)
        and [LLM Parameter Counting](https://kipp.ly/transformer-param-count/).
        Note that the Hugging Face Hub file explorer already shows this information for `.safetensors` files.
        ## How To Use
        Simply input the model details, such as the hidden size, number of layers, and attention heads, and press 'Calculate Parameters' to get a result.
        """)
        with gr.Row():
            with gr.Column("Generatable"):
                with gr.Group():
                    hf_model_name_or_path = gr.Textbox(
                        label="HuggingFace Model Name or Path",
                        info="Name of the HuggingFace Hub repository or the local file path for it"
                    )
                    vocab_size = gr.Number(
                        label="Vocab Size",
                        value=51200,
                        info="How many tokens are in the embedding layer"
                    )
                    hidden_size = gr.Number(
                        label="Hidden Size",
                        value=6144,
                        info="Dimension of the model's hidden size"
                    )
                    sequence_length = gr.Number(
                        label="Sequence Length",
                        value=2048,
                        info="Sequence length used for training"
                    )
                    num_layers = gr.Number(
                        label="Number of Layers",
                        value=44,
                        info="Number of transformer layers used in the model"
                    )
            with gr.Column("User Defined"):
                tied_embeddings = gr.Checkbox(
                    label="Tied Embeddings",
                    value=False,
                    info="Whether embeddings are tied (shared between input and output)"
                )
                ffn_expansion_factor = gr.Number(
                    label="FFN Expansion Factor",
                    value=4,
                    info="How much the MLP hidden size expands"
                )
                num_mlp_linears = gr.Number(
                    label="Number of Linear Layers per MLP Block",
                    value=2,
                    info="How many linear layers per MLP block"
                )
                kv_size_ratio = gr.Number(
                    label="KV Size Ratio",
                    value=1.0,
                    info="Ratio of key/value heads to query heads. 1.0 for MHA, 1/num_attention_heads for MQA"
                )
                with gr.Accordion("MoE Parameters", open=False):
                    moe = gr.Checkbox(
                        label="MoE",
                        value=False,
                        info="Whether the model is MoE"
                    )
                    num_experts = gr.Number(
                        label="Number of Experts",
                        value=8,
                        info="Number of experts for MoE"
                    )
                    expert_interval = gr.Number(
                        label="Expert Interval",
                        value=1,
                        info="Expert interval for MoE"
                    )
                    topk = gr.Number(
                        label="Top k Routing",
                        value=1,
                        info="Top k routing for MoE"
                    )
        calc_param_button = gr.Button("Calculate Parameters")
        param_result = gr.Textbox(label="Parameter Calculation Result", interactive=False)
        calc_param_button.click(
            calc_params,
            inputs=[vocab_size, tied_embeddings, hidden_size, sequence_length, num_layers, moe, num_experts, expert_interval, topk, ffn_expansion_factor, num_mlp_linears, kv_size_ratio],
            outputs=param_result
        )
        hf_model_name_or_path.change(
            fn=update_from_hf_model,
            inputs=[hf_model_name_or_path],
            outputs=[num_layers, hidden_size, num_attention_heads, vocab_size, sequence_length, param_result]
        )

    # New FLOP Calculation Tab
    with gr.TabItem("FLOP Calculation"):
        gr.Markdown("""
        ## FLOP Calculation
        FLOP Calculation estimates the number of theoretical FLOPs required to train a model on t tokens.
        See [Transformers Math 101](https://blog.eleuther.ai/transformer-math/) for more details on how FLOPs are calculated.
        Other good resources that we consulted are the [Chinchilla Paper](https://arxiv.org/abs/2203.15556) and
        [Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM](https://people.eecs.berkeley.edu/~matei/papers/2021/sc_megatron_lm.pdf).
        """)
        with gr.Row():
            with gr.Column("Generatable"):
                with gr.Group():
                    hf_model_name_or_path = gr.Textbox(
                        label="HuggingFace Model Name or Path",
                        info="Name of the HuggingFace Hub repository or the local file path for it"
                    )
                    vocab_size = gr.Number(
                        label="Vocab Size",
                        value=51200,
                        info="How many tokens are in the embedding layer"
                    )
                    hidden_size = gr.Number(
                        label="Hidden Size",
                        value=6144,
                        info="Dimension of the model's hidden size"
                    )
                    sequence_length = gr.Number(
                        label="Sequence Length",
                        value=2048,
                        info="Sequence length used for training"
                    )
                    num_layers = gr.Number(
                        label="Number of Layers",
                        value=44,
                        info="Number of transformer layers used in the model"
                    )
            with gr.Column("User Defined"):
                kv_size_ratio = gr.Number(
                    label="KV Size Ratio",
                    value=1.0,
                    info="Ratio of kv heads to query heads used in the model. 1.0 for MHA"
                )
                ffn_expansion_factor = gr.Number(
                    label="FFN Expansion Factor",
                    value=4,
                    info="How much the MLP hidden size expands"
                )
                batch_size = gr.Number(
                    label="Batch Size",
                    value=1,
                    info="Global batch size in units of samples"
                )
                tokens = gr.Number(
                    label="Number of GigaTokens",
                    value=300,
                    info="Total number of GigaTokens for training"
                )
                checkpoint_activations = gr.Checkbox(
                    label="Checkpoint Activations",
                    value=True,
                    info="Whether Megatron-style activation checkpointing is being used"
                )
                infer = gr.Checkbox(
                    label="Inference-Only",
                    value=False,
                    info="Whether the model is being used for inference only"
                )
                # MoE parameters hidden in accordion
                with gr.Accordion("Mixture of Experts (MoE)", open=False):
                    moe = gr.Checkbox(
                        label="Mixture of Experts (MoE)",
                        value=False,
                        info="Whether the model uses Mixture of Experts"
                    )
                    num_experts = gr.Number(
                        label="Number of Experts",
                        value=128,
                        info="Number of experts for Mixture of Experts (MoE)"
                    )
                    expert_interval = gr.Number(
                        label="Expert Interval",
                        value=2,
                        info="Expert interval for Mixture of Experts (MoE)"
                    )
                    topk = gr.Number(
                        label="Top K Routing for MoE",
                        value=1,
                        info="Top k routing for Mixture of Experts (MoE)"
                    )
        calc_flops_button = gr.Button("Calculate FLOPs")
        flops_result = gr.JSON(label="FLOP Calculation Result")
        calc_flops_button.click(
            calc_flops,
            inputs=[vocab_size, hidden_size, sequence_length, num_layers, kv_size_ratio, topk, moe, num_experts, expert_interval, batch_size, tokens, checkpoint_activations, ffn_expansion_factor, infer],
            outputs=flops_result
        )
        hf_model_name_or_path.change(
            fn=update_from_hf_model,
            inputs=[hf_model_name_or_path],
            outputs=[num_layers, hidden_size, vocab_size, sequence_length]
        )

demo.launch()