# joaogante's picture
# rearrange layout for blog post
# a39a042
import matplotlib
matplotlib.use('Agg')
import functools
import gradio as gr
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# File the rendered plot is saved to; get_plot writes to this path and returns it.
FIGURE_PATH = "plt.png"
# Resolution (dots per inch) passed to savefig when writing the plot.
FIG_DPI = 300
def get_plot(task, gpu, omit_offload):
    """Render the Greedy-vs-Assisted latency bar chart for one task/GPU pair.

    Reads ``data.csv``, keeps the rows matching ``task`` and ``gpu``, plots the
    ``Greedy`` and ``Assisted`` columns side by side for every model/dtype
    combination, and saves the figure to ``FIGURE_PATH``. Every call overwrites
    that same file.

    Args:
        task: Value to match against the ``task`` column.
        gpu: Value to match against the ``gpu`` column.
        omit_offload: ``"Yes"`` keeps only rows whose ``offload`` column is 0;
            any other value keeps all rows.

    Returns:
        ``FIGURE_PATH``, the path the figure was written to.
    """
    # slice the dataframe according to the inputs
    df = pd.read_csv("data.csv")
    keep = (df["task"] == task) & (df["gpu"] == gpu)
    if omit_offload == "Yes":
        keep &= df["offload"] == 0
    # Take an explicit copy so the column assignment below never writes into a
    # slice of the original frame (avoids pandas' chained-assignment warning).
    df = df[keep].copy()

    # combine model name and dtype
    df["model and dtype"] = df["model_name"].str.cat(df[["dtype"]], sep=", ")

    # fuse the two columns to be compared (original and assisted generation)
    df = df.melt(
        id_vars=["task", "gpu", "model and dtype", "offload"],
        value_vars=["Greedy", "Assisted"],
        var_name="generation_type",
        value_name="generation_time",
    )

    g = sns.catplot(
        data=df,
        kind="bar",
        x="model and dtype",
        y="generation_time",
        hue="generation_type",
        palette={"Greedy": "blue", "Assisted": "orange"},
        alpha=.9,
    )
    g.despine(left=True)
    g.set_axis_labels("Model size and dtype", "Latency (ms/token)")
    g.set_xticklabels(fontsize=7)
    g.set_yticklabels(fontsize=7)

    # Use the public legend accessor for both tweaks, and tolerate a grid that
    # ended up without a legend.
    legend = g.legend
    if legend is not None:
        legend.set_title("Generation Type")
        plt.setp(legend.get_texts(), fontsize='7')  # for legend text

    # Add the number to the top of each bar
    ax = g.facet_axis(0, 0)
    for container in ax.containers:
        ax.bar_label(container, fontsize=7)

    # Save this grid's own figure rather than whatever pyplot considers
    # current, then close it: catplot creates a new figure on every call and
    # pyplot keeps each one alive until it is explicitly closed.
    fig = ax.figure
    try:
        fig.savefig(FIGURE_PATH, dpi=FIG_DPI)
    finally:
        plt.close(fig)
    return FIGURE_PATH
# components shared across tabs
omit_offload_fn = functools.partial(
    gr.Radio, ["Yes", "No"], value="No", label="Omit cases with memory offload?", interactive=True
)


def gpu_selector_fn(gpu_list):
    """Return a GPU dropdown over ``gpu_list``, preselecting its last entry."""
    return gr.Dropdown(
        gpu_list, value=gpu_list[-1], label="GPU", interactive=True
    )


_ALL_GPUS = ("3090", "T4", "T4 *2", "A100 (80GB)")

# Model section shared by the two OPT tabs.
_OPT_MODELS_MD = (
    "\n"
    "### Assistant Model\n"
    "- `facebook/opt-125m`\n"
    "### Model Names:\n"
    "- 1.3B: `facebook/opt-1.3b`\n"
    "- 6.7B: `facebook/opt-6.7b`\n"
    "- 30B: `facebook/opt-30b`\n"
    "- 66B: `facebook/opt-66b`\n"
    "### Dataset used as input prompt:\n"
)

# One entry per benchmark tab, in display order:
# (tab label, task name handed to get_plot, GPUs offered, notes markdown).
# The task name is matched against the "task" column of data.csv by get_plot,
# so it must not be edited independently of the data.
# NOTE(review): "ARS" in the Whisper entries is presumably a typo for "ASR";
# left untouched because the task name has to match the data file -- confirm.
_BENCHMARK_TABS = (
    (
        "OPT: Open",
        "OPT: Open Text Generation",
        _ALL_GPUS,
        _OPT_MODELS_MD
        + "- C4 (en, validation set)\n",
    ),
    (
        "OPT: Summ",
        "OPT: Summarization",
        _ALL_GPUS,
        _OPT_MODELS_MD
        + "- CNN Dailymail (3.0.0, validation set)\n",
    ),
    (
        "Whisper: ARS",
        "Whisper: ARS",
        ("3090", "T4"),
        "\n"
        "### Assistant Model\n"
        "- `openai/whisper-tiny`\n"
        "### Model Names:\n"
        "- large-v2: `openai/whisper-large-v2`\n"
        "### Dataset used as input prompt:\n"
        "- Librispeech ARS (clean, validation set)\n",
    ),
    (
        "CodeGen: Code",
        "CodeGen: Code Generation",
        _ALL_GPUS,
        "\n"
        "### Assistant Model\n"
        "- `Salesforce/codegen-350M-mono`\n"
        "### Model Names:\n"
        "- 2B: `Salesforce/codegen-2B-mono`\n"
        "- 6B: `Salesforce/codegen-6B-mono`\n"
        "- 16B: `Salesforce/codegen-16B-mono`\n"
        "### Dataset used as input prompt:\n"
        "- The Stack (python)\n",
    ),
    (
        "Flan-T5: Summ",
        "Flan-T5: Summarization",
        _ALL_GPUS,
        "\n"
        "### Assistant Model\n"
        "- `google/flan-t5-small`\n"
        "### Model Names:\n"
        "- large: `google/flan-t5-large`\n"
        "- xl: `google/flan-t5-xl`\n"
        "- xxl: `google/flan-t5-xxl`\n"
        "- ul2: `google/flan-ul2`\n"
        "### Dataset used as input prompt:\n"
        "- CNN Dailymail (3.0.0, validation set)\n",
    ),
)


def _add_benchmark_tab(label, task, gpu_list, notes):
    """Add one benchmark tab: GPU/offload controls, the plot and the notes.

    Must be called inside an open ``gr.Tabs()`` context.

    Args:
        label: Text shown on the tab.
        task: Task name bound as the first argument of ``get_plot``.
        gpu_list: GPUs offered in the dropdown; the last one is the default.
        notes: Markdown rendered below the plot.
    """
    gpu_choices = list(gpu_list)
    plot_fn = functools.partial(get_plot, task)
    # NOTE(review): the source this was rebuilt from had lost its indentation,
    # so the placement of the plot/notes relative to the row is reconstructed
    # -- confirm against the intended layout.
    with gr.TabItem(label):
        with gr.Row():
            with gr.Column():
                gpu_selector = gpu_selector_fn(gpu_choices)
            with gr.Column():
                omit_offload = omit_offload_fn()
        # Show plot when the gradio app is initialized, using the same values
        # the two controls start with (last GPU in the list, offload "No").
        plot = gr.Image(value=plot_fn(gpu_choices[-1], "No"))
        gr.Markdown(notes)
        # Update plot when any of the inputs change
        plot_inputs = [gpu_selector, omit_offload]
        gpu_selector.change(fn=plot_fn, inputs=plot_inputs, outputs=plot)
        omit_offload.change(fn=plot_fn, inputs=plot_inputs, outputs=plot)


demo = gr.Blocks()

with demo:
    gr.Markdown(
        "\n"
        "# Assisted Generation Benchmark\n"
    )

    with gr.Tabs():
        for tab_label, tab_task, tab_gpus, tab_notes in _BENCHMARK_TABS:
            _add_benchmark_tab(tab_label, tab_task, tab_gpus, tab_notes)

        with gr.TabItem("Benchmark Info"):
            gr.Dataframe(
                headers=["Parameter", "Value"],
                value=[
                    ["Transformers Version", "4.29dev0"],
                    ["Pytorch Version", "2.0.0"],
                    ["OS", "22.04 LTS (3090) / Debian 10 (other GPUs)"],
                    ["CUDA", "11.8 (3090) / 11.3 (others GPUs)"],
                    ["Number of input samples", "20-100 (depending on the model size)"],
                    ["Is there code to reproduce?", "Yes -- https://github.com/gante/huggingface-demos/tree/main/experiments/faster_generation"],
                ],
            )

demo.launch()