Spaces:

joaogante
/

assisted_generation_benchmarks

Running

App Files Files Community

assisted_generation_benchmarks / app.py

joaogante

rearrange layout for blog post

a39a042 over 2 years ago

raw

history blame contribute delete

8.56 kB

	import matplotlib
	matplotlib.use('Agg')

	import functools
	import gradio as gr
	import matplotlib.pyplot as plt
	import seaborn as sns
	import pandas as pd


	# File that get_plot() writes the rendered figure to and returns to its caller.
	FIGURE_PATH = "plt.png"
	# Value passed as `dpi` when get_plot() saves the figure.
	FIG_DPI = 300


	def get_plot(task, gpu, omit_offload):
	    """Render the greedy-vs-assisted latency bar chart for one task/GPU pair.

	    Reads ``data.csv`` from the working directory, keeps the rows whose
	    ``task`` and ``gpu`` columns match the arguments, and draws the ``Greedy``
	    and ``Assisted`` columns as grouped bars, one group per model/dtype.

	    Args:
	        task: Value matched against the ``task`` column.
	        gpu: Value matched against the ``gpu`` column.
	        omit_offload: ``"Yes"`` keeps only rows whose ``offload`` column is 0;
	            any other value keeps every row.

	    Returns:
	        ``FIGURE_PATH``, the file the figure was saved to.
	    """
	    # slice the dataframe according to the inputs
	    df = pd.read_csv("data.csv")
	    selected = (df["task"] == task) & (df["gpu"] == gpu)
	    if omit_offload == "Yes":
	        selected &= (df["offload"] == 0)
	    # Take an explicit copy: a column is added below, and assigning into a
	    # filtered slice triggers pandas' SettingWithCopyWarning.
	    df = df[selected].copy()

	    # combine model name and dtype
	    df["model and dtype"] = df['model_name'].str.cat(df[['dtype']], sep=', ')

	    # fuse the two columns to be compared (original and assisted generation)
	    df = df.melt(
	        id_vars=["task", "gpu", "model and dtype", "offload"],
	        value_vars=["Greedy", "Assisted"],
	        var_name="generation_type",
	        value_name="generation_time",
	    )

	    g = sns.catplot(
	        data=df,
	        kind="bar",
	        x="model and dtype",
	        y="generation_time",
	        hue="generation_type",
	        palette={"Greedy": "blue", "Assisted": "orange"},
	        alpha=.9,
	    )
	    try:
	        g.despine(left=True)
	        g.set_axis_labels("Model size and dtype", "Latency (ms/token)")
	        g.set_xticklabels(fontsize=7)
	        g.set_yticklabels(fontsize=7)
	        g.legend.set_title("Generation Type")
	        plt.setp(g.legend.get_texts(), fontsize=7)  # for legend text

	        # Add the number to the top of each bar
	        ax = g.facet_axis(0, 0)
	        for container in ax.containers:
	            ax.bar_label(container, fontsize=7)

	        # Save this grid's own figure instead of pyplot's global "current
	        # figure", which another call may have replaced in the meantime.
	        # NOTE(review): every call still writes to the same FIGURE_PATH, so
	        # overlapping calls can overwrite each other's image.
	        g.figure.savefig(FIGURE_PATH, dpi=FIG_DPI)
	    finally:
	        # pyplot keeps every figure alive until it is closed; without this
	        # each call would leave one more figure in memory.
	        plt.close(g.figure)
	    return FIGURE_PATH


	# components shared across tabs
	omit_offload_fn = functools.partial(
	    gr.Radio, ["Yes", "No"], value="No", label="Omit cases with memory offload?", interactive=True
	)


	def gpu_selector_fn(gpu_list):
	    """Return a GPU dropdown over `gpu_list`, preselecting its last entry."""
	    return gr.Dropdown(
	        gpu_list, value=gpu_list[-1], label="GPU", interactive=True
	    )


	# One entry per benchmark tab:
	# (tab label, task name passed to get_plot, GPUs offered in the dropdown,
	#  Markdown notes shown with the plot).
	# The Markdown literals are kept flush with the margin so that their content
	# is not altered by code indentation.
	# NOTE(review): "ARS" in the Whisper entry is presumably a typo for "ASR";
	# the task string is passed to get_plot for filtering, so it is left
	# untouched here.
	_BENCHMARK_TABS = [
	    (
	        "OPT: Open",
	        "OPT: Open Text Generation",
	        ["3090", "T4", "T4 *2", "A100 (80GB)"],
	        """
	### Assistant Model
	- `facebook/opt-125m`

	### Model Names:
	- 1.3B: `facebook/opt-1.3b`
	- 6.7B: `facebook/opt-6.7b`
	- 30B: `facebook/opt-30b`
	- 66B: `facebook/opt-66b`

	### Dataset used as input prompt:
	- C4 (en, validation set)
	""",
	    ),
	    (
	        "OPT: Summ",
	        "OPT: Summarization",
	        ["3090", "T4", "T4 *2", "A100 (80GB)"],
	        """
	### Assistant Model
	- `facebook/opt-125m`

	### Model Names:
	- 1.3B: `facebook/opt-1.3b`
	- 6.7B: `facebook/opt-6.7b`
	- 30B: `facebook/opt-30b`
	- 66B: `facebook/opt-66b`

	### Dataset used as input prompt:
	- CNN Dailymail (3.0.0, validation set)
	""",
	    ),
	    (
	        "Whisper: ARS",
	        "Whisper: ARS",
	        ["3090", "T4"],
	        """
	### Assistant Model
	- `openai/whisper-tiny`

	### Model Names:
	- large-v2: `openai/whisper-large-v2`

	### Dataset used as input prompt:
	- Librispeech ARS (clean, validation set)



	""",
	    ),
	    (
	        "CodeGen: Code",
	        "CodeGen: Code Generation",
	        ["3090", "T4", "T4 *2", "A100 (80GB)"],
	        """
	### Assistant Model
	- `Salesforce/codegen-350M-mono`

	### Model Names:
	- 2B: `Salesforce/codegen-2B-mono`
	- 6B: `Salesforce/codegen-6B-mono`
	- 16B: `Salesforce/codegen-16B-mono`

	### Dataset used as input prompt:
	- The Stack (python)

	""",
	    ),
	    (
	        "Flan-T5: Summ",
	        "Flan-T5: Summarization",
	        ["3090", "T4", "T4 *2", "A100 (80GB)"],
	        """
	### Assistant Model
	- `google/flan-t5-small`

	### Model Names:
	- large: `google/flan-t5-large`
	- xl: `google/flan-t5-xl`
	- xxl: `google/flan-t5-xxl`
	- ul2: `google/flan-ul2`

	### Dataset used as input prompt:
	- CNN Dailymail (3.0.0, validation set)
	""",
	    ),
	]


	def _build_benchmark_tab(label, task, gpu_list, notes):
	    """Add one benchmark tab: GPU/offload controls, the plot, and its notes.

	    Must be called inside an open ``gr.Tabs()`` context. The initial plot is
	    rendered for the last entry of `gpu_list` with offload cases included,
	    matching the default values of the two controls.
	    """
	    plot_fn = functools.partial(get_plot, task)
	    with gr.TabItem(label):
	        # NOTE(review): nesting assumed to be a row holding the two controls,
	        # with the plot and notes below it -- confirm against the deployed
	        # layout.
	        with gr.Row():
	            with gr.Column():
	                gpu_selector = gpu_selector_fn(gpu_list)
	            with gr.Column():
	                omit_offload = omit_offload_fn()

	        # Show plot when the gradio app is initialized
	        plot = gr.Image(value=plot_fn(gpu_list[-1], "No"))
	        gr.Markdown(notes)

	        # Update plot when any of the inputs change
	        plot_inputs = [gpu_selector, omit_offload]
	        gpu_selector.change(fn=plot_fn, inputs=plot_inputs, outputs=plot)
	        omit_offload.change(fn=plot_fn, inputs=plot_inputs, outputs=plot)


	demo = gr.Blocks()

	with demo:
	    gr.Markdown(
	        """
	# Assisted Generation Benchmark
	"""
	    )

	    with gr.Tabs():
	        for tab_label, tab_task, tab_gpus, tab_notes in _BENCHMARK_TABS:
	            _build_benchmark_tab(tab_label, tab_task, tab_gpus, tab_notes)

	        with gr.TabItem("Benchmark Info"):
	            gr.Dataframe(
	                headers=["Parameter", "Value"],
	                value=[
	                    ["Transformers Version", "4.29dev0"],
	                    ["Pytorch Version", "2.0.0"],
	                    ["OS", "22.04 LTS (3090) / Debian 10 (other GPUs)"],
	                    ["CUDA", "11.8 (3090) / 11.3 (others GPUs)"],
	                    ["Number of input samples", "20-100 (depending on the model size)"],
	                    ["Is there code to reproduce?", "Yes -- https://github.com/gante/huggingface-demos/tree/main/experiments/faster_generation"],
	                ],
	            )

	demo.launch()