Spaces:

elineve
/

H2OTest

Runtime error

App Files Files Community

H2OTest / llm_studio /app_utils /sections /histogram_card.py

elineve

Upload 301 files

07423df over 1 year ago

raw

history blame contribute delete

3.04 kB

	from typing import List

	import pandas as pd
	from h2o_wave import data, ui


	def histogram_card(
	    x,
	    a=0.1,
	    b=0.9,
	    x_axis_description="text_length",
	    histogram_box="first",
	    title="Text Length (split by whitespace)",
	):
	    """
	    Build a plot card showing the distribution of the values in ``x``.

	    The values are aggregated by ``compute_quantile_df(x, a, b)`` and the
	    resulting ``length`` column is renamed to ``x_axis_description`` before
	    being handed to the plot as packed data.

	    Args:
	        x: values to plot; ``len(x)`` is shown in the x-axis title.
	        a: forwarded to ``compute_quantile_df`` as the lower quantile bound.
	        b: forwarded to ``compute_quantile_df`` as the upper quantile bound.
	        x_axis_description: name of the x field; must not contain spaces.
	        histogram_box: ``box`` argument passed to ``ui.plot_card``.
	        title: card title.

	    Returns:
	        The card created by ``ui.plot_card``.
	    """
	    assert " " not in x_axis_description, (
	        "x_axis_description in histogram card must not contain spaces, "
	        "as the card would not be rendered."
	    )

	    quantiles = compute_quantile_df(x, a, b).rename(
	        columns={"length": x_axis_description}
	    )

	    # Table backing the plot: one row per (value, count, quantile label).
	    plot_data = data(
	        fields=quantiles.columns.tolist(),
	        rows=quantiles.values.tolist(),
	        pack=True,
	    )

	    # Single area mark, coloured by the quantile label of each row.
	    area_mark = ui.mark(
	        type="area",
	        x=f"={x_axis_description}",
	        x_title=f"Total samples: {len(x)}",
	        y="=count",
	        y_title="Count",
	        color="=data_type",
	        shape="circle",
	    )

	    return ui.plot_card(
	        box=histogram_box,
	        title=title,
	        data=plot_data,
	        plot=ui.plot(marks=[area_mark]),
	    )


	def compute_quantile_df(x: List[int], a: float, b: float):
	    """
	    Count the occurrences of each value in ``x`` and label them by quantile.

	    Returns a dataframe with the following columns:
	    - length: a distinct value from x (e.g. length of the text)
	    - count: number of entries in x with this value
	    - data_type: quantile label, one of
	        "first {a * 100}% quantile",
	        "{a * 100}%-{b * 100}% quantile",
	        "last {100 - b * 100}% quantile"

	    Note that quantiles are overlapping on the edges: the boundary rows
	    appear once in each neighbouring band, so the returned index contains
	    duplicates.

	    The first band always contains at least the smallest value and the last
	    band always contains at least the largest value, even for a == 0,
	    b == 1 or very short inputs.

	    Raises:
	        ValueError: if x is empty, or if not 0 <= a <= b <= 1.
	    """
	    if not x:
	        raise ValueError("Input list x is empty")

	    if not 0 <= a <= b <= 1:
	        raise ValueError(
	            "Values of a and b must be in [0, 1] "
	            "and a should be less than or equal to b"
	        )

	    x_axis_description = "length"
	    df = pd.DataFrame(x, columns=[x_axis_description])
	    df["count"] = 1
	    # One row per distinct value, sorted ascending, with a fresh 0..m-1 index
	    # so that labels and positions coincide for the .loc slice below.
	    df_quantile = (
	        df.groupby([x_axis_description])
	        .sum()
	        .reset_index()
	        .sort_values(by=x_axis_description)[[x_axis_description, "count"]]
	    )

	    sorted_data = sorted(x)
	    n_samples = len(sorted_data)

	    # a == 1 would index one past the end; clamp to the largest value.
	    first_idx = min(int(n_samples * a), n_samples - 1)
	    first_quantile = sorted_data[first_idx]

	    # A tail of 0 samples would turn into sorted_data[-0] == sorted_data[0]
	    # (the minimum) and mark *every* row as "last"; use at least one sample.
	    tail_size = max(1, int(n_samples * (1 - b)))
	    last_quantile = sorted_data[-tail_size]

	    df_first = df_quantile.loc[df_quantile[x_axis_description] <= first_quantile].copy()
	    df_first["data_type"] = f"first {int(a * 100)}% quantile"
	    df_last = df_quantile.loc[df_quantile[x_axis_description] >= last_quantile].copy()
	    df_last["data_type"] = f"last {100 - int(b * 100)}% quantile"
	    df_quantile["data_type"] = f"{int(a * 100)}%-{int(b * 100)}% quantile"

	    # .loc slicing is inclusive on both ends: the middle band starts on the
	    # last row of the first band and ends on the first row of the last band.
	    middle_quantile_min = max(0, len(df_first) - 1)
	    middle_quantile_max = len(df_quantile) - len(df_last)

	    return pd.concat(
	        [
	            df_first,
	            df_quantile.loc[middle_quantile_min:middle_quantile_max],
	            df_last,
	        ]
	    )