Spaces:
Running
Running
| import json | |
| import os | |
| import re | |
| from datetime import datetime | |
| from typing import Tuple | |
| import pandas as pd | |
| from bs4 import BeautifulSoup | |
def format_datetime(dt_str: str) -> str:
    """Render an ISO-8601 timestamp as 'YYYY-MM-DD HH:MM:SS' for display.

    Replaces the 'T' separator with a space and drops any '+HH:MM'
    timezone suffix.

    :param dt_str: String representing a datetime in ISO format
    :return: Formatted datetime string
    """
    readable, _, _ = dt_str.replace("T", " ").partition("+")
    return readable
def read_json_line_by_line(file_path, commit_hash=None):
    """Parse a JSON-Lines file, optionally keeping only one commit's records.

    Each line is decoded independently; lines that are not valid JSON are
    reported and skipped rather than aborting the whole read.

    :param file_path: Path to the JSON file
    :param commit_hash: Optional commit hash to filter data
    :return: List of parsed JSON objects
    """
    records = []
    with open(file_path, "r") as handle:
        for raw_line in handle:
            try:
                record = json.loads(raw_line.strip())
            except json.JSONDecodeError:
                print(f"Skipping invalid JSON in {file_path}: {raw_line}")
                continue
            # Keep everything when no hash filter was requested.
            if commit_hash is None or record.get("commit_hash") == commit_hash:
                records.append(record)
    return records
def calculate_change(new: float, old: float, metric_name: str) -> Tuple[float, str]:
    """Return the metric delta (new - old) paired with a status emoji.

    For most metrics an increase is good (green); for WER-style metrics
    lower is better, so the emoji mapping is inverted. Deltas smaller
    than 1 in magnitude are treated as "no change".
    """
    delta = new - old
    if abs(delta) < 1:
        return (delta, "βοΈ")
    lower_is_better = "wer" in metric_name.lower()
    # An increase is an improvement exactly when higher is better.
    if (delta > 0) != lower_is_better:
        return (delta, "π’")
    return (delta, "β")
def has_changes(config, prev_dict, curr_dict):
    """Return True if any tracked metric moved by at least 1 between runs."""
    current = curr_dict[config]
    previous = prev_dict[config]
    tracked = ("speed", "tokens_per_second", "average_wer", "qoi")
    # A metric only counts when present in both snapshots; 1 is the
    # significance threshold used throughout the report.
    return any(
        abs(current[metric] - previous[metric]) >= 1
        for metric in tracked
        if metric in current and metric in previous
    )
def format_metrics_table(config, prev_dict, curr_dict, improved, regressed):
    """Build a fixed-width metrics table for one config and tally movements.

    Only metrics whose delta is at least 1 are listed. For every listed
    metric, the ``improved``/``regressed`` counters are incremented in
    place; WER is inverted because a lower WER is better.

    :param config: Key identifying the configuration in both dicts.
    :param prev_dict: Mapping of config -> previous metrics dict.
    :param curr_dict: Mapping of config -> current metrics dict.
    :param improved: Counter dict keyed by metric key, mutated in place.
    :param regressed: Counter dict keyed by metric key, mutated in place.
    :return: Markdown code-fenced table string.
    """
    curr = curr_dict[config]
    prev = prev_dict[config]
    metrics = [
        ("Speed", "speed"),
        ("Tok/s", "tokens_per_second"),
        ("WER", "average_wer"),
        ("QoI", "qoi"),
    ]
    table = "```\nMetric Previous Current Change\n--------------------------------\n"
    for metric_name, key in metrics:
        # Skip metrics missing from either snapshot.
        if key not in curr or key not in prev:
            continue
        curr_val = curr[key]
        prev_val = prev[key]
        pct_change, _ = calculate_change(curr_val, prev_val, metric_name)
        # Only show metrics with changes at or above the 1-unit threshold.
        if abs(pct_change) < 1:
            continue
        table += f"{metric_name:<9} {prev_val:<11.2f} {curr_val:<10.2f} {pct_change:.2f}\n"
        # Higher is better for every metric except WER, where an increase
        # means a regression.
        got_better = (pct_change > 0) == ("wer" not in metric_name.lower())
        if got_better:
            improved[key] += 1
        else:
            regressed[key] += 1
    table += "```"
    return table
def extract_status_and_os(cell_value):
    """
    Extract status and OS versions from a cell, handling both HTML and plain text.

    Cells contain one or more entries separated by "<p>" tags; each entry is
    either a plain success line ("β  <OS> <version>") or a warning rendered
    as HTML containing "β οΈ".

    :param cell_value: Raw cell content (HTML fragment or plain text).
    :return: List of tuples [(status, os_version), ...]; empty for
        "Not Supported" cells or when no OS version can be found.
    """
    results = []
    cell_value = str(cell_value)
    # Plain-text marker: configuration was never tested.
    if cell_value == "Not Supported":
        return results
    os_pattern = re.compile(r"(iOS|iPadOS|macOS)\s+[\d.]+")
    # Split the cell into parts (first element and subsequent <p> elements)
    for part in cell_value.split("<p>"):
        # BUG FIX: the original used part.strip("</p>"), but str.strip()
        # treats its argument as a character SET, stripping any of
        # '<', '/', 'p', '>' from BOTH ends and potentially eating real
        # content. Remove only the literal closing tag.
        if part.endswith("</p>"):
            part = part[: -len("</p>")]
        if not part:
            continue
        if "β οΈ" in part:
            # Warning entries are HTML; parse to get the visible text, then
            # pull the OS version out of it.
            soup = BeautifulSoup(part, "html.parser")
            os_match = os_pattern.search(soup.get_text())
            if os_match:
                results.append(("β οΈ", os_match.group(0)))
        else:
            # For success cases, the OS version is directly in the text.
            os_match = os_pattern.search(part)
            if os_match:
                results.append(("β ", os_match.group(0)))
    return results
def escape_string(s: str) -> str:
    """Escape a string to be used as a value in JSON.

    Backslashes, double quotes, and CR/LF are replaced with their escaped
    forms in a single character-by-character pass.
    """
    escape_map = {
        "\\": "\\\\",
        '"': '\\"',
        "\n": "\\n",
        "\r": "\\r",
    }
    return "".join(escape_map.get(ch, ch) for ch in s)
def analyze_support_changes(prev_csv, curr_csv):
    """Diff device-support matrices between two CSV snapshots.

    Each CSV has models as rows (first column, used as the index) and
    devices as the remaining columns; cells encode per-OS pass/warning
    status parsed by extract_status_and_os().

    :param prev_csv: Path to the previous support matrix CSV.
    :param curr_csv: Path to the current support matrix CSV.
    :return: Tuple (fixed_errors, new_errors, new_configs, needs_alert):
        the first three are lists of (model, device, os_version) tuples;
        needs_alert is True when the current device count dropped below
        90% of the previous one.
    """
    # Read CSV files, using the first column (model name) as the index.
    prev_df = pd.read_csv(prev_csv)
    prev_df.set_index(prev_df.columns[0], inplace=True)
    curr_df = pd.read_csv(curr_csv)
    curr_df.set_index(curr_df.columns[0], inplace=True)
    # BUG FIX: after set_index() the model column is removed from .columns,
    # so every remaining column is a device. The original still sliced
    # columns[1:] (and started the cell loops at col_idx 1), silently
    # dropping the first device from the comparison.
    prev_devices = sorted(prev_df.columns)
    curr_devices = sorted(curr_df.columns)
    # Alert if less than 90% of previous devices are still covered.
    device_ratio = len(curr_devices) / len(prev_devices) if prev_devices else 1
    needs_alert = device_ratio < 0.9

    def _status_map(df):
        """Flatten a support matrix into {(model, device, os_version): status}."""
        status = {}
        for model, row in df.iterrows():
            for device, cell_value in row.items():
                for st, os_version in extract_status_and_os(cell_value):
                    status[(model, device, os_version)] = st
        return status

    prev_status = _status_map(prev_df)
    curr_status = _status_map(curr_df)
    # Configurations present now but absent from the previous snapshot.
    new_configs = [cfg for cfg in curr_status if cfg not in prev_status]
    # Status flips among configurations that exist in both datasets.
    fixed_errors = []
    new_errors = []
    for config in set(prev_status) & set(curr_status):
        if prev_status[config] == "β οΈ" and curr_status[config] == "β ":
            fixed_errors.append(config)
        elif prev_status[config] == "β " and curr_status[config] == "β οΈ":
            new_errors.append(config)
    return fixed_errors, new_errors, new_configs, needs_alert
def generate_report():
    """Build the WhisperKit dataset update report and emit it for CI.

    Compares the previous snapshot (dashboard_data/) against the current
    one (report_data/): per-(model, device, os) performance metrics,
    release lists, and device-support changes. Writes two values to
    $GITHUB_OUTPUT:

    * ``slack_message_payload`` -- Slack Block Kit JSON for the report.
    * ``performance_message``   -- escaped per-configuration metrics text.
    """
    # Load current version data first to get the commit hashes to compare.
    with open("report_data/version.json", "r") as f:
        curr_version = json.load(f)
    # The releases array is ordered oldest -> newest; the last two entries
    # are the (previous, current) commit pair.
    releases = curr_version.get("releases", [])
    if len(releases) >= 2:
        curr_commit_hash = releases[-1]  # latest commit
        prev_commit_hash = releases[-2]  # previous commit
    else:
        curr_commit_hash = releases[-1] if releases else ""
        prev_commit_hash = ""
    # Load and filter performance data by commit hash. Convention used
    # throughout this file: dashboard_data/ is the previous snapshot and
    # report_data/ is the current one.
    prev_perf_data = read_json_line_by_line(
        "dashboard_data/performance_data.json", commit_hash=prev_commit_hash
    )
    curr_perf_data = read_json_line_by_line(
        "report_data/performance_data.json", commit_hash=curr_commit_hash
    )
    prev_dict = {(d["model"], d["device"], d["os"]): d for d in prev_perf_data}
    curr_dict = {(d["model"], d["device"], d["os"]): d for d in curr_perf_data}
    common_configs = set(curr_dict.keys()) & set(prev_dict.keys())
    # Load previous version data (the current one was loaded above).
    with open("dashboard_data/version.json", "r") as f:
        prev_version = json.load(f)
    prev_releases = set(prev_version.get("releases", []))
    curr_releases = set(curr_version.get("releases", []))
    new_releases = curr_releases - prev_releases
    removed_releases = prev_releases - curr_releases
    # Per-metric improvement/regression counters, mutated in place by
    # format_metrics_table().
    improved_metrics = {"speed": 0, "tokens_per_second": 0, "average_wer": 0, "qoi": 0}
    regressed_metrics = {"speed": 0, "tokens_per_second": 0, "average_wer": 0, "qoi": 0}
    new_data_points = len(set(curr_dict.keys()) - set(prev_dict.keys()))
    # Analyze support changes.
    # BUG FIX: the arguments were swapped. analyze_support_changes takes
    # (prev_csv, curr_csv) and dashboard_data/ is the *previous* snapshot,
    # matching the performance/version loading above.
    fixed_errors, new_errors, new_configs, needs_alert = analyze_support_changes(
        "dashboard_data/support_data.csv", "report_data/support_data.csv"
    )
    # Create Slack blocks.
    # NOTE(review): "%-d" (day without zero padding) is a glibc extension;
    # it fails on Windows strftime. Fine for Linux CI runners.
    current_time = datetime.now().strftime("%B %-d, %Y %H:%M:%S")
    prev_release_tag = (
        prev_version["versions"][-1] if prev_version["versions"] else "N/A"
    )
    # BUG FIX: guard the current tag like the previous one -- the original
    # raised IndexError when curr_version["versions"] was empty.
    curr_release_tag = (
        curr_version["versions"][-1] if curr_version["versions"] else "N/A"
    )
    slack_blocks = {
        "blocks": [
            {
                "type": "header",
                "text": {
                    "type": "plain_text",
                    "text": "π WhisperKit Dataset Update Report π",
                    "emoji": True,
                },
            },
            {
                "type": "context",
                "elements": [{"text": f"*{current_time}*", "type": "mrkdwn"}],
            },
            {"type": "divider"},
            {
                "type": "section",
                "text": {"type": "mrkdwn", "text": "βΉοΈ *CURRENT VERSION INFO* βΉοΈ"},
            },
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": f"β’ *Last Modified:* `{format_datetime(curr_version['last_modified'])}`",
                },
            },
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": f"β’ *Dataset SHA:* `{curr_version['sha']}`",
                },
            },
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": f"β’ *Current Releases:* {', '.join(f'`{r}`' for r in curr_version['releases'])}",
                },
            },
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": f"β’ *Current Release Tag:* `{curr_release_tag}`",
                },
            },
            {"type": "divider"},
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": "π *SUMMARY OF PERFORMANCE UPDATES* π",
                },
            },
        ]
    }
    # Add release information.
    slack_blocks["blocks"].extend(
        [
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": f"β’ *Added Releases:* {', '.join(sorted(new_releases)) if new_releases else 'None'}",
                },
            },
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": f"β’ *Removed Releases:* {', '.join(sorted(removed_releases)) if removed_releases else 'None'}",
                },
            },
        ]
    )
    if prev_release_tag != curr_release_tag:
        slack_blocks["blocks"].append(
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": f"β’ *Release Tag Change:* `{prev_release_tag}` β `{curr_release_tag}`",
                },
            }
        )
    # Spacer sections ("\n") keep visual separation in the Slack rendering.
    slack_blocks["blocks"].extend(
        [
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": "\n",
                },
            },
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": f"β’ *New Data Points:* `{new_data_points}` new configurations",
                },
            },
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": "\n",
                },
            },
        ]
    )
    # Create performance text as a single mrkdwn string.
    # BUG FIX: initialize unconditionally -- the original only assigned it
    # inside the `if common_configs:` branch, causing a NameError at the
    # GITHUB_OUTPUT write below when there were no common configurations.
    performance_text = ""
    if common_configs:
        performance_text = "π‘ *Performance Updates* π‘\n\n"
        # Group by model for better organization.
        models = sorted(set(model for model, _, _ in common_configs))
        for model in models:
            model_configs = sorted([cfg for cfg in common_configs if cfg[0] == model])
            for config in model_configs:
                device_info = f"*{model}* ({config[2]})"
                if not has_changes(config, prev_dict, curr_dict):
                    # If no changes, just add the model with a checkmark.
                    performance_text += f"{device_info} β \n\n"
                else:
                    # If there are changes, show the metrics.
                    performance_text += f"{device_info}\n"
                    table = format_metrics_table(
                        config, prev_dict, curr_dict, improved_metrics, regressed_metrics
                    )
                    performance_text += table
                    performance_text += "\n\n"
    # Add metrics summary (counters were filled while building the tables).
    for metric_name, key in [
        ("Speed", "speed"),
        ("Tok/s", "tokens_per_second"),
        ("WER", "average_wer"),
        ("QoI", "qoi"),
    ]:
        slack_blocks["blocks"].append(
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": f"β’ *{metric_name}:* `{improved_metrics[key]}` improved, `{regressed_metrics[key]}` regressed",
                },
            }
        )
    # Add support changes section.
    if fixed_errors or new_errors or new_configs:
        slack_blocks["blocks"].extend(
            [
                {"type": "divider"},
                {
                    "type": "section",
                    "text": {"type": "mrkdwn", "text": "π± *DEVICE SUPPORT CHANGES* π±"},
                },
            ]
        )
        if fixed_errors:
            slack_blocks["blocks"].append(
                {
                    "type": "section",
                    "text": {
                        "type": "mrkdwn",
                        "text": "*Successful Configurations That Override Previous Failures*",
                    },
                }
            )
            for model, device, os_version in sorted(fixed_errors):
                slack_blocks["blocks"].append(
                    {
                        "type": "section",
                        "text": {
                            "type": "mrkdwn",
                            "text": f"β’ {model} on {device} ({os_version})",
                        },
                    }
                )
        if new_errors:
            slack_blocks["blocks"].append(
                {
                    "type": "section",
                    "text": {
                        "type": "mrkdwn",
                        "text": "*Failed Configurations That Override Previous Successes*",
                    },
                }
            )
            for model, device, os_version in sorted(new_errors):
                slack_blocks["blocks"].append(
                    {
                        "type": "section",
                        "text": {
                            "type": "mrkdwn",
                            "text": f"β’ {model} on {device} ({os_version})",
                        },
                    }
                )
        if new_configs:
            slack_blocks["blocks"].append(
                {
                    "type": "section",
                    "text": {
                        "type": "mrkdwn",
                        "text": "*Newly Tested Configurations*",
                    },
                }
            )
            for model, device, os_version in sorted(new_configs):
                slack_blocks["blocks"].append(
                    {
                        "type": "section",
                        "text": {
                            "type": "mrkdwn",
                            "text": f"β’ {model} on {device} ({os_version})",
                        },
                    }
                )
    # Add alert if there was a significant decrease in device count.
    if needs_alert:
        slack_blocks["blocks"].append(
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": "β οΈ *ALERT:* Current device count is less than 90% of previous version's device count, test on more devices before updating the benchmark website!",
                },
            }
        )
    # Write both outputs to GITHUB_OUTPUT (no-op outside GitHub Actions).
    github_output = os.getenv("GITHUB_OUTPUT")
    if github_output:
        with open(github_output, "a") as f:
            # Multiline value uses the heredoc syntax GitHub Actions expects.
            f.write("slack_message_payload<<EOF\n")
            json.dump(slack_blocks, f, indent=2)
            f.write("\nEOF\n")
            escaped_text = escape_string(performance_text)
            print(f"performance_message={escaped_text}", file=f)
# Script entry point: build the report and emit CI outputs when run directly.
if __name__ == "__main__":
    generate_report()