Spaces:

ruanchaves
/

napolab

Sleeping

App Files Files Community

napolab / extract_portuguese_leaderboard.py

ruanchaves

Upload 14 files

2c482cc verified 5 months ago

raw

history blame contribute delete

6.1 kB

	#!/usr/bin/env python3
	"""
	Script to extract data from JSON files in a repository folder
	and save it as a CSV file for import into the benchmark.
	"""

	import pandas as pd
	import json
	import os
	import sys
	import argparse
	from pathlib import Path

	def is_valid_json_file(file_path):
	    """
	    Check if a file is a valid JSON file containing a dict.

	    Any failure to open, decode, or parse the file is reported as
	    "not valid" rather than raised, so a single unreadable entry (for
	    example a directory whose name ends in ``.json`` or a file without
	    read permission) cannot abort a scan over many files.

	    Args:
	        file_path (str | os.PathLike): Path to the JSON file

	    Returns:
	        bool: True if the file parses as JSON and its top-level value
	            is a dict, False otherwise
	    """
	    try:
	        with open(file_path, 'r', encoding='utf-8') as f:
	            data = json.load(f)
	    except (OSError, ValueError):
	        # OSError covers missing files, directories and permission errors;
	        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
	        return False
	    return isinstance(data, dict)

	def find_json_files(repo_path):
	    """
	    Recursively find all JSON files in the repository folder.

	    Only regular files matching ``*.json`` are checked; directories that
	    happen to carry a ``.json`` suffix are skipped. Matches are visited in
	    sorted path order so repeated runs over the same tree return the
	    files in the same sequence.

	    Args:
	        repo_path (str): Path to the repository folder

	    Returns:
	        list: List of paths (pathlib.Path) to valid JSON files. Empty if
	            the path does not exist, is not a directory, or holds no
	            valid JSON files.
	    """
	    repo_path = Path(repo_path)

	    if not repo_path.exists():
	        print(f"Error: Repository path '{repo_path}' does not exist.")
	        return []

	    if not repo_path.is_dir():
	        print(f"Error: Repository path '{repo_path}' is not a directory.")
	        return []

	    print(f"Scanning repository: {repo_path}")

	    json_files = []
	    # rglob yields entries in filesystem order, which varies between
	    # machines; sorting keeps the downstream row order reproducible.
	    for file_path in sorted(repo_path.rglob("*.json")):
	        # The glob pattern matches directories too, so filter them out
	        # before trying to open the entry as a file.
	        if file_path.is_file() and is_valid_json_file(file_path):
	            json_files.append(file_path)
	            print(f"Found valid JSON file: {file_path}")

	    print(f"Total valid JSON files found: {len(json_files)}")
	    return json_files

	def extract_data_from_json(json_file_path):
	    """
	    Build one leaderboard row from a single JSON result file.

	    The file must contain both a 'config_general' and a 'results' entry.
	    Model fields are read from 'config_general' and the four benchmark
	    scores from 'results' -> 'all_grouped'; absent fields fall back to
	    '' / False / 0 for the model fields and 0.0 for each score.

	    Args:
	        json_file_path (Path): Path to the JSON file

	    Returns:
	        dict or None: The row, or None when a required entry is missing
	            or any error occurs while reading the file (the error is
	            printed).
	    """
	    metric_names = ('assin2_rte', 'assin2_sts', 'faquad_nli', 'hatebr_offensive')

	    try:
	        with open(json_file_path, 'r', encoding='utf-8') as handle:
	            payload = json.load(handle)

	        # Both top-level sections are mandatory.
	        if not ('config_general' in payload and 'results' in payload):
	            return None

	        general = payload['config_general']
	        outcome = payload['results']

	        # Model information first, in the column order of the final table.
	        record = {
	            'json_file': str(json_file_path),
	            'model_name': general.get('model_name', ''),
	            'model_private': general.get('model_private', False),
	            'model_num_parameters': general.get('model_num_parameters', 0),
	        }

	        # Then the grouped benchmark scores.
	        grouped = outcome.get('all_grouped', {})
	        for metric in metric_names:
	            record[metric] = grouped.get(metric, 0.0)

	        return record

	    except Exception as exc:
	        print(f"Error processing {json_file_path}: {exc}")
	        return None

	def extract_portuguese_leaderboard(repo_path, output_file='portuguese_leaderboard.csv'):
	    """
	    Extract data from JSON files in the repository folder and save as CSV.

	    Progress, a preview of the first rows and summary statistics are
	    printed to stdout. Nothing is written when no valid JSON files are
	    found or when no rows could be extracted.

	    Args:
	        repo_path (str): Path to the repository folder
	        output_file (str): Destination path of the CSV file. Defaults to
	            'portuguese_leaderboard.csv' in the current working directory.
	    """
	    print("Scanning repository for JSON files...")

	    # Find all JSON files
	    json_files = find_json_files(repo_path)
	    if not json_files:
	        print("No valid JSON files found in the repository.")
	        return

	    # Collect one row per file that yields usable data
	    data = []
	    total = len(json_files)
	    for index, json_file in enumerate(json_files, start=1):
	        print(f"Processing file {index}/{total}: {json_file.name}")

	        row_data = extract_data_from_json(json_file)
	        if row_data:
	            data.append(row_data)

	        # Print progress every 10 files
	        if index % 10 == 0:
	            print(f" Processed {index} files...")

	    if not data:
	        print("No valid data extracted from JSON files.")
	        return

	    df = pd.DataFrame(data)
	    df.to_csv(output_file, index=False)

	    print(f"\nSuccessfully extracted {len(df)} models to {output_file}")

	    # Show first few entries as preview
	    print("\nFirst 5 entries:")
	    print(df.head().to_string(index=False))

	    # 'model_private' is copied verbatim from the JSON files, so the column
	    # may be object dtype (e.g. when a file stores null). Applying ``~`` to
	    # such a column yields wrong counts or raises, so coerce each value to
	    # a real bool and derive the public count by subtraction.
	    private_count = int(df['model_private'].map(bool).sum())
	    print("\nStatistics:")
	    print(f"Total models: {len(df)}")
	    print(f"Private models: {private_count}")
	    print(f"Public models: {len(df) - private_count}")

	    # Average scores
	    print("\nAverage scores:")
	    print(df[['assin2_rte', 'assin2_sts', 'faquad_nli', 'hatebr_offensive']].mean().round(2))

	    # DataFrame.info() prints its report itself and returns None, so it
	    # must not be wrapped in print() (that would emit a stray "None").
	    print("\nDataFrame info:")
	    df.info()

	def main():
	    """Command-line entry point: read the repository path and run the extraction.

	    Exits with status 1 if the extraction raises an exception.
	    """
	    cli = argparse.ArgumentParser(
	        description='Extract Portuguese LLM Leaderboard data from JSON files'
	    )
	    cli.add_argument(
	        'repo_path',
	        help='Path to the repository folder containing JSON files',
	    )
	    options = cli.parse_args()

	    # Banner shown before any work starts.
	    title = "Portuguese LLM Leaderboard Data Extractor"
	    rule = "=" * 50
	    print(title)
	    print(rule)

	    try:
	        extract_portuguese_leaderboard(options.repo_path)
	        print("\nExtraction completed successfully!")
	    except Exception as exc:
	        print(f"Error during extraction: {exc}")
	        sys.exit(1)

	# Run the command-line entry point only when this file is executed as a
	# script, not when it is imported as a module.
	if __name__ == "__main__":
	main()