Maria Castellanos
committed on
Commit
·
b52c947
1
Parent(s):
abd9b44
order and duplicate fix
Browse files- intermediate_leaderboard.py +11 -11
- utils.py +4 -2
intermediate_leaderboard.py
CHANGED
|
@@ -13,7 +13,7 @@ import pandas as pd
|
|
| 13 |
def validate_hf_username(username):
|
| 14 |
username = str(username).strip()
|
| 15 |
hf_url = f"https://huggingface.co/{username}"
|
| 16 |
-
return check_page_exists(hf_url, delay=1)
|
| 17 |
# return True # For testing purposes, assume all usernames are valid
|
| 18 |
|
| 19 |
def validate_model_details(tag):
|
|
@@ -37,6 +37,8 @@ def make_intermediate_lb():
|
|
| 37 |
test_repo=results_repo_test
|
| 38 |
)
|
| 39 |
|
|
|
|
|
|
|
| 40 |
|
| 41 |
# HF username validation
|
| 42 |
hf_usernames = df_latest_raw["hf_username"].unique()
|
|
@@ -53,11 +55,12 @@ def make_intermediate_lb():
|
|
| 53 |
# make sure to only keep the latest submission per user for the 'Average' endpoint
|
| 54 |
df_latest_raw["submission_time"] = pd.to_datetime(df_latest_raw["submission_time"])
|
| 55 |
df_latest_raw = df_latest_raw.query("Endpoint == 'Average'")
|
| 56 |
-
df_latest_raw['latest_time_per_user'] = df_latest_raw.groupby('
|
| 57 |
latest_submissions_df = df_latest_raw[df_latest_raw['submission_time'] == df_latest_raw['latest_time_per_user']].copy()
|
| 58 |
-
|
|
|
|
| 59 |
latest_submissions_df = latest_submissions_df.sort_values(
|
| 60 |
-
['
|
| 61 |
).reset_index(drop=True)
|
| 62 |
|
| 63 |
# Get the unique users in the order of their first appearance
|
|
@@ -75,9 +78,6 @@ def make_intermediate_lb():
|
|
| 75 |
# Apply the mapping to create a new column with prefixed usernames
|
| 76 |
latest_submissions_df['user'] = latest_submissions_df['user'].map(user_mapping)
|
| 77 |
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
# Perform Tukey's HSD test
|
| 82 |
tukey = pairwise_tukeyhsd(endog=latest_submissions_df['RAE'], groups=latest_submissions_df['user'], alpha=0.05)
|
| 83 |
tukey_df = pd.DataFrame(data=tukey._results_table.data[1:],
|
|
@@ -89,7 +89,6 @@ def make_intermediate_lb():
|
|
| 89 |
cld_df = pd.DataFrame(cld_dict.items(),columns=["group","letter"]).sort_values("group")
|
| 90 |
cld_df.letter = [",".join(x) for x in cld_df.letter]
|
| 91 |
cld_df["user"] = cld_df.group
|
| 92 |
-
cld_df["user_fixed"] = cld_df.group.str.split("___").str[1]
|
| 93 |
|
| 94 |
# clean up CLD letters for extended alphabet (i.e with @ symbols)
|
| 95 |
def clean_up(ser):
|
|
@@ -113,14 +112,15 @@ def make_intermediate_lb():
|
|
| 113 |
metric_stats[f"{metric}_display"] = metric_stats.apply(
|
| 114 |
lambda row: f"{row[f'{metric}_mean']:.4f} ± {row[f'{metric}_std']:.4f}", axis=1
|
| 115 |
)
|
| 116 |
-
cld_df =
|
| 117 |
|
| 118 |
# re-sort by RAE mean, lowest is best
|
| 119 |
cld_df = cld_df.sort_values(by='RAE_mean', ascending=True).reset_index(drop=True)
|
|
|
|
| 120 |
|
|
|
|
|
|
|
| 121 |
|
| 122 |
-
cld_subset = cld_df[['user_fixed', 'fixed_letter'] + [f'{metric}_display' for metric in METRICS]]
|
| 123 |
-
cld_subset = cld_subset.rename(columns={'user_fixed': 'user', 'fixed_letter': 'CLD'})
|
| 124 |
print(cld_subset.head())
|
| 125 |
cld_subset.to_csv("leaderboard_cld_results.csv", index=False)
|
| 126 |
|
|
|
|
| 13 |
def validate_hf_username(username):
|
| 14 |
username = str(username).strip()
|
| 15 |
hf_url = f"https://huggingface.co/{username}"
|
| 16 |
+
return check_page_exists(hf_url, delay=1, max_retries=10)
|
| 17 |
# return True # For testing purposes, assume all usernames are valid
|
| 18 |
|
| 19 |
def validate_model_details(tag):
|
|
|
|
| 37 |
test_repo=results_repo_test
|
| 38 |
)
|
| 39 |
|
| 40 |
+
# Make all usernames lowercase
|
| 41 |
+
df_latest_raw["hf_username"] = df_latest_raw["hf_username"].str.lower()
|
| 42 |
|
| 43 |
# HF username validation
|
| 44 |
hf_usernames = df_latest_raw["hf_username"].unique()
|
|
|
|
| 55 |
# make sure to only keep the latest submission per user for the 'Average' endpoint
|
| 56 |
df_latest_raw["submission_time"] = pd.to_datetime(df_latest_raw["submission_time"])
|
| 57 |
df_latest_raw = df_latest_raw.query("Endpoint == 'Average'")
|
| 58 |
+
df_latest_raw['latest_time_per_user'] = df_latest_raw.groupby('hf_username')['submission_time'].transform('max')
|
| 59 |
latest_submissions_df = df_latest_raw[df_latest_raw['submission_time'] == df_latest_raw['latest_time_per_user']].copy()
|
| 60 |
+
# Fix to order by the mean RAE and not the RAE of all samples (slight mismatch for some users)
|
| 61 |
+
latest_submissions_df['mean_RAE'] = latest_submissions_df.groupby('hf_username')['RAE'].transform('mean')
|
| 62 |
latest_submissions_df = latest_submissions_df.sort_values(
|
| 63 |
+
by=['mean_RAE', 'Sample'], ascending=True
|
| 64 |
).reset_index(drop=True)
|
| 65 |
|
| 66 |
# Get the unique users in the order of their first appearance
|
|
|
|
| 78 |
# Apply the mapping to create a new column with prefixed usernames
|
| 79 |
latest_submissions_df['user'] = latest_submissions_df['user'].map(user_mapping)
|
| 80 |
|
|
|
|
|
|
|
|
|
|
| 81 |
# Perform Tukey's HSD test
|
| 82 |
tukey = pairwise_tukeyhsd(endog=latest_submissions_df['RAE'], groups=latest_submissions_df['user'], alpha=0.05)
|
| 83 |
tukey_df = pd.DataFrame(data=tukey._results_table.data[1:],
|
|
|
|
| 89 |
cld_df = pd.DataFrame(cld_dict.items(),columns=["group","letter"]).sort_values("group")
|
| 90 |
cld_df.letter = [",".join(x) for x in cld_df.letter]
|
| 91 |
cld_df["user"] = cld_df.group
|
|
|
|
| 92 |
|
| 93 |
# clean up CLD letters for extended alphabet (i.e with @ symbols)
|
| 94 |
def clean_up(ser):
|
|
|
|
| 112 |
metric_stats[f"{metric}_display"] = metric_stats.apply(
|
| 113 |
lambda row: f"{row[f'{metric}_mean']:.4f} ± {row[f'{metric}_std']:.4f}", axis=1
|
| 114 |
)
|
| 115 |
+
cld_df = metric_stats[['user', f'{metric}_mean', f'{metric}_std', f'{metric}_display']].merge(cld_df, on='user', how='left')
|
| 116 |
|
| 117 |
# re-sort by RAE mean, lowest is best
|
| 118 |
cld_df = cld_df.sort_values(by='RAE_mean', ascending=True).reset_index(drop=True)
|
| 119 |
+
cld_df['user'] = cld_df['user'].str.split('___').str[1]
|
| 120 |
|
| 121 |
+
cld_subset = cld_df[['user', 'fixed_letter'] + [f'{metric}_display' for metric in METRICS]]
|
| 122 |
+
cld_subset = cld_subset.rename(columns={'user': 'user', 'fixed_letter': 'CLD'})
|
| 123 |
|
|
|
|
|
|
|
| 124 |
print(cld_subset.head())
|
| 125 |
cld_subset.to_csv("leaderboard_cld_results.csv", index=False)
|
| 126 |
|
utils.py
CHANGED
|
@@ -43,8 +43,10 @@ def check_page_exists(url: str, delay=0.2, max_retries=3, current_retries=0):
|
|
| 43 |
# Check for Rate Limit Error and retry if under the limit
|
| 44 |
if response.status_code == 429:
|
| 45 |
if current_retries < max_retries:
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
| 48 |
# Recurse with an incremented retry counter
|
| 49 |
return check_page_exists(safe_url, delay=delay, max_retries=max_retries, current_retries=current_retries + 1)
|
| 50 |
else:
|
|
|
|
| 43 |
# Check for Rate Limit Error and retry if under the limit
|
| 44 |
if response.status_code == 429:
|
| 45 |
if current_retries < max_retries:
|
| 46 |
+
# Make wait time exponential
|
| 47 |
+
wait_time = 5 * (2 ** current_retries)
|
| 48 |
+
print(f"Warning: Rate limit hit on {safe_url}. Attempt {current_retries + 1}/{max_retries}. Waiting for {wait_time} seconds...")
|
| 49 |
+
time.sleep(wait_time)
|
| 50 |
# Recurse with an incremented retry counter
|
| 51 |
return check_page_exists(safe_url, delay=delay, max_retries=max_retries, current_retries=current_retries + 1)
|
| 52 |
else:
|