Spaces:
Sleeping
Sleeping
mj-new
committed on
Commit
·
4eee292
1
Parent(s):
d9c6196
Replaced no-info with None values
Browse files
- __pycache__/utils.cpython-310.pyc +0 -0
- app.py +4 -2
- requirements.txt +2 -1
- utils.py +36 -7
__pycache__/utils.cpython-310.pyc
CHANGED
|
Binary files a/__pycache__/utils.cpython-310.pyc and b/__pycache__/utils.cpython-310.pyc differ
|
|
|
app.py
CHANGED
|
@@ -6,6 +6,7 @@ from contants import INFO_CATALOG, CITATION_CATALOG, HOWTO_CATALOG,INFO_BENCHMAR
|
|
| 6 |
from utils import BASE_SUMMARY_METRICS
|
| 7 |
from utils import load_data_catalog, load_data_taxonomy, load_bench_catalog, load_bench_taxonomy
|
| 8 |
from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics
|
|
|
|
| 9 |
|
| 10 |
import matplotlib.pyplot as plt
|
| 11 |
import seaborn as sns
|
|
@@ -66,6 +67,7 @@ with data_survey:
|
|
| 66 |
df_summary_metrics = catalog_summary_statistics(df_data_cat)
|
| 67 |
|
| 68 |
df_basic_stats = df_summary_metrics.loc[BASE_SUMMARY_METRICS[0:5]]
|
|
|
|
| 69 |
st.dataframe(df_basic_stats, use_container_width=False)
|
| 70 |
|
| 71 |
st.header("Speech data available across Polish ASR speech datasets")
|
|
@@ -80,9 +82,9 @@ with data_survey:
|
|
| 80 |
# Display distribution of datasets created per year
|
| 81 |
st.header("Polish ASR speech datasets created in 1997-2023")
|
| 82 |
col_groupby = ['Creation year']
|
| 83 |
-
|
| 84 |
|
| 85 |
-
st.dataframe(
|
| 86 |
|
| 87 |
st.header("Institutions contributing Polish ASR speech dataset")
|
| 88 |
col_groupby = ['Publisher']
|
|
|
|
| 6 |
from utils import BASE_SUMMARY_METRICS
|
| 7 |
from utils import load_data_catalog, load_data_taxonomy, load_bench_catalog, load_bench_taxonomy
|
| 8 |
from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics
|
| 9 |
+
from utils import left_align, right_align
|
| 10 |
|
| 11 |
import matplotlib.pyplot as plt
|
| 12 |
import seaborn as sns
|
|
|
|
| 67 |
df_summary_metrics = catalog_summary_statistics(df_data_cat)
|
| 68 |
|
| 69 |
df_basic_stats = df_summary_metrics.loc[BASE_SUMMARY_METRICS[0:5]]
|
| 70 |
+
|
| 71 |
st.dataframe(df_basic_stats, use_container_width=False)
|
| 72 |
|
| 73 |
st.header("Speech data available across Polish ASR speech datasets")
|
|
|
|
| 82 |
# Display distribution of datasets created per year
|
| 83 |
st.header("Polish ASR speech datasets created in 1997-2023")
|
| 84 |
col_groupby = ['Creation year']
|
| 85 |
+
df_datasets_per_year = datasets_count_and_size(df_data_cat, col_groupby, col_sort=col_groupby, col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
|
| 86 |
|
| 87 |
+
st.dataframe(df_datasets_per_year, use_container_width=False)
|
| 88 |
|
| 89 |
st.header("Institutions contributing Polish ASR speech dataset")
|
| 90 |
col_groupby = ['Publisher']
|
requirements.txt
CHANGED
|
@@ -1,3 +1,4 @@
|
|
| 1 |
seaborn
|
| 2 |
matplotlib
|
| 3 |
-
pandas
|
|
|
|
|
|
| 1 |
seaborn
|
| 2 |
matplotlib
|
| 3 |
+
pandas
|
| 4 |
+
librosa
|
utils.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import requests
|
| 2 |
import pandas as pd
|
| 3 |
import streamlit as st
|
|
|
|
| 4 |
|
| 5 |
catalog_last_update_date = pd.to_datetime('today').strftime('%Y-%m-%d')
|
| 6 |
# TODO - extract from the catalog name
|
|
@@ -30,13 +31,14 @@ def download_tsv_from_google_sheet(sheet_url):
|
|
| 30 |
|
| 31 |
# Send a GET request to download the TSV file
|
| 32 |
response = requests.get(tsv_url)
|
| 33 |
-
|
|
|
|
| 34 |
# Check if the request was successful
|
| 35 |
if response.status_code == 200:
|
| 36 |
# Read the TSV content into a pandas DataFrame
|
| 37 |
from io import StringIO
|
| 38 |
tsv_content = StringIO(response.text)
|
| 39 |
-
df = pd.read_csv(tsv_content, sep='\t')
|
| 40 |
return df
|
| 41 |
else:
|
| 42 |
print("Failed to download the TSV file.")
|
|
@@ -71,6 +73,22 @@ def load_bench_taxonomy():
|
|
| 71 |
df_taxonomy = download_tsv_from_google_sheet(taxonomy_url)
|
| 72 |
return(df_taxonomy)
|
| 73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
def datasets_count_and_size(df_cat, col_groupby, col_sort=None, col_percent=None, col_sum=['Size audio transcribed [hours]'], col_count=['Dataset ID']):
|
| 76 |
"""
|
|
@@ -144,11 +162,13 @@ def datasets_count_and_size(df_cat, col_groupby, col_sort=None, col_percent=None
|
|
| 144 |
# Sort by the provided column col_sort
|
| 145 |
col_sort = col_groupby if col_sort is None else col_sort
|
| 146 |
summary.sort_values(by=col_sort, ascending=False, inplace=True)
|
| 147 |
-
|
| 148 |
-
|
| 149 |
for col in col_sum:
|
| 150 |
-
|
| 151 |
-
|
|
|
|
|
|
|
| 152 |
return summary
|
| 153 |
|
| 154 |
|
|
@@ -210,6 +230,9 @@ def metadata_coverage(df_cat, df_cat_available_free, df_cat_available_paid):
|
|
| 210 |
df_meta_all_pivot = df_meta_all_pivot.pivot(index='Metadata', columns='Type', values=[col_name_count, col_name_sum_size, col_name_percent])
|
| 211 |
df_meta_all_pivot[col_name_count]=df_meta_all_pivot[col_name_count].astype(int)
|
| 212 |
|
|
|
|
|
|
|
|
|
|
| 213 |
return(df_meta_all_flat, df_meta_all_pivot)
|
| 214 |
|
| 215 |
|
|
@@ -289,4 +312,10 @@ def catalog_summary_statistics(df_cat):
|
|
| 289 |
metrics_df = pd.DataFrame(metrics_dict)
|
| 290 |
metrics_df.reset_index(drop=True, inplace=True)
|
| 291 |
metrics_df.set_index("Metric", inplace=True)
|
| 292 |
-
return(metrics_df)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import requests
|
| 2 |
import pandas as pd
|
| 3 |
import streamlit as st
|
| 4 |
+
import numpy as np
|
| 5 |
|
| 6 |
catalog_last_update_date = pd.to_datetime('today').strftime('%Y-%m-%d')
|
| 7 |
# TODO - extract from the catalog name
|
|
|
|
| 31 |
|
| 32 |
# Send a GET request to download the TSV file
|
| 33 |
response = requests.get(tsv_url)
|
| 34 |
+
response.encoding = 'utf-8'
|
| 35 |
+
|
| 36 |
# Check if the request was successful
|
| 37 |
if response.status_code == 200:
|
| 38 |
# Read the TSV content into a pandas DataFrame
|
| 39 |
from io import StringIO
|
| 40 |
tsv_content = StringIO(response.text)
|
| 41 |
+
df = pd.read_csv(tsv_content, sep='\t', encoding='utf-8')
|
| 42 |
return df
|
| 43 |
else:
|
| 44 |
print("Failed to download the TSV file.")
|
|
|
|
| 73 |
df_taxonomy = download_tsv_from_google_sheet(taxonomy_url)
|
| 74 |
return(df_taxonomy)
|
| 75 |
|
| 76 |
+
def style_floats(val):
|
| 77 |
+
"""
|
| 78 |
+
Converts float to int if the fractional part is zero, formats floats with two decimal places,
|
| 79 |
+
and leaves strings unchanged.
|
| 80 |
+
"""
|
| 81 |
+
# Check if value is a float and if it can be converted to an int without loss
|
| 82 |
+
if isinstance(val, float):
|
| 83 |
+
if val % 1 == 0:
|
| 84 |
+
return f"{int(val)}" # Convert float with no fractional part to int
|
| 85 |
+
else:
|
| 86 |
+
return f"{val:.2f}" # Format floats with two decimal places
|
| 87 |
+
elif isinstance(val, int):
|
| 88 |
+
return f"{val}" # Handle pure integers separately (though likely unnecessary)
|
| 89 |
+
else:
|
| 90 |
+
return val # Return strings unchanged
|
| 91 |
+
|
| 92 |
|
| 93 |
def datasets_count_and_size(df_cat, col_groupby, col_sort=None, col_percent=None, col_sum=['Size audio transcribed [hours]'], col_count=['Dataset ID']):
|
| 94 |
"""
|
|
|
|
| 162 |
# Sort by the provided column col_sort
|
| 163 |
col_sort = col_groupby if col_sort is None else col_sort
|
| 164 |
summary.sort_values(by=col_sort, ascending=False, inplace=True)
|
| 165 |
+
|
| 166 |
+
print(col_sum)
|
| 167 |
for col in col_sum:
|
| 168 |
+
print(col)
|
| 169 |
+
#summary[col] = summary[col].apply(lambda x: str(int(x)) if float(x).is_integer() else str(x))
|
| 170 |
+
summary[col] = summary[col].replace(0, np.nan)
|
| 171 |
+
|
| 172 |
return summary
|
| 173 |
|
| 174 |
|
|
|
|
| 230 |
df_meta_all_pivot = df_meta_all_pivot.pivot(index='Metadata', columns='Type', values=[col_name_count, col_name_sum_size, col_name_percent])
|
| 231 |
df_meta_all_pivot[col_name_count]=df_meta_all_pivot[col_name_count].astype(int)
|
| 232 |
|
| 233 |
+
#df_meta_all_pivot_styled = df_meta_all_pivot.style.map(style_floats)
|
| 234 |
+
#df_meta_all_flat_styled = df_meta_all_flat.style.map(style_floats)
|
| 235 |
+
|
| 236 |
return(df_meta_all_flat, df_meta_all_pivot)
|
| 237 |
|
| 238 |
|
|
|
|
| 312 |
metrics_df = pd.DataFrame(metrics_dict)
|
| 313 |
metrics_df.reset_index(drop=True, inplace=True)
|
| 314 |
metrics_df.set_index("Metric", inplace=True)
|
| 315 |
+
return(metrics_df)
|
| 316 |
+
|
| 317 |
+
def right_align(s, props='text-align: right;'):
|
| 318 |
+
return props
|
| 319 |
+
|
| 320 |
+
def left_align(s, props='text-align: left;'):
|
| 321 |
+
return props
|