Add Gender-Profession and GenBit plots
Browse files- app.py +35 -13
- scripts/genbit.py +54 -1
- scripts/gender_profession_bias.py +48 -16
app.py
CHANGED
|
@@ -7,6 +7,7 @@ from scripts.gender_profession_bias import *
|
|
| 7 |
from scripts.gender_distribution import *
|
| 8 |
|
| 9 |
from datasets import load_dataset as hf_load_dataset
|
|
|
|
| 10 |
|
| 11 |
MAX_THRESHOLD = 1000
|
| 12 |
METHODOLOGIES = json.load(open("config/methodologies.json", "r"))
|
|
@@ -123,6 +124,32 @@ def load_dataset(local_dataset, hf_dataset):
|
|
| 123 |
)
|
| 124 |
|
| 125 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
def import_dataset(dataset_sampling_method, dataset_sampling_size, dataset_column):
|
| 127 |
DATASET["sampling_method"] = dataset_sampling_method
|
| 128 |
DATASET["sampling_size"] = dataset_sampling_size
|
|
@@ -178,16 +205,16 @@ with BiasAware:
|
|
| 178 |
label="Dataset", file_types=["csv"], value=None, visible=True
|
| 179 |
)
|
| 180 |
hf_dataset = gr.Textbox(visible=False)
|
|
|
|
| 181 |
|
| 182 |
dataset_load_btn = gr.Button(visible=False)
|
|
|
|
| 183 |
|
| 184 |
dataset_sampling_method = gr.Radio(visible=False)
|
| 185 |
dataset_sampling_size = gr.Slider(visible=False)
|
| 186 |
dataset_column = gr.Radio(visible=False)
|
| 187 |
dataset_column_corpus = gr.Dataframe(visible=False)
|
| 188 |
|
| 189 |
-
dataset_import_btn = gr.Button(visible=False)
|
| 190 |
-
|
| 191 |
with gr.Column(scale=2):
|
| 192 |
methodology_title = gr.Markdown("## Methodology")
|
| 193 |
|
|
@@ -197,8 +224,6 @@ with BiasAware:
|
|
| 197 |
choices=METHODOLOGIES.keys(),
|
| 198 |
)
|
| 199 |
|
| 200 |
-
methodology_description = gr.Markdown(visible=False)
|
| 201 |
-
|
| 202 |
evaluation_btn = gr.Button(
|
| 203 |
value="Evaluate",
|
| 204 |
interactive=False,
|
|
@@ -206,6 +231,8 @@ with BiasAware:
|
|
| 206 |
visible=True,
|
| 207 |
)
|
| 208 |
|
|
|
|
|
|
|
| 209 |
with gr.Column(scale=2):
|
| 210 |
result_title = gr.Markdown("## Results")
|
| 211 |
|
|
@@ -230,7 +257,7 @@ with BiasAware:
|
|
| 230 |
gr.Textbox(
|
| 231 |
label="HuggingFace Hub",
|
| 232 |
placeholder="Search for a dataset",
|
| 233 |
-
value=
|
| 234 |
interactive=True,
|
| 235 |
visible=True,
|
| 236 |
)
|
|
@@ -268,19 +295,14 @@ with BiasAware:
|
|
| 268 |
)
|
| 269 |
|
| 270 |
hf_dataset.submit(
|
| 271 |
-
fn=
|
| 272 |
-
value=f"Load",
|
| 273 |
-
interactive=True,
|
| 274 |
-
variant="secondary",
|
| 275 |
-
visible=True,
|
| 276 |
-
),
|
| 277 |
inputs=[hf_dataset],
|
| 278 |
-
outputs=[dataset_load_btn],
|
| 279 |
)
|
| 280 |
|
| 281 |
dataset_load_btn.click(
|
| 282 |
fn=load_dataset,
|
| 283 |
-
inputs=[local_dataset,
|
| 284 |
outputs=[
|
| 285 |
dataset_sampling_method,
|
| 286 |
dataset_sampling_size,
|
|
|
|
| 7 |
from scripts.gender_distribution import *
|
| 8 |
|
| 9 |
from datasets import load_dataset as hf_load_dataset
|
| 10 |
+
from huggingface_hub import DatasetFilter, list_datasets
|
| 11 |
|
| 12 |
MAX_THRESHOLD = 1000
|
| 13 |
METHODOLOGIES = json.load(open("config/methodologies.json", "r"))
|
|
|
|
| 124 |
)
|
| 125 |
|
| 126 |
|
| 127 |
+
def show_hf_dataset_search_results(hf_dataset):
    """Search the HuggingFace Hub and build the widgets that list the matches.

    Args:
        hf_dataset: Search text; passed to ``DatasetFilter`` as
            ``dataset_name``.

    Returns:
        A ``(gr.Button, gr.Radio)`` tuple: the "Load" button and a radio
        group whose choices are the ids of at most 10 datasets matching the
        name with ``language="en"``. The first match is preselected; when
        there are no matches the radio has no choices and no selection.
    """
    choices = [
        dataset.id
        for dataset in list_datasets(
            filter=DatasetFilter(dataset_name=hf_dataset, language="en"), limit=10
        )
    ]

    # A search may return nothing; indexing choices[0] unconditionally raised
    # IndexError in that case, so fall back to "no selection".
    default_choice = choices[0] if choices else None

    return (
        gr.Button(
            value="Load",
            interactive=True,
            variant="secondary",
            visible=True,
        ),
        gr.Radio(
            label="HuggingFace Hub Search Results",
            info="Select the dataset to be imported",
            choices=choices,
            value=default_choice,
            interactive=True,
            visible=True,
        ),
    )
|
| 151 |
+
|
| 152 |
+
|
| 153 |
def import_dataset(dataset_sampling_method, dataset_sampling_size, dataset_column):
|
| 154 |
DATASET["sampling_method"] = dataset_sampling_method
|
| 155 |
DATASET["sampling_size"] = dataset_sampling_size
|
|
|
|
| 205 |
label="Dataset", file_types=["csv"], value=None, visible=True
|
| 206 |
)
|
| 207 |
hf_dataset = gr.Textbox(visible=False)
|
| 208 |
+
hf_dataset_search_results = gr.Radio(visible=False)
|
| 209 |
|
| 210 |
dataset_load_btn = gr.Button(visible=False)
|
| 211 |
+
dataset_import_btn = gr.Button(visible=False)
|
| 212 |
|
| 213 |
dataset_sampling_method = gr.Radio(visible=False)
|
| 214 |
dataset_sampling_size = gr.Slider(visible=False)
|
| 215 |
dataset_column = gr.Radio(visible=False)
|
| 216 |
dataset_column_corpus = gr.Dataframe(visible=False)
|
| 217 |
|
|
|
|
|
|
|
| 218 |
with gr.Column(scale=2):
|
| 219 |
methodology_title = gr.Markdown("## Methodology")
|
| 220 |
|
|
|
|
| 224 |
choices=METHODOLOGIES.keys(),
|
| 225 |
)
|
| 226 |
|
|
|
|
|
|
|
| 227 |
evaluation_btn = gr.Button(
|
| 228 |
value="Evaluate",
|
| 229 |
interactive=False,
|
|
|
|
| 231 |
visible=True,
|
| 232 |
)
|
| 233 |
|
| 234 |
+
methodology_description = gr.Markdown(visible=False)
|
| 235 |
+
|
| 236 |
with gr.Column(scale=2):
|
| 237 |
result_title = gr.Markdown("## Results")
|
| 238 |
|
|
|
|
| 257 |
gr.Textbox(
|
| 258 |
label="HuggingFace Hub",
|
| 259 |
placeholder="Search for a dataset",
|
| 260 |
+
value="amazon_multi",
|
| 261 |
interactive=True,
|
| 262 |
visible=True,
|
| 263 |
)
|
|
|
|
| 295 |
)
|
| 296 |
|
| 297 |
hf_dataset.submit(
|
| 298 |
+
fn=show_hf_dataset_search_results,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 299 |
inputs=[hf_dataset],
|
| 300 |
+
outputs=[dataset_load_btn, hf_dataset_search_results],
|
| 301 |
)
|
| 302 |
|
| 303 |
dataset_load_btn.click(
|
| 304 |
fn=load_dataset,
|
| 305 |
+
inputs=[local_dataset, hf_dataset_search_results],
|
| 306 |
outputs=[
|
| 307 |
dataset_sampling_method,
|
| 308 |
dataset_sampling_size,
|
scripts/genbit.py
CHANGED
|
@@ -1,5 +1,58 @@
|
|
| 1 |
from genbit.genbit_metrics import GenBitMetrics
|
| 2 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
|
| 5 |
def eval_genbit(data):
|
|
@@ -18,7 +71,7 @@ def eval_genbit(data):
|
|
| 18 |
.rename(columns={"index": "Metric", 0: "Value"})
|
| 19 |
)
|
| 20 |
|
| 21 |
-
result_plot =
|
| 22 |
result_conclusion = ""
|
| 23 |
|
| 24 |
return result_df, result_plot, result_conclusion
|
|
|
|
| 1 |
from genbit.genbit_metrics import GenBitMetrics
|
| 2 |
import pandas as pd
|
| 3 |
+
import plotly.express as px
|
| 4 |
+
from plotly.subplots import make_subplots
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def plot_genbit(result_json):
    """Build a pie chart of the gender-definition-word percentages.

    Args:
        result_json: Mapping that provides the keys
            ``percentage_of_female_gender_definition_words``,
            ``percentage_of_male_gender_definition_words`` and
            ``percentage_of_non_binary_gender_definition_words``.

    Returns:
        A Plotly Express pie figure with one slice per gender category,
        labelled inside each slice with its percentage and name.
    """
    # NOTE: only the female/male/non-binary split is plotted. The trans/cis
    # percentages were previously read into an unused DataFrame; that dead
    # work (and the commented-out subplot code) has been removed.
    gender_shares = pd.DataFrame(
        {
            "Metric": [
                "Female Gender",
                "Male Gender",
                "Non-Binary Gender",
            ],
            "Value": [
                result_json["percentage_of_female_gender_definition_words"],
                result_json["percentage_of_male_gender_definition_words"],
                result_json["percentage_of_non_binary_gender_definition_words"],
            ],
        }
    )

    fig = px.pie(
        gender_shares,
        names="Metric",
        values="Value",
        title="Combined Gender Definition Words Distribution",
    )
    fig.update_traces(textposition="inside", textinfo="percent+label")

    return fig
|
| 56 |
|
| 57 |
|
| 58 |
def eval_genbit(data):
|
|
|
|
| 71 |
.rename(columns={"index": "Metric", 0: "Value"})
|
| 72 |
)
|
| 73 |
|
| 74 |
+
result_plot = plot_genbit(result_json)
|
| 75 |
result_conclusion = ""
|
| 76 |
|
| 77 |
return result_df, result_plot, result_conclusion
|
scripts/gender_profession_bias.py
CHANGED
|
@@ -13,6 +13,20 @@ nlp = English()
|
|
| 13 |
nlp.add_pipe("sentencizer")
|
| 14 |
|
| 15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
def get_split_text(text):
|
| 17 |
doc = nlp(text)
|
| 18 |
sentences = [sent for sent in doc.sents]
|
|
@@ -71,20 +85,6 @@ def get_gender_prof_match_details(df_text):
|
|
| 71 |
return results
|
| 72 |
|
| 73 |
|
| 74 |
-
def call_multiprocessing_pool(df_text):
|
| 75 |
-
concurrent = 2000
|
| 76 |
-
pool = multiprocessing.pool.ThreadPool(processes=concurrent)
|
| 77 |
-
result_list = pool.map(get_gender_prof_match_details, df_text, chunksize=1)
|
| 78 |
-
pool.close()
|
| 79 |
-
|
| 80 |
-
flat_return_list = [item for sublist in result_list for item in sublist]
|
| 81 |
-
|
| 82 |
-
cols = ["Split Text", "Male Pronoun", "Female Pronoun", "Profession", "Both Match"]
|
| 83 |
-
return_df = pd.DataFrame(flat_return_list, columns=cols)
|
| 84 |
-
|
| 85 |
-
return return_df
|
| 86 |
-
|
| 87 |
-
|
| 88 |
def get_statistics(result):
|
| 89 |
stats = {
|
| 90 |
"both_gender_prof_match": str((result["Both Match"] == "Yes").sum()),
|
|
@@ -102,8 +102,40 @@ def get_statistics(result):
|
|
| 102 |
return stats
|
| 103 |
|
| 104 |
|
| 105 |
-
def get_plot(
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
|
| 109 |
def eval_gender_profession(data):
|
|
|
|
| 13 |
nlp.add_pipe("sentencizer")
|
| 14 |
|
| 15 |
|
| 16 |
+
def call_multiprocessing_pool(df_text):
    """Run ``get_gender_prof_match_details`` over every item on a thread pool.

    Args:
        df_text: Iterable whose items are passed, one per task, to
            ``get_gender_prof_match_details``.

    Returns:
        A ``pandas.DataFrame`` with the columns "Split Text", "Male Pronoun",
        "Female Pronoun", "Profession" and "Both Match", built by
        concatenating the row lists returned for each item, in input order.
    """
    texts = list(df_text)

    # ThreadPool starts all of its workers eagerly, so never start more
    # threads than there are items (the previous fixed 2000 stays the cap).
    max_workers = 2000
    worker_count = max(1, min(max_workers, len(texts)))

    # The context manager releases the worker threads even if a task raises;
    # previously the pool was only closed on the success path.
    with multiprocessing.pool.ThreadPool(processes=worker_count) as pool:
        result_list = pool.map(get_gender_prof_match_details, texts, chunksize=1)

    flat_return_list = [item for sublist in result_list for item in sublist]

    cols = ["Split Text", "Male Pronoun", "Female Pronoun", "Profession", "Both Match"]
    return pd.DataFrame(flat_return_list, columns=cols)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
def get_split_text(text):
|
| 31 |
doc = nlp(text)
|
| 32 |
sentences = [sent for sent in doc.sents]
|
|
|
|
| 85 |
return results
|
| 86 |
|
| 87 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
def get_statistics(result):
|
| 89 |
stats = {
|
| 90 |
"both_gender_prof_match": str((result["Both Match"] == "Yes").sum()),
|
|
|
|
| 102 |
return stats
|
| 103 |
|
| 104 |
|
| 105 |
+
def get_plot(result_json):
    """Build a pie chart from the gender/profession match counters.

    Args:
        result_json: Mapping whose values for the five counter keys listed
            below can be converted with ``int()``.

    Returns:
        A Plotly Express pie figure with one slice per counter.
    """
    # (slice label, key in result_json), in the order the slices are emitted.
    slices = (
        ("Both Gender & Profession Match", "both_gender_prof_match"),
        ("Male Pronoun", "count_male_pronoun"),
        ("Female Pronoun", "count_female_pronoun"),
        ("Male Pronoun & Profession", "count_male_pronoun_profession"),
        ("Female Pronoun & Profession", "count_female_pronoun_profession"),
    )

    chart_data = {
        "Labels": [label for label, _ in slices],
        "Values": [int(result_json[key]) for _, key in slices],
    }

    return px.pie(
        chart_data,
        names="Labels",
        values="Values",
        title="Gender & Profession Match Statistics",
    )
|
| 139 |
|
| 140 |
|
| 141 |
def eval_gender_profession(data):
|