Spaces:

avid-ml
/

biasaware

Sleeping

App Files Files Community

freyam committed on Oct 19, 2023

Commit

d1a2df2

1 Parent(s): 8ed4d84

Add Gender-Profession and GenBit plots

Browse files

Files changed (3) hide show

app.py +35 -13
scripts/genbit.py +54 -1
scripts/gender_profession_bias.py +48 -16

app.py CHANGED Viewed

@@ -7,6 +7,7 @@ from scripts.gender_profession_bias import *
 from scripts.gender_distribution import *
 from datasets import load_dataset as hf_load_dataset
 MAX_THRESHOLD = 1000
 METHODOLOGIES = json.load(open("config/methodologies.json", "r"))
@@ -123,6 +124,32 @@ def load_dataset(local_dataset, hf_dataset):
     )
 def import_dataset(dataset_sampling_method, dataset_sampling_size, dataset_column):
     DATASET["sampling_method"] = dataset_sampling_method
     DATASET["sampling_size"] = dataset_sampling_size
@@ -178,16 +205,16 @@ with BiasAware:
                 label="Dataset", file_types=["csv"], value=None, visible=True
             )
             hf_dataset = gr.Textbox(visible=False)
             dataset_load_btn = gr.Button(visible=False)
             dataset_sampling_method = gr.Radio(visible=False)
             dataset_sampling_size = gr.Slider(visible=False)
             dataset_column = gr.Radio(visible=False)
             dataset_column_corpus = gr.Dataframe(visible=False)
-            dataset_import_btn = gr.Button(visible=False)
         with gr.Column(scale=2):
             methodology_title = gr.Markdown("## Methodology")
@@ -197,8 +224,6 @@ with BiasAware:
                 choices=METHODOLOGIES.keys(),
             )
-            methodology_description = gr.Markdown(visible=False)
             evaluation_btn = gr.Button(
                 value="Evaluate",
                 interactive=False,
@@ -206,6 +231,8 @@ with BiasAware:
                 visible=True,
             )
         with gr.Column(scale=2):
             result_title = gr.Markdown("## Results")
@@ -230,7 +257,7 @@ with BiasAware:
             gr.Textbox(
                 label="HuggingFace Hub",
                 placeholder="Search for a dataset",
-                value=None,
                 interactive=True,
                 visible=True,
             )
@@ -268,19 +295,14 @@ with BiasAware:
     )
     hf_dataset.submit(
-        fn=lambda _: gr.Button(
-            value=f"Load",
-            interactive=True,
-            variant="secondary",
-            visible=True,
-        ),
         inputs=[hf_dataset],
-        outputs=[dataset_load_btn],
     )
     dataset_load_btn.click(
         fn=load_dataset,
-        inputs=[local_dataset, hf_dataset],
         outputs=[
             dataset_sampling_method,
             dataset_sampling_size,

 from scripts.gender_distribution import *
 from datasets import load_dataset as hf_load_dataset
+from huggingface_hub import DatasetFilter, list_datasets
 MAX_THRESHOLD = 1000
 METHODOLOGIES = json.load(open("config/methodologies.json", "r"))
     )
def show_hf_dataset_search_results(hf_dataset):
    """Search the HuggingFace Hub and build the widgets that show the matches.

    Args:
        hf_dataset: Text typed into the Hub search box; it is passed to
            ``DatasetFilter`` as ``dataset_name``.

    Returns:
        A ``(gr.Button, gr.Radio)`` tuple: the "Load" button made visible and
        interactive, and a radio group listing the ids of the matching
        datasets. The first match is pre-selected; when the search returns
        nothing the radio is shown with no choices and no selection.
    """
    # The query is restricted to language="en" and capped at 10 results.
    choices = [
        dataset.id
        for dataset in list_datasets(
            filter=DatasetFilter(dataset_name=hf_dataset, language="en"), limit=10
        )
    ]

    return (
        gr.Button(
            value="Load",
            interactive=True,
            variant="secondary",
            visible=True,
        ),
        gr.Radio(
            label="HuggingFace Hub Search Results",
            info="Select the dataset to be imported",
            choices=choices,
            # A search with no matches must not raise IndexError on choices[0].
            value=choices[0] if choices else None,
            interactive=True,
            visible=True,
        ),
    )
 def import_dataset(dataset_sampling_method, dataset_sampling_size, dataset_column):
     DATASET["sampling_method"] = dataset_sampling_method
     DATASET["sampling_size"] = dataset_sampling_size
                 label="Dataset", file_types=["csv"], value=None, visible=True
             )
             hf_dataset = gr.Textbox(visible=False)
+            hf_dataset_search_results = gr.Radio(visible=False)
             dataset_load_btn = gr.Button(visible=False)
+            dataset_import_btn = gr.Button(visible=False)
             dataset_sampling_method = gr.Radio(visible=False)
             dataset_sampling_size = gr.Slider(visible=False)
             dataset_column = gr.Radio(visible=False)
             dataset_column_corpus = gr.Dataframe(visible=False)
         with gr.Column(scale=2):
             methodology_title = gr.Markdown("## Methodology")
                 choices=METHODOLOGIES.keys(),
             )
             evaluation_btn = gr.Button(
                 value="Evaluate",
                 interactive=False,
                 visible=True,
             )
+            methodology_description = gr.Markdown(visible=False)
         with gr.Column(scale=2):
             result_title = gr.Markdown("## Results")
             gr.Textbox(
                 label="HuggingFace Hub",
                 placeholder="Search for a dataset",
+                value="amazon_multi",
                 interactive=True,
                 visible=True,
             )
     )
     hf_dataset.submit(
+        fn=show_hf_dataset_search_results,
         inputs=[hf_dataset],
+        outputs=[dataset_load_btn, hf_dataset_search_results],
     )
     dataset_load_btn.click(
         fn=load_dataset,
+        inputs=[local_dataset, hf_dataset_search_results],
         outputs=[
             dataset_sampling_method,
             dataset_sampling_size,

scripts/genbit.py CHANGED Viewed

@@ -1,5 +1,58 @@
 from genbit.genbit_metrics import GenBitMetrics
 import pandas as pd
 def eval_genbit(data):
@@ -18,7 +71,7 @@ def eval_genbit(data):
         .rename(columns={"index": "Metric", 0: "Value"})
     )
-    result_plot = None
     result_conclusion = ""
     return result_df, result_plot, result_conclusion

 from genbit.genbit_metrics import GenBitMetrics
 import pandas as pd
+import plotly.express as px
+from plotly.subplots import make_subplots
def plot_genbit(result_json):
    """Build a pie chart of the gender-definition word percentages.

    Only the female / male / non-binary split is charted.

    Args:
        result_json: Mapping that provides the keys
            ``percentage_of_female_gender_definition_words``,
            ``percentage_of_male_gender_definition_words`` and
            ``percentage_of_non_binary_gender_definition_words``.

    Returns:
        The Plotly Express pie figure, with the percentage and the label
        drawn inside each slice.

    Raises:
        KeyError: if ``result_json`` is a dict lacking one of those keys.
    """
    distribution = pd.DataFrame(
        {
            "Metric": [
                "Female Gender",
                "Male Gender",
                "Non-Binary Gender",
            ],
            "Value": [
                result_json["percentage_of_female_gender_definition_words"],
                result_json["percentage_of_male_gender_definition_words"],
                result_json["percentage_of_non_binary_gender_definition_words"],
            ],
        }
    )

    fig = px.pie(
        distribution,
        names="Metric",
        values="Value",
        title="Combined Gender Definition Words Distribution",
    )
    fig.update_traces(textposition="inside", textinfo="percent+label")

    return fig
 def eval_genbit(data):
         .rename(columns={"index": "Metric", 0: "Value"})
     )
+    result_plot = plot_genbit(result_json)
     result_conclusion = ""
     return result_df, result_plot, result_conclusion

scripts/gender_profession_bias.py CHANGED Viewed

@@ -13,6 +13,20 @@ nlp = English()
 nlp.add_pipe("sentencizer")
 def get_split_text(text):
     doc = nlp(text)
     sentences = [sent for sent in doc.sents]
@@ -71,20 +85,6 @@ def get_gender_prof_match_details(df_text):
     return results
-def call_multiprocessing_pool(df_text):
-    concurrent = 2000
-    pool = multiprocessing.pool.ThreadPool(processes=concurrent)
-    result_list = pool.map(get_gender_prof_match_details, df_text, chunksize=1)
-    pool.close()
-    flat_return_list = [item for sublist in result_list for item in sublist]
-    cols = ["Split Text", "Male Pronoun", "Female Pronoun", "Profession", "Both Match"]
-    return_df = pd.DataFrame(flat_return_list, columns=cols)
-    return return_df
 def get_statistics(result):
     stats = {
         "both_gender_prof_match": str((result["Both Match"] == "Yes").sum()),
@@ -102,8 +102,40 @@ def get_statistics(result):
     return stats
-def get_plot(result_df):
-    return
 def eval_gender_profession(data):

 nlp.add_pipe("sentencizer")
def call_multiprocessing_pool(df_text):
    """Run ``get_gender_prof_match_details`` over every item on a thread pool.

    Args:
        df_text: Iterable of inputs; each item is handed to
            ``get_gender_prof_match_details`` on its own.

    Returns:
        A ``pandas.DataFrame`` with the columns "Split Text", "Male Pronoun",
        "Female Pronoun", "Profession" and "Both Match", built from the
        concatenation of the per-item result lists. Empty input yields an
        empty frame that still has those columns.
    """
    concurrent = 2000
    items = list(df_text)

    # Never start more threads than there are items to process; ThreadPool
    # rejects a size of 0, hence the floor of 1 for empty input.
    workers = max(1, min(concurrent, len(items)))

    # pool.map blocks until every item is done, so the terminate() issued when
    # the with-block exits only releases the idle worker threads.
    with multiprocessing.pool.ThreadPool(processes=workers) as pool:
        result_list = pool.map(get_gender_prof_match_details, items, chunksize=1)

    flat_return_list = [item for sublist in result_list for item in sublist]

    cols = ["Split Text", "Male Pronoun", "Female Pronoun", "Profession", "Both Match"]
    return pd.DataFrame(flat_return_list, columns=cols)
 def get_split_text(text):
     doc = nlp(text)
     sentences = [sent for sent in doc.sents]
     return results
 def get_statistics(result):
     stats = {
         "both_gender_prof_match": str((result["Both Match"] == "Yes").sum()),
     return stats
def get_plot(result_json):
    """Render the gender/profession match counts as a pie chart.

    Args:
        result_json: Mapping with the keys ``both_gender_prof_match``,
            ``count_male_pronoun``, ``count_female_pronoun``,
            ``count_male_pronoun_profession`` and
            ``count_female_pronoun_profession``; every value is passed
            through ``int()`` before it is charted.

    Returns:
        The Plotly Express pie figure, one slice per count.
    """
    # (slice label, key in result_json), in the order the slices are listed.
    slices = (
        ("Both Gender & Profession Match", "both_gender_prof_match"),
        ("Male Pronoun", "count_male_pronoun"),
        ("Female Pronoun", "count_female_pronoun"),
        ("Male Pronoun & Profession", "count_male_pronoun_profession"),
        ("Female Pronoun & Profession", "count_female_pronoun_profession"),
    )

    chart_data = {
        "Labels": [label for label, _ in slices],
        "Values": [int(result_json[key]) for _, key in slices],
    }

    return px.pie(
        chart_data,
        names="Labels",
        values="Values",
        title="Gender & Profession Match Statistics",
    )
 def eval_gender_profession(data):