| import re | |
| import json | |
| import pandas as pd | |
| import plotly.express as px | |
| import multiprocessing.pool | |
| from spacy.lang.en import English | |
# Shared module-level spaCy pipeline: an English() instance with the
# "sentencizer" component added. get_gender_prof_match_details() runs each
# text through it and iterates the resulting `doc.sents`.
nlp = English()
nlp.add_pipe("sentencizer")
def call_multiprocessing_pool(df_text):
    """Analyse every text in *df_text* on a thread pool and tabulate the rows.

    Args:
        df_text: Iterable of texts; each item is passed unchanged to
            ``get_gender_prof_match_details``.

    Returns:
        pandas.DataFrame with one row per tuple returned by the workers and
        the columns "Split Text", "Male Pronoun", "Female Pronoun",
        "Profession" and "Both Match".

    Raises:
        Whatever ``get_gender_prof_match_details`` raises for any item;
        ``pool.map`` re-raises worker exceptions in the caller.
    """
    worker_count = multiprocessing.cpu_count()

    # The context manager tears the pool down on every exit path, so worker
    # threads are not leaked when a worker raises. map() blocks until all
    # results are collected, so nothing is cut short on the success path.
    with multiprocessing.pool.ThreadPool(processes=worker_count) as pool:
        per_text_rows = pool.map(
            get_gender_prof_match_details, df_text, chunksize=1
        )

    # Each worker returns a list of row tuples; flatten them into one list.
    rows = [row for text_rows in per_text_rows for row in text_rows]
    cols = [
        "Split Text",
        "Male Pronoun",
        "Female Pronoun",
        "Profession",
        "Both Match",
    ]
    return pd.DataFrame(rows, columns=cols)
# Compiled (male pronoun, female pronoun, profession) patterns. Built on first
# use by _load_lexicon_patterns() and reused by every later call.
_LEXICON_PATTERNS = None


def _load_lexicon_patterns():
    """Return the compiled (male, female, profession) regexes, loading them once."""
    global _LEXICON_PATTERNS
    if _LEXICON_PATTERNS is None:
        # `with` closes the config files deterministically instead of leaving
        # the handles to the garbage collector. JSON is UTF-8 by specification.
        with open("config/gender_lexicons.json", "r", encoding="utf-8") as handle:
            gender_lexicons = json.load(handle)
        with open("config/profession_lexicons.json", "r", encoding="utf-8") as handle:
            profession_lexicons = json.load(handle)

        word_lists = (
            gender_lexicons.get("male_pronouns"),
            gender_lexicons.get("female_pronouns"),
            profession_lexicons.get("professions"),
        )
        # Entries are joined unescaped, so each one is interpreted as a regex
        # fragment; the surrounding \b anchors restrict hits to whole words.
        # Threads racing on the first call may each build the tuple; the
        # results are equivalent, so whichever assignment lands last is fine.
        _LEXICON_PATTERNS = tuple(
            re.compile(r"\b({})\b".format("|".join(words)), flags=re.IGNORECASE)
            for words in word_lists
        )
    return _LEXICON_PATTERNS


def get_gender_prof_match_details(df_text):
    """Split *df_text* into sentences and record lexicon hits for each one.

    The lexicons are read from ``config/gender_lexicons.json`` (keys
    ``male_pronouns`` and ``female_pronouns``) and
    ``config/profession_lexicons.json`` (key ``professions``). They are
    loaded and compiled once per process rather than once per call.

    Args:
        df_text: Text to analyse; it is passed to the module-level ``nlp``
            pipeline and its ``doc.sents`` are iterated.

    Returns:
        A list with one 5-tuple per sentence:
        ``(sentence, male_hits, female_hits, profession_hits, both_match)``.
        The three ``*_hits`` fields are comma-joined matches ("" when there
        are none). ``both_match`` is "Yes" when the sentence contains a
        profession together with at least one male or female pronoun,
        otherwise "No".

    Raises:
        OSError: If a lexicon file cannot be opened.
        TypeError: If an expected lexicon key is absent (``"|".join(None)``).
    """
    male_pat, female_pat, profession_pat = _load_lexicon_patterns()

    results = []
    for sent in nlp(df_text).sents:
        sentence = str(sent)  # convert once; reused for all three searches
        male_hits = male_pat.findall(sentence)
        female_hits = female_pat.findall(sentence)
        profession_hits = profession_pat.findall(sentence)

        has_both = bool(profession_hits) and bool(male_hits or female_hits)
        results.append(
            (
                sentence,
                ",".join(male_hits),
                ",".join(female_hits),
                ",".join(profession_hits),
                "Yes" if has_both else "No",
            )
        )
    return results
def get_statistics(result):
    """Summarise the per-sentence match table as a dict of string counts.

    Args:
        result: DataFrame with the columns "Both Match", "Male Pronoun",
            "Female Pronoun" and "Profession". An empty string in the last
            three means no match for that sentence.

    Returns:
        dict mapping metric name to the count rendered with ``str``:
        sentences flagged "Yes" in "Both Match", sentences with a male
        pronoun, with a female pronoun, with a male pronoun and a
        profession, with a female pronoun and a profession, and the total
        number of rows.
    """
    flagged = result["Both Match"] == "Yes"
    has_male = result["Male Pronoun"] != ""
    has_female = result["Female Pronoun"] != ""
    has_profession = result["Profession"] != ""

    # Kept as an ordered sequence so the resulting dict preserves this order.
    counts = (
        ("both_gender_prof_match", flagged.sum()),
        ("count_male_pronoun", has_male.sum()),
        ("count_female_pronoun", has_female.sum()),
        ("count_male_pronoun_profession", (has_male & has_profession).sum()),
        ("count_female_pronoun_profession", (has_female & has_profession).sum()),
        ("total_sentence", len(result)),
    )
    return {metric: str(count) for metric, count in counts}
def get_plot(result_json):
    """Build a pie chart from the statistics mapping.

    Args:
        result_json: Mapping holding the keys "both_gender_prof_match",
            "count_male_pronoun", "count_female_pronoun",
            "count_male_pronoun_profession" and
            "count_female_pronoun_profession"; each value is converted
            with ``int``.

    Returns:
        The figure returned by ``px.pie`` with one slice per metric.

    Raises:
        KeyError: If one of the keys above is missing.
        ValueError: If a value cannot be converted by ``int``.
    """
    # (slice label, key in result_json), in display order.
    slices = (
        ("Both Gender & Profession Match", "both_gender_prof_match"),
        ("Male Pronoun", "count_male_pronoun"),
        ("Female Pronoun", "count_female_pronoun"),
        ("Male Pronoun & Profession", "count_male_pronoun_profession"),
        ("Female Pronoun & Profession", "count_female_pronoun_profession"),
    )
    values = [int(result_json[key]) for _, key in slices]
    labels = [label for label, _ in slices]

    chart_data = {"Labels": labels, "Values": values}
    return px.pie(
        chart_data,
        names="Labels",
        values="Values",
        title="Gender & Profession Match Statistics",
    )
def eval_gender_profession(data):
    """Run the gender/profession analysis on the first column of *data*.

    Args:
        data: DataFrame whose first column holds the texts to analyse.

    Returns:
        Tuple ``(result_df, result_plot)``: ``result_df`` has the columns
        "Metric" and "Value" with one row per statistic from
        ``get_statistics``; ``result_plot`` is the figure from ``get_plot``.
    """
    # Normalise case and surrounding whitespace. Missing or non-string cells
    # come out of the .str accessors as NaN; drop them so the sentence
    # splitter only ever receives real strings instead of failing on NaN.
    texts = data[data.columns[0]].str.lower().str.strip().dropna()

    result = call_multiprocessing_pool(texts)
    result_json = get_statistics(result)
    result_plot = get_plot(result_json)

    # One row per metric: the dict keys become "Metric", the values "Value".
    result_df = (
        pd.DataFrame.from_dict(result_json, orient="index")
        .reset_index()
        .rename(columns={"index": "Metric", 0: "Value"})
    )
    return result_df, result_plot