| import re | |
| import json | |
| import plotly.express as px | |
| import pandas as pd | |
def _compile_lexicon_pattern(terms):
    """Build a whole-word regex that matches any of the given lexicon terms.

    Args:
        terms: Iterable of literal terms; each one is escaped before use.

    Returns:
        A compiled pattern. When ``terms`` is empty the pattern never matches.
    """
    if not terms:
        # An empty alternation would compile to r"\b()\b", which matches the
        # empty string at every word boundary and inflates the term counts.
        return re.compile(r"(?!)")
    # Set iteration order is not stable between runs; sorting longest-first
    # keeps the alternation deterministic and lets a longer term win over a
    # shorter term that is its prefix.
    ordered_terms = sorted(terms, key=lambda term: (-len(term), term))
    return re.compile(r"\b({})\b".format("|".join(map(re.escape, ordered_terms))))


# JSON text is UTF-8; do not depend on the platform's default encoding.
with open("config/gender_lexicons.json", "r", encoding="utf-8") as lexicon_file:
    gender_lexicons = json.load(lexicon_file)

# Lexicons are read from the "male_lexicons" / "female_lexicons" keys.
male_lexicon = set(gender_lexicons.get("male_lexicons"))
female_lexicon = set(gender_lexicons.get("female_lexicons"))

# Patterns are compiled once at import time and reused for every text.
male_pattern = _compile_lexicon_pattern(male_lexicon)
female_pattern = _compile_lexicon_pattern(female_lexicon)
def count_gender_terms(text, gender_pattern):
    """Count the non-overlapping matches of ``gender_pattern`` in ``text``.

    Args:
        text: The text to scan. A value that is neither ``str`` nor ``bytes``
            (for example ``None`` or a float NaN standing in for a missing
            cell) is treated as containing no terms.
        gender_pattern: A compiled regular expression or a pattern string.

    Returns:
        The number of matches as an ``int``; ``0`` for non-text input.
    """
    if not isinstance(text, (str, bytes)):
        # The regex engine raises TypeError on None / NaN; a missing text
        # simply has no gendered terms.
        return 0
    # Count lazily instead of materialising the list of matches.
    return sum(1 for _ in re.finditer(gender_pattern, text))
def get_gender_tag(count_male_terms, count_female_terms):
    """Classify a text by the balance of male and female term counts.

    Args:
        count_male_terms: Number of male lexicon matches.
        count_female_terms: Number of female lexicon matches.

    Returns:
        One of the labels:
        ``"No Gender"`` when both counts are zero,
        ``"Equal Gender"`` when the counts are equal and non-zero,
        ``"Male Strongly Positive Gender"`` / ``"Female Strongly Positive
        Gender"`` when that gender holds at least 75% of all matches,
        ``"Male Positive Gender"`` / ``"Female Positive Gender"`` when that
        gender holds more than 50% but less than 75%.
    """
    total_terms = count_male_terms + count_female_terms
    if total_terms == 0:
        return "No Gender"

    # A 50/50 split must be decided before the per-gender thresholds;
    # otherwise a ">= 50" male check claims it and "Equal Gender" can never
    # be returned.
    if count_male_terms == count_female_terms:
        return "Equal Gender"

    if count_male_terms > count_female_terms:
        male_proportion = (count_male_terms / total_terms) * 100
        if male_proportion >= 75:
            return "Male Strongly Positive Gender"
        return "Male Positive Gender"

    female_proportion = (count_female_terms / total_terms) * 100
    if female_proportion >= 75:
        return "Female Strongly Positive Gender"
    return "Female Positive Gender"
def get_gender_category_counts(sample_df):
    """Tally how many rows fall into each gender category.

    Args:
        sample_df: DataFrame with a ``"gender_category"`` column.

    Returns:
        A dict keyed by the six category labels, in a fixed order, whose
        values are the row counts rendered as strings (``"0"`` for a label
        that does not occur).
    """
    occurrences = sample_df["gender_category"].value_counts()

    counts_by_label = {}
    for category in (
        "No Gender",
        "Equal Gender",
        "Male Positive Gender",
        "Male Strongly Positive Gender",
        "Female Positive Gender",
        "Female Strongly Positive Gender",
    ):
        # Labels that never occur are reported as "0" rather than omitted.
        counts_by_label[category] = str(occurrences.get(category, 0))
    return counts_by_label
def plot_gender_category_counts(gender_labels):
    """Draw the gender category counts as a pie chart.

    Args:
        gender_labels: Mapping from each of the six category labels to its
            count. A missing label raises ``KeyError``.

    Returns:
        The figure returned by ``px.pie``, with every slice pulled out,
        percent and label shown on the slices, a black outline and no legend.
    """
    category_names = [
        "No Gender",
        "Equal Gender",
        "Male Positive Gender",
        "Male Strongly Positive Gender",
        "Female Positive Gender",
        "Female Strongly Positive Gender",
    ]
    slice_values = []
    for category in category_names:
        slice_values.append(gender_labels[category])

    pie_chart = px.pie(
        values=slice_values,
        names=category_names,
        title="Gender Distribution",
        category_orders={"names": category_names},
    )

    # Same pull for every slice, thin black outline around each one.
    slice_outline = {"line": {"color": "#000000", "width": 1}}
    pie_chart.update_traces(
        pull=[0.1] * len(category_names),
        textinfo="percent+label",
        marker=slice_outline,
    )
    pie_chart.update_layout(showlegend=False)
    return pie_chart
def eval_gender_distribution(data):
    """Categorise each text by gendered-term balance and summarise the result.

    The first column of ``data`` is taken as the text column. As a side
    effect, ``data`` is modified in place: the text column is lower-cased and
    stripped, and the columns ``count_male_terms``, ``count_female_terms`` and
    ``gender_category`` are added.

    Args:
        data: DataFrame whose first column holds the texts. Cells that are
            missing or not strings are counted as having no gendered terms.

    Returns:
        A tuple ``(result_df, result_plot, result_conclusion)``:
        a two-column ``Metric`` / ``Value`` DataFrame of category counts, the
        pie chart figure for those counts, and an empty conclusion string.
    """
    text_column = data.columns[0]
    data[text_column] = data[text_column].str.lower().str.strip()

    def _count_terms(text, gender_pattern):
        # The .str accessor leaves missing / non-string cells as NaN, which
        # the regex engine rejects with TypeError; count them as zero hits.
        if not isinstance(text, str):
            return 0
        return count_gender_terms(text, gender_pattern)

    data["count_male_terms"] = data[text_column].apply(
        lambda text: _count_terms(text, male_pattern)
    )
    data["count_female_terms"] = data[text_column].apply(
        lambda text: _count_terms(text, female_pattern)
    )

    # Pair the two count columns directly instead of a row-wise
    # DataFrame.apply, which builds a Series object for every row.
    data["gender_category"] = [
        get_gender_tag(male_count, female_count)
        for male_count, female_count in zip(
            data["count_male_terms"], data["count_female_terms"]
        )
    ]

    result_json = get_gender_category_counts(data)
    result_plot = plot_gender_category_counts(result_json)
    result_df = (
        pd.DataFrame.from_dict(result_json, orient="index")
        .reset_index()
        .rename(columns={"index": "Metric", 0: "Value"})
    )
    result_conclusion = ""
    return result_df, result_plot, result_conclusion