| import re | |
| import json | |
| import plotly.express as px | |
| import pandas as pd | |
def load_gender_lexicons():
    """Load the gender lexicons from ``config/gender_lexicons.json``.

    The path is relative to the current working directory. The file is
    decoded explicitly as UTF-8 so that lexicon terms containing non-ASCII
    characters are read the same way regardless of the platform's default
    encoding.

    Returns:
        The parsed JSON document. ``analyze_text`` reads the
        ``"male_lexicons"`` and ``"female_lexicons"`` keys from it.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(
        "config/gender_lexicons.json", "r", encoding="utf-8"
    ) as lexicon_file:
        return json.load(lexicon_file)
def count_gender_terms(text, gender_pattern):
    """Return the number of matches of ``gender_pattern`` in ``text``.

    Args:
        text: The string to scan.
        gender_pattern: A compiled regular expression.

    Returns:
        The count of non-overlapping matches, scanning left to right.
    """
    match_total = 0
    for _ in gender_pattern.finditer(text):
        match_total += 1
    return match_total
def get_gender_tag(count_male_terms, count_female_terms):
    """Classify a text by the balance of male and female term counts.

    Args:
        count_male_terms: Number of male lexicon matches.
        count_female_terms: Number of female lexicon matches.

    Returns:
        One of:
        - ``"No Gender"`` when both counts are zero.
        - ``"Equal Gender"`` when both counts are equal and non-zero.
        - ``"Male Strongly Positive Gender"`` when male terms are at least
          75% of the total.
        - ``"Male Positive Gender"`` when male terms are more than 50% but
          less than 75% of the total.
        - ``"Female Strongly Positive Gender"`` when female terms are at
          least 75% of the total.
        - ``"Female Positive Gender"`` when female terms are more than 50%
          but less than 75% of the total.
    """
    total_terms = count_male_terms + count_female_terms
    if total_terms == 0:
        return "No Gender"

    # A tie must be handled before the proportion checks; otherwise a 50/50
    # split would be reported as male-leaning and "Equal Gender" could never
    # be returned.
    if count_male_terms == count_female_terms:
        return "Equal Gender"

    male_proportion = (count_male_terms / total_terms) * 100
    female_proportion = (count_female_terms / total_terms) * 100

    if male_proportion >= 75:
        return "Male Strongly Positive Gender"
    if male_proportion > 50:
        return "Male Positive Gender"
    if female_proportion >= 75:
        return "Female Strongly Positive Gender"
    # Counts differ and male share is below 50%, so female share is in
    # the (50, 75) range here.
    return "Female Positive Gender"
def _compile_lexicon_pattern(terms):
    """Compile a whole-word regex that matches any of the lexicon ``terms``."""
    # Terms are lowercased because the analysed text is lowercased too; the
    # empty string is dropped because it would match at every word boundary.
    normalized = {term.lower() for term in (terms or ())}
    normalized.discard("")
    if not normalized:
        # Empty negative lookahead: a pattern that can never match, so a
        # missing or empty lexicon yields a count of zero.
        return re.compile(r"(?!)")
    # Longest term first (ties broken alphabetically) keeps the alternation
    # order deterministic and lets multi-word terms win over their prefixes.
    ordered = sorted(normalized, key=lambda term: (-len(term), term))
    return re.compile(r"\b({})\b".format("|".join(map(re.escape, ordered))))


def analyze_text(text, gender_lexicons):
    """Count male and female lexicon terms in ``text`` and tag the result.

    Args:
        text: The text to analyse. It is lowercased and stripped before
            matching. A non-string value (for example a missing cell) is
            treated as empty text.
        gender_lexicons: Mapping with the keys ``"male_lexicons"`` and
            ``"female_lexicons"``, each an iterable of term strings. A
            missing or empty entry contributes zero matches.

    Returns:
        A tuple ``(count_male_terms, count_female_terms, gender_category)``
        where ``gender_category`` is the label returned by
        ``get_gender_tag`` for the two counts.
    """
    male_pattern = _compile_lexicon_pattern(gender_lexicons.get("male_lexicons"))
    female_pattern = _compile_lexicon_pattern(
        gender_lexicons.get("female_lexicons")
    )

    if not isinstance(text, str):
        text = ""
    text = text.lower().strip()

    count_male_terms = count_gender_terms(text, male_pattern)
    count_female_terms = count_gender_terms(text, female_pattern)
    gender_category = get_gender_tag(count_male_terms, count_female_terms)
    return count_male_terms, count_female_terms, gender_category
def plot_gender_category_counts(labels, values):
    """Build a pie chart of gender-category counts.

    Args:
        labels: Slice names, passed to ``px.pie`` as ``names``.
        values: Slice sizes, passed to ``px.pie`` as ``values``.

    Returns:
        The figure created by ``px.pie``, titled "Gender Distribution", with
        every slice pulled out, percent-and-label text, a thin black slice
        outline and the legend hidden.
    """
    fig = px.pie(
        values=values,
        names=labels,
        title="Gender Distribution",
    )
    fig.update_traces(
        # A scalar pull applies to every slice, so the chart no longer
        # assumes exactly six categories.
        pull=0.1,
        textinfo="percent+label",
        marker=dict(
            line=dict(color="#000000", width=1),
        ),
    )
    fig.update_layout(showlegend=False)
    return fig
def eval_gender_distribution(data):
    """Analyse the first column of ``data`` and summarise gender categories.

    Each value in the first column is passed to ``analyze_text`` together
    with the lexicons from ``load_gender_lexicons``. The per-row results are
    written back into ``data`` (the frame is modified in place) as the
    columns ``count_male_terms``, ``count_female_terms`` and
    ``gender_category``.

    Args:
        data: A pandas DataFrame whose first column holds the texts.

    Returns:
        A tuple ``(result_df, result_plot)``: ``result_df`` has the columns
        ``"Metric"`` (category label) and ``"Value"`` (row count, zero for
        categories that do not occur); ``result_plot`` is the figure
        returned by ``plot_gender_category_counts``.

    Raises:
        IndexError: If ``data`` has no columns.
    """
    gender_lexicons = load_gender_lexicons()

    analyzed_rows = [
        analyze_text(text, gender_lexicons) for text in data[data.columns[0]]
    ]
    if analyzed_rows:
        male_counts, female_counts, categories = map(list, zip(*analyzed_rows))
    else:
        # zip(*[]) yields nothing, which cannot be unpacked into three
        # names; an empty frame simply gets three empty columns.
        male_counts, female_counts, categories = [], [], []

    data["count_male_terms"] = male_counts
    data["count_female_terms"] = female_counts
    data["gender_category"] = categories

    gender_labels = [
        "No Gender",
        "Equal Gender",
        "Male Positive Gender",
        "Male Strongly Positive Gender",
        "Female Positive Gender",
        "Female Strongly Positive Gender",
    ]
    # Reindex so every label appears, in a fixed order, even with no rows.
    category_counts = (
        data["gender_category"].value_counts().reindex(gender_labels, fill_value=0)
    )

    result_df = pd.DataFrame(
        {"Metric": category_counts.index, "Value": category_counts.values}
    )
    result_plot = plot_gender_category_counts(gender_labels, category_counts)
    return result_df, result_plot