Spaces:

avid-ml
/

biasaware

Sleeping

biasaware / scripts /gender_profession_bias.py

freyam

Add sample size limit and AVID report

8ab9329 about 2 years ago

4.31 kB

	import re
	import json

	import pandas as pd
	import plotly.express as px
	import multiprocessing.pool
	from spacy.lang.en import English


	nlp = English()
	nlp.add_pipe("sentencizer")


	def call_multiprocessing_pool(df_text):
	    """Run the per-text gender/profession matcher over all texts in a thread pool.

	    Args:
	        df_text: Iterable of texts; every item is passed on its own to
	            get_gender_prof_match_details.

	    Returns:
	        pandas.DataFrame with one row per sentence found in the texts and
	        the columns "Split Text", "Male Pronoun", "Female Pronoun",
	        "Profession" and "Both Match".
	    """
	    worker_count = multiprocessing.cpu_count()

	    # The context manager tears the pool down even when a worker raises.
	    # map() blocks until every result is collected, so nothing is still
	    # pending when the block exits.
	    with multiprocessing.pool.ThreadPool(processes=worker_count) as pool:
	        result_list = pool.map(
	            get_gender_prof_match_details, df_text, chunksize=1
	        )

	    # Each call returns a list of per-sentence rows; flatten them into one list.
	    flat_return_list = [row for rows in result_list for row in rows]

	    cols = ["Split Text", "Male Pronoun", "Female Pronoun", "Profession", "Both Match"]
	    return pd.DataFrame(flat_return_list, columns=cols)


	def get_gender_prof_match_details(df_text):
	    """Find gendered pronouns and profession terms in each sentence of a text.

	    The lexicons are read from "config/gender_lexicons.json" and
	    "config/profession_lexicons.json" (paths relative to the current
	    working directory) on every call.

	    Args:
	        df_text: The text to analyse; it is split into sentences with the
	            module-level ``nlp`` pipeline.

	    Returns:
	        A list with one 5-tuple per sentence:
	        (sentence text, comma-joined male pronoun matches, comma-joined
	        female pronoun matches, comma-joined profession matches,
	        "Yes" if the sentence has a profession match together with at
	        least one male or female pronoun match, otherwise "No").
	    """
	    # Context managers close the lexicon files instead of leaking the handles.
	    with open("config/gender_lexicons.json", "r", encoding="utf-8") as lexicon_file:
	        gender_lexicons = json.load(lexicon_file)
	    with open("config/profession_lexicons.json", "r", encoding="utf-8") as lexicon_file:
	        profession_lexicons = json.load(lexicon_file)

	    male_pronouns = gender_lexicons.get("male_pronouns")
	    female_pronouns = gender_lexicons.get("female_pronouns")
	    professions = profession_lexicons.get("professions")

	    # Join the terms with a bare "|" so the group is a regex alternation.
	    # An escaped "\|" is a literal pipe character and would never match a
	    # single lexicon term. Terms are interpolated into the pattern as-is.
	    male_pronoun_pat, female_pronoun_pat, professions_pat = (
	        re.compile(r"\b({})\b".format("|".join(terms)), flags=re.IGNORECASE)
	        for terms in (male_pronouns, female_pronouns, professions)
	    )

	    doc = nlp(df_text)

	    results = []
	    for sentence in doc.sents:
	        sentence_text = str(sentence)

	        male_pronoun_match = male_pronoun_pat.findall(sentence_text)
	        female_pronoun_match = female_pronoun_pat.findall(sentence_text)
	        prof_match = professions_pat.findall(sentence_text)

	        # A sentence counts as a joint match when a profession co-occurs
	        # with a pronoun of either gender.
	        has_pronoun = bool(male_pronoun_match or female_pronoun_match)
	        both_match = "Yes" if prof_match and has_pronoun else "No"

	        results.append(
	            (
	                sentence_text,
	                ",".join(male_pronoun_match),
	                ",".join(female_pronoun_match),
	                ",".join(prof_match),
	                both_match,
	            )
	        )

	    return results


	def get_statistics(result):
	    """Summarise the per-sentence match table as stringified counts.

	    Args:
	        result: pandas.DataFrame with the columns "Both Match",
	            "Male Pronoun", "Female Pronoun" and "Profession"; an empty
	            string in a match column means no match for that row.

	    Returns:
	        dict mapping each metric name to its count rendered as a string.
	    """
	    flagged_both = result["Both Match"] == "Yes"
	    has_male = result["Male Pronoun"] != ""
	    has_female = result["Female Pronoun"] != ""
	    has_profession = result["Profession"] != ""

	    counts = {
	        "both_gender_prof_match": flagged_both.sum(),
	        "count_male_pronoun": has_male.sum(),
	        "count_female_pronoun": has_female.sum(),
	        "count_male_pronoun_profession": (has_male & has_profession).sum(),
	        "count_female_pronoun_profession": (has_female & has_profession).sum(),
	        "total_sentence": len(result),
	    }

	    # Every value is reported as text, in the insertion order above.
	    return {metric: str(value) for metric, value in counts.items()}


	def get_plot(result_json):
	    """Build a pie chart from the match statistics.

	    Args:
	        result_json: Mapping holding the five count metrics read below;
	            each value must be convertible with int().

	    Returns:
	        The figure returned by plotly express ``px.pie``.
	    """
	    # Statistic key -> slice label, in the order the slices are listed.
	    slice_labels = {
	        "both_gender_prof_match": "Both Gender & Profession Match",
	        "count_male_pronoun": "Male Pronoun",
	        "count_female_pronoun": "Female Pronoun",
	        "count_male_pronoun_profession": "Male Pronoun & Profession",
	        "count_female_pronoun_profession": "Female Pronoun & Profession",
	    }

	    chart_data = {
	        "Labels": list(slice_labels.values()),
	        "Values": [int(result_json[key]) for key in slice_labels],
	    }

	    return px.pie(
	        chart_data,
	        names="Labels",
	        values="Values",
	        title="Gender & Profession Match Statistics",
	    )


	def eval_gender_profession(data):
	    """Evaluate gender/profession co-occurrence in the first column of a table.

	    Args:
	        data: pandas.DataFrame; only its first column is read. Missing
	            values in that column are skipped and the remaining values are
	            treated as text.

	    Returns:
	        A tuple ``(result_df, result_plot)``: a two-column DataFrame
	        ("Metric", "Value") built from get_statistics, and the figure
	        returned by get_plot for the same statistics.
	    """
	    # Missing cells would reach the sentence splitter as NaN and fail there;
	    # drop them, then coerce to str so the .str accessor works for any dtype.
	    texts = data[data.columns[0]].dropna().astype(str)
	    texts = texts.str.lower().str.strip()

	    result = call_multiprocessing_pool(texts)

	    result_json = get_statistics(result)
	    result_plot = get_plot(result_json)

	    # One row per metric: the dict keys become "Metric", the values "Value".
	    result_df = (
	        pd.DataFrame.from_dict(result_json, orient="index")
	        .reset_index()
	        .rename(columns={"index": "Metric", 0: "Value"})
	    )

	    return result_df, result_plot