Spaces:

emmatliu
/

LLMReferenceLetterBias

Runtime error

App Files Files Community

LLMReferenceLetterBias / biases_lexical_content.py

emmatliu

Upload 6 files

a269338 verified over 1 year ago

raw

history blame contribute delete

4.78 kB

	import spacy
	from spacy.matcher import Matcher
	from collections import Counter
	from operator import itemgetter
	import pandas as pd
	from tqdm import tqdm
	import scipy.stats as stats
	from argparse import ArgumentParser

	def calculate_dict(female_array, male_array):
	    """Count word occurrences for both groups over a shared vocabulary.

	    Args:
	        female_array: Iterable of words collected from the female texts.
	        male_array: Iterable of words collected from the male texts.

	    Returns:
	        A ``(female_counter, male_counter)`` tuple of ``Counter`` objects.
	        Any word seen on only one side is stored with an explicit count
	        of 0 on the other, so both counters end up with the same keys.
	    """
	    female_counts = Counter(female_array)
	    male_counts = Counter(male_array)

	    # Zero-fill in both directions: first words only the female side has,
	    # then words only the male side has.
	    for source, target in ((female_counts, male_counts),
	                           (male_counts, female_counts)):
	        for word in set(source) - set(target):
	            target[word] = 0

	    return female_counts, male_counts

	def odds_ratio(f_dict, m_dict, topk=50, threshold=20):
	    """Rank words by how strongly they skew toward the male or female texts.

	    For every word occurring at least ``threshold`` times in BOTH
	    dictionaries the odds ratio is computed as::

	        (m_num / f_num) / (non_m_num / non_f_num)

	    where ``non_*_num`` is the total count of all other words on that
	    side. Larger values mean the word is relatively more frequent in the
	    male counts; smaller values mean relatively more frequent in the
	    female counts.

	    Args:
	        f_dict: Mapping of word -> occurrence count for the female texts.
	        m_dict: Mapping of word -> occurrence count for the male texts.
	            Must have the same number of keys as ``f_dict``.
	        topk: Maximum number of entries kept in each returned dict.
	        threshold: Minimum count a word needs on both sides to be scored.

	    Returns:
	        A tuple of two dicts mapping word -> odds ratio (rounded to two
	        decimals): the first holds the ``topk`` largest ratios in
	        descending order, the second the ``topk`` smallest ratios in
	        ascending order.

	    Raises:
	        ValueError: If the two dictionaries differ in number of keys.
	    """
	    # Stand-in for a zero denominator, so a word that makes up *all*
	    # occurrences on one side no longer raises ZeroDivisionError.
	    very_small_value = 0.00001

	    if len(f_dict) != len(m_dict):
	        raise ValueError('The category for analyzing the male and female should be the same!')

	    total_num_f = sum(f_dict.values())
	    total_num_m = sum(m_dict.values())

	    ratios = {}
	    for key, f_num in f_dict.items():
	        m_num = m_dict[key]
	        # Only consider words with at least `threshold` occurrences for
	        # both genders.
	        if not (f_num >= threshold and m_num >= threshold):
	            continue
	        # Counts are non-negative, so `or` only replaces exact zeros and
	        # leaves every previously valid result unchanged.
	        f_count = f_num or very_small_value
	        non_f_num = (total_num_f - f_num) or very_small_value
	        non_m_num = (total_num_m - m_num) or very_small_value
	        ratios[key] = round((m_num / f_count) / (non_m_num / non_f_num), 2)

	    # Two separate stable sorts (rather than reversing one) so that ties
	    # keep their insertion order in both rankings.
	    most_male = sorted(ratios.items(), key=itemgetter(1), reverse=True)[:topk]
	    most_female = sorted(ratios.items(), key=itemgetter(1))[:topk]
	    return dict(most_male), dict(most_female)

	class Word_Extraction:
	    """Pull single-token words of selected parts of speech out of text.

	    Each instance loads the spaCy ``en_core_web_sm`` pipeline and builds a
	    token ``Matcher`` with one single-token pattern per requested word type.
	    """

	    # Supported word-type names and the spaCy POS tag each one matches.
	    POS_TAGS = {'noun': 'NOUN', 'adj': 'ADJ', 'verb': 'VERB'}

	    def __init__(self, word_types=None):
	        """Load the spaCy model and register the POS patterns.

	        Args:
	            word_types: Iterable of word-type names; recognised names are
	                ``'noun'``, ``'adj'`` and ``'verb'``. Unrecognised names
	                are ignored. ``None`` (the default) selects every
	                supported type; previously the default raised a
	                ``TypeError`` because ``None`` was iterated directly.
	        """
	        self.nlp = spacy.load("en_core_web_sm")
	        self.matcher = Matcher(self.nlp.vocab)

	        if word_types is None:
	            word_types = list(self.POS_TAGS)

	        patterns = [
	            [{'POS': self.POS_TAGS[word_type]}]
	            for word_type in word_types
	            if word_type in self.POS_TAGS
	        ]
	        self.matcher.add("demo", patterns)

	    def extract_word(self, doc):
	        """Return the text of every matcher hit in ``doc``.

	        Args:
	            doc: Raw text; it is run through ``self.nlp`` before matching.

	        Returns:
	            A list with the text of each matched span, in the order the
	            matcher reports the matches.
	        """
	        doc = self.nlp(doc)
	        return [doc[start:end].text for _match_id, start, end in self.matcher(doc)]

	def _normalize_word(raw):
	    """Clean one extracted word; return '' when nothing usable is left."""
	    pieces = raw.split()
	    if not pieces:
	        # Whitespace-only match: the original `split()[0]` raised IndexError.
	        return ''
	    return pieces[0].replace('<return>', '').replace('<return', '').strip(',./?').lower()


	def _extract_normalized(extractor, text):
	    """Run ``extractor`` on ``text`` and return its non-empty cleaned words."""
	    cleaned = (_normalize_word(word) for word in extractor.extract_word(text))
	    # Drop words that were nothing but '<return>' markers or punctuation,
	    # so an empty string is never counted as a noun or adjective.
	    return [word for word in cleaned if word]


	def compute_lexical_content(list1, list2, threshold=10):
	    """Summarise which nouns and adjectives skew male vs. female.

	    Nouns and adjectives are extracted from every text, cleaned (first
	    whitespace-separated piece, ``<return>`` markers removed, surrounding
	    ``,./?`` stripped, lower-cased), counted per group and ranked with
	    ``odds_ratio``.

	    Args:
	        list1: Texts for the female group.
	        list2: Texts for the male group.
	        threshold: Minimum number of occurrences a word needs in both
	            groups to be ranked (forwarded to ``odds_ratio``).

	    Returns:
	        A ``pandas.DataFrame`` indexed by ``'noun'`` and ``'adj'`` with the
	        columns ``'male'`` and ``'female'``. Each cell is a comma-separated
	        string of up to ten words from the corresponding ranking.
	    """
	    noun_extract = Word_Extraction(['noun'])
	    adj_extract = Word_Extraction(['adj'])

	    noun_f, adj_f = [], []
	    for text in tqdm(list1, ascii=True):
	        noun_f.extend(_extract_normalized(noun_extract, text))
	        adj_f.extend(_extract_normalized(adj_extract, text))

	    noun_m, adj_m = [], []
	    for text in tqdm(list2, ascii=True):
	        noun_m.extend(_extract_normalized(noun_extract, text))
	        adj_m.extend(_extract_normalized(adj_extract, text))

	    # odds_ratio returns (largest ratios, smallest ratios); the ratio is
	    # male over female, so the first result is the male-leaning ranking.
	    noun_counter_f, noun_counter_m = calculate_dict(noun_f, noun_m)
	    noun_res_m, noun_res_f = odds_ratio(noun_counter_f, noun_counter_m, threshold=threshold)
	    adj_counter_f, adj_counter_m = calculate_dict(adj_f, adj_m)
	    adj_res_m, adj_res_f = odds_ratio(adj_counter_f, adj_counter_m, threshold=threshold)

	    def top_words(ranking):
	        # Keep only the ten strongest entries for display.
	        return ", ".join(list(ranking)[:10])

	    data = {
	        'male': [top_words(noun_res_m), top_words(adj_res_m)],
	        'female': [top_words(noun_res_f), top_words(adj_res_f)],
	    }
	    return pd.DataFrame(data, index=['noun', 'adj'])