""" |
|
|
If you use the VADER sentiment analysis tools, please cite: |
|
|
|
|
|
Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for |
|
|
Sentiment Analysis of Social Media Text. Eighth International Conference on |
|
|
Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014. |
|
|
""" |
|
|
|
|
|
import math |
|
|
import re |
|
|
import string |
|
|
from itertools import product |
|
|
|
|
|
import nltk.data |
|
|
from nltk.util import pairwise |
|
|
|
|
|
|
|
|
class VaderConstants:
    """
    A class to keep the Vader lists and constants.
    """

    # Empirically derived mean sentiment intensity rating increase for
    # booster words (B_INCR) and decrease for dampener words (B_DECR).
    B_INCR = 0.293
    B_DECR = -0.293

    # Empirically derived mean sentiment intensity rating increase for using
    # ALLCAPs to emphasize a word.
    C_INCR = 0.733
    # Scalar applied to a word's valence when it is negated.
    N_SCALAR = -0.74

    NEGATE = {
        "aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt",
        "doesnt", "ain't", "aren't", "can't", "couldn't", "daren't", "didn't",
        "doesn't", "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt",
        "mustnt", "neither", "don't", "hadn't", "hasn't", "haven't", "isn't",
        "mightn't", "mustn't", "neednt", "needn't", "never", "none", "nope",
        "nor", "not", "nothing", "nowhere", "oughtnt", "shant", "shouldnt",
        "uhuh", "wasnt", "werent", "oughtn't", "shan't", "shouldn't", "uh-uh",
        "wasn't", "weren't", "without", "wont", "wouldnt", "won't",
        "wouldn't", "rarely", "seldom", "despite",
    }

    # Booster/dampener 'intensifiers' or 'degree adverbs' that affect
    # sentiment intensity.
    BOOSTER_DICT = {
        "absolutely": B_INCR,
        "amazingly": B_INCR,
        "awfully": B_INCR,
        "completely": B_INCR,
        "considerably": B_INCR,
        "decidedly": B_INCR,
        "deeply": B_INCR,
        "effing": B_INCR,
        "enormously": B_INCR,
        "entirely": B_INCR,
        "especially": B_INCR,
        "exceptionally": B_INCR,
        "extremely": B_INCR,
        "fabulously": B_INCR,
        "flipping": B_INCR,
        "flippin": B_INCR,
        "fricking": B_INCR,
        "frickin": B_INCR,
        "frigging": B_INCR,
        "friggin": B_INCR,
        "fully": B_INCR,
        "fucking": B_INCR,
        "greatly": B_INCR,
        "hella": B_INCR,
        "highly": B_INCR,
        "hugely": B_INCR,
        "incredibly": B_INCR,
        "intensely": B_INCR,
        "majorly": B_INCR,
        "more": B_INCR,
        "most": B_INCR,
        "particularly": B_INCR,
        "purely": B_INCR,
        "quite": B_INCR,
        "really": B_INCR,
        "remarkably": B_INCR,
        "so": B_INCR,
        "substantially": B_INCR,
        "thoroughly": B_INCR,
        "totally": B_INCR,
        "tremendously": B_INCR,
        "uber": B_INCR,
        "unbelievably": B_INCR,
        "unusually": B_INCR,
        "utterly": B_INCR,
        "very": B_INCR,
        "almost": B_DECR,
        "barely": B_DECR,
        "hardly": B_DECR,
        "just enough": B_DECR,
        "kind of": B_DECR,
        "kinda": B_DECR,
        "kindof": B_DECR,
        "kind-of": B_DECR,
        "less": B_DECR,
        "little": B_DECR,
        "marginally": B_DECR,
        "occasionally": B_DECR,
        "partly": B_DECR,
        "scarcely": B_DECR,
        "slightly": B_DECR,
        "somewhat": B_DECR,
        "sort of": B_DECR,
        "sorta": B_DECR,
        "sortof": B_DECR,
        "sort-of": B_DECR,
    }

    # Idioms whose overall sentiment differs from that of their component
    # words; a match overrides the word-level valence.
    SPECIAL_CASE_IDIOMS = {
        "the shit": 3,
        "the bomb": 3,
        "bad ass": 1.5,
        "yeah right": -2,
        "cut the mustard": 2,
        "kiss of death": -1.5,
        "hand to mouth": -2,
    }

    REGEX_REMOVE_PUNCTUATION = re.compile(f"[{re.escape(string.punctuation)}]")

    PUNC_LIST = [
        ".", "!", "?", ",", ";", ":", "-", "'", '"',
        "!!", "!!!", "??", "???", "?!?", "!?!", "?!?!", "!?!?",
    ]

    def __init__(self):
        pass

    def negated(self, input_words, include_nt=True):
        """
        Determine if input contains negation words.
        """
        neg_words = self.NEGATE
        if any(word.lower() in neg_words for word in input_words):
            return True
        if include_nt:
            if any("n't" in word.lower() for word in input_words):
                return True
        for first, second in pairwise(input_words):
            if second.lower() == "least" and first.lower() != "at":
                return True
        return False
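
    # Illustrative behavior: negated(["not", "good"]) is True,
    # negated(["good", "movie"]) is False, and "least" also counts as a
    # negation unless preceded by "at" (so ["the", "least", "fun"] is negated).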

    def normalize(self, score, alpha=15):
        """
        Normalize the score to be between -1 and 1 using an alpha that
        approximates the max expected value.
        """
        norm_score = score / math.sqrt((score * score) + alpha)
        return norm_score
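
    # Worked example: normalize(4) = 4 / sqrt(4 * 4 + 15) = 4 / sqrt(31)
    # ≈ 0.7184; the output approaches ±1 as |score| grows.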

    def scalar_inc_dec(self, word, valence, is_cap_diff):
        """
        Check if the preceding words increase, decrease, or negate/nullify the
        valence.
        """
        scalar = 0.0
        word_lower = word.lower()
        if word_lower in self.BOOSTER_DICT:
            scalar = self.BOOSTER_DICT[word_lower]
            if valence < 0:
                scalar *= -1
            # Check if booster/dampener word is in ALLCAPS (while others aren't).
            if word.isupper() and is_cap_diff:
                if valence > 0:
                    scalar += self.C_INCR
                else:
                    scalar -= self.C_INCR
        return scalar
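
    # Illustrative behavior: for a positive valence, "very" contributes
    # +0.293; if the booster is ALLCAPS amid mixed-case text ("VERY"),
    # C_INCR is added as well, giving 0.293 + 0.733 = 1.026.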


class SentiText:
    """
    Identify sentiment-relevant string-level properties of input text.
    """

    def __init__(self, text, punc_list, regex_remove_punctuation):
        if not isinstance(text, str):
            text = str(text.encode("utf-8"))
        self.text = text
        self.PUNC_LIST = punc_list
        self.REGEX_REMOVE_PUNCTUATION = regex_remove_punctuation
        self.words_and_emoticons = self._words_and_emoticons()
        # ALLCAPS emphasis only matters when some, but not all, words are
        # uppercased.
        self.is_cap_diff = self.allcap_differential(self.words_and_emoticons)

    def _words_plus_punc(self):
        """
        Returns mapping of form:
        {
            'cat,': 'cat',
            ',cat': 'cat',
        }
        """
        no_punc_text = self.REGEX_REMOVE_PUNCTUATION.sub("", self.text)
        # Remove single-character tokens.
        words_only = no_punc_text.split()
        words_only = {w for w in words_only if len(w) > 1}
        # Map each word with leading/trailing punctuation back to the bare word.
        punc_before = {"".join(p): p[1] for p in product(self.PUNC_LIST, words_only)}
        punc_after = {"".join(p): p[0] for p in product(words_only, self.PUNC_LIST)}
        words_punc_dict = punc_before
        words_punc_dict.update(punc_after)
        return words_punc_dict

    def _words_and_emoticons(self):
        """
        Removes leading and trailing punctuation.
        Leaves contractions and most emoticons.
        Does not preserve punc-plus-letter emoticons (e.g. :D).
        """
        wes = self.text.split()
        words_punc_dict = self._words_plus_punc()
        wes = [we for we in wes if len(we) > 1]
        for i, we in enumerate(wes):
            if we in words_punc_dict:
                wes[i] = words_punc_dict[we]
        return wes

    def allcap_differential(self, words):
        """
        Check whether just some words in the input are ALL CAPS.

        :param list words: The words to inspect
        :returns: `True` if some but not all items in `words` are ALL CAPS
        """
        is_different = False
        allcap_words = 0
        for word in words:
            if word.isupper():
                allcap_words += 1
        cap_differential = len(words) - allcap_words
        if 0 < cap_differential < len(words):
            is_different = True
        return is_different
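
    # Illustrative behavior: allcap_differential(["GREAT", "movie"]) is True
    # (some but not all words are ALL CAPS), while ["GREAT", "MOVIE"] and
    # ["great", "movie"] both give False.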


class SentimentIntensityAnalyzer:
    """
    Give a sentiment intensity score to sentences.
    """

    def __init__(
        self,
        lexicon_file="config/vader_lexicon.txt",
    ):
        self.lexicon_file = nltk.data.load(lexicon_file)
        self.lexicon = self.make_lex_dict()
        self.constants = VaderConstants()

    def make_lex_dict(self):
        """
        Convert lexicon file to a dictionary.
        """
        lex_dict = {}
        for line in self.lexicon_file.split("\n"):
            (word, measure) = line.strip().split("\t")[0:2]
            lex_dict[word] = float(measure)
        return lex_dict
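
    # Each lexicon line is tab-separated with the token first and its mean
    # valence second; the slice [0:2] ignores any further fields. For
    # example (illustrative values), a line "great\t3.1\t..." would map
    # "great" to 3.1.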

    def polarity_scores(self, text):
        """
        Return a dict of sentiment scores for the input text. Positive values
        indicate positive valence, negative values indicate negative valence,
        and the "compound" key holds the normalized, aggregated score.
        """
        sentitext = SentiText(
            text, self.constants.PUNC_LIST, self.constants.REGEX_REMOVE_PUNCTUATION
        )
        sentiments = []
        words_and_emoticons = sentitext.words_and_emoticons
        # enumerate keeps the correct index even when a token repeats
        # (list.index would always return the first occurrence).
        for i, item in enumerate(words_and_emoticons):
            valence = 0
            # "kind of" and booster words modify neighbors but carry no
            # valence of their own here.
            if (
                i < len(words_and_emoticons) - 1
                and item.lower() == "kind"
                and words_and_emoticons[i + 1].lower() == "of"
            ) or item.lower() in self.constants.BOOSTER_DICT:
                sentiments.append(valence)
                continue

            sentiments = self.sentiment_valence(valence, sentitext, item, i, sentiments)

        sentiments = self._but_check(words_and_emoticons, sentiments)

        return self.score_valence(sentiments, text)
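
    # Illustrative output shape: polarity_scores(...) returns a mapping with
    # keys "neg", "neu", and "pos" (proportions that sum to roughly 1) plus
    # "compound" in [-1, 1]; exact values depend on the loaded lexicon.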

    def sentiment_valence(self, valence, sentitext, item, i, sentiments):
        is_cap_diff = sentitext.is_cap_diff
        words_and_emoticons = sentitext.words_and_emoticons
        item_lowercase = item.lower()
        if item_lowercase in self.lexicon:
            # Get the sentiment valence from the lexicon.
            valence = self.lexicon[item_lowercase]

            # Check if sentiment-laden word is in ALL CAPS (while others aren't).
            if item.isupper() and is_cap_diff:
                if valence > 0:
                    valence += self.constants.C_INCR
                else:
                    valence -= self.constants.C_INCR

            # Look at up to three preceding words for boosters, dampeners,
            # and negations.
            for start_i in range(0, 3):
                if (
                    i > start_i
                    and words_and_emoticons[i - (start_i + 1)].lower()
                    not in self.lexicon
                ):
                    # Dampen the scalar modifier of preceding words and
                    # emoticons (excluding the ones that immediately precede
                    # the item) based on their distance from the current item.
                    s = self.constants.scalar_inc_dec(
                        words_and_emoticons[i - (start_i + 1)], valence, is_cap_diff
                    )
                    if start_i == 1 and s != 0:
                        s = s * 0.95
                    if start_i == 2 and s != 0:
                        s = s * 0.9
                    valence = valence + s
                    valence = self._never_check(
                        valence, words_and_emoticons, start_i, i
                    )
                    if start_i == 2:
                        valence = self._idioms_check(valence, words_and_emoticons, i)

            valence = self._least_check(valence, words_and_emoticons, i)

        sentiments.append(valence)
        return sentiments

    def _least_check(self, valence, words_and_emoticons, i):
        # Check for negation case using "least".
        if (
            i > 1
            and words_and_emoticons[i - 1].lower() not in self.lexicon
            and words_and_emoticons[i - 1].lower() == "least"
        ):
            if (
                words_and_emoticons[i - 2].lower() != "at"
                and words_and_emoticons[i - 2].lower() != "very"
            ):
                valence = valence * self.constants.N_SCALAR
        elif (
            i > 0
            and words_and_emoticons[i - 1].lower() not in self.lexicon
            and words_and_emoticons[i - 1].lower() == "least"
        ):
            valence = valence * self.constants.N_SCALAR
        return valence
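
    # Illustrative behavior: in "least funny", "least" scales the valence of
    # "funny" by N_SCALAR (-0.74); in "at least funny" the valence is left
    # alone, since "at least" is not treated as a negation.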

    def _but_check(self, words_and_emoticons, sentiments):
        words_and_emoticons = [w_e.lower() for w_e in words_and_emoticons]
        but = {"but"} & set(words_and_emoticons)
        if but:
            bi = words_and_emoticons.index(next(iter(but)))
            for sidx, sentiment in enumerate(sentiments):
                if sidx < bi:
                    sentiments[sidx] = sentiment * 0.5
                elif sidx > bi:
                    sentiments[sidx] = sentiment * 1.5
        return sentiments
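
    # Illustrative behavior: in "great but boring", the valence of "great"
    # (before "but") is halved and the valence of "boring" (after "but") is
    # multiplied by 1.5, so the clause after "but" dominates.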

    def _idioms_check(self, valence, words_and_emoticons, i):
        onezero = f"{words_and_emoticons[i - 1]} {words_and_emoticons[i]}"

        twoonezero = "{} {} {}".format(
            words_and_emoticons[i - 2],
            words_and_emoticons[i - 1],
            words_and_emoticons[i],
        )

        twoone = f"{words_and_emoticons[i - 2]} {words_and_emoticons[i - 1]}"

        threetwoone = "{} {} {}".format(
            words_and_emoticons[i - 3],
            words_and_emoticons[i - 2],
            words_and_emoticons[i - 1],
        )

        threetwo = "{} {}".format(
            words_and_emoticons[i - 3], words_and_emoticons[i - 2]
        )

        sequences = [onezero, twoonezero, twoone, threetwoone, threetwo]

        for seq in sequences:
            if seq in self.constants.SPECIAL_CASE_IDIOMS:
                valence = self.constants.SPECIAL_CASE_IDIOMS[seq]
                break

        if len(words_and_emoticons) - 1 > i:
            zeroone = f"{words_and_emoticons[i]} {words_and_emoticons[i + 1]}"
            if zeroone in self.constants.SPECIAL_CASE_IDIOMS:
                valence = self.constants.SPECIAL_CASE_IDIOMS[zeroone]
        if len(words_and_emoticons) - 1 > i + 1:
            zeroonetwo = "{} {} {}".format(
                words_and_emoticons[i],
                words_and_emoticons[i + 1],
                words_and_emoticons[i + 2],
            )
            if zeroonetwo in self.constants.SPECIAL_CASE_IDIOMS:
                valence = self.constants.SPECIAL_CASE_IDIOMS[zeroonetwo]

        # Check for booster/dampener bi-grams such as "sort of" or "kind of".
        if (
            threetwo in self.constants.BOOSTER_DICT
            or twoone in self.constants.BOOSTER_DICT
        ):
            valence = valence + self.constants.B_DECR
        return valence
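
    # Illustrative behavior: if the token window around position i matches a
    # special-case idiom, its fixed rating replaces the word-level valence,
    # e.g. "the shit" scores +3 and "yeah right" scores -2 regardless of the
    # component words.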

    def _never_check(self, valence, words_and_emoticons, start_i, i):
        if start_i == 0:
            if self.constants.negated([words_and_emoticons[i - 1]]):
                valence = valence * self.constants.N_SCALAR
        if start_i == 1:
            if words_and_emoticons[i - 2] == "never" and (
                words_and_emoticons[i - 1] == "so"
                or words_and_emoticons[i - 1] == "this"
            ):
                valence = valence * 1.5
            elif self.constants.negated([words_and_emoticons[i - (start_i + 1)]]):
                valence = valence * self.constants.N_SCALAR
        if start_i == 2:
            if (
                words_and_emoticons[i - 3] == "never"
                and (
                    words_and_emoticons[i - 2] == "so"
                    or words_and_emoticons[i - 2] == "this"
                )
                or (
                    words_and_emoticons[i - 1] == "so"
                    or words_and_emoticons[i - 1] == "this"
                )
            ):
                valence = valence * 1.25
            elif self.constants.negated([words_and_emoticons[i - (start_i + 1)]]):
                valence = valence * self.constants.N_SCALAR
        return valence
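
    # Illustrative behavior: "never so good" boosts the valence of "good" by
    # 1.5x (an intensified construction), while a plain preceding negation
    # such as "not good" multiplies the valence by N_SCALAR (-0.74).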

    def _punctuation_emphasis(self, sum_s, text):
        # Add emphasis from exclamation points and question marks.
        ep_amplifier = self._amplify_ep(text)
        qm_amplifier = self._amplify_qm(text)
        punct_emph_amplifier = ep_amplifier + qm_amplifier
        return punct_emph_amplifier

    def _amplify_ep(self, text):
        # Check for added emphasis resulting from exclamation points
        # (up to 4 of them).
        ep_count = text.count("!")
        if ep_count > 4:
            ep_count = 4
        # Empirically derived mean sentiment intensity rating increase for
        # exclamation points.
        ep_amplifier = ep_count * 0.292
        return ep_amplifier
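
    # Worked example: "Great!!!" contains three "!", so the amplifier is
    # 3 * 0.292 = 0.876; the cap at 4 bounds the boost at 4 * 0.292 = 1.168.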

    def _amplify_qm(self, text):
        # Check for added emphasis resulting from question marks (2 or 3+).
        qm_count = text.count("?")
        qm_amplifier = 0
        if qm_count > 1:
            if qm_count <= 3:
                # Empirically derived mean sentiment intensity rating increase
                # for question marks.
                qm_amplifier = qm_count * 0.18
            else:
                qm_amplifier = 0.96
        return qm_amplifier
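
    # Worked example: "??" gives 2 * 0.18 = 0.36, "???" gives 0.54, and four
    # or more question marks saturate at 0.96; a single "?" adds nothing.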

    def _sift_sentiment_scores(self, sentiments):
        # Want separate positive versus negative sentiment scores.
        pos_sum = 0.0
        neg_sum = 0.0
        neu_count = 0
        for sentiment_score in sentiments:
            if sentiment_score > 0:
                # Compensates for neutral words that are counted as 1.
                pos_sum += float(sentiment_score) + 1
            if sentiment_score < 0:
                # When used with math.fabs(), compensates for neutral words
                # that are counted as 1.
                neg_sum += float(sentiment_score) - 1
            if sentiment_score == 0:
                neu_count += 1
        return pos_sum, neg_sum, neu_count

    def score_valence(self, sentiments, text):
        if sentiments:
            sum_s = float(sum(sentiments))
            # Compute and add emphasis from punctuation in text.
            punct_emph_amplifier = self._punctuation_emphasis(sum_s, text)
            if sum_s > 0:
                sum_s += punct_emph_amplifier
            elif sum_s < 0:
                sum_s -= punct_emph_amplifier

            compound = self.constants.normalize(sum_s)
            # Discriminate between positive, negative and neutral sentiment
            # scores.
            pos_sum, neg_sum, neu_count = self._sift_sentiment_scores(sentiments)

            if pos_sum > math.fabs(neg_sum):
                pos_sum += punct_emph_amplifier
            elif pos_sum < math.fabs(neg_sum):
                neg_sum -= punct_emph_amplifier

            total = pos_sum + math.fabs(neg_sum) + neu_count
            pos = math.fabs(pos_sum / total)
            neg = math.fabs(neg_sum / total)
            neu = math.fabs(neu_count / total)

        else:
            compound = 0.0
            pos = 0.0
            neg = 0.0
            neu = 0.0

        sentiment_dict = {
            "neg": round(neg, 3),
            "neu": round(neu, 3),
            "pos": round(pos, 3),
            "compound": round(compound, 4),
        }

        return sentiment_dict
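

if __name__ == "__main__":
    # A minimal usage sketch, not part of the original module. It assumes the
    # VADER lexicon is available at the default "config/vader_lexicon.txt"
    # path resolvable by nltk.data.load().
    analyzer = SentimentIntensityAnalyzer()
    for sentence in [
        "VADER is smart, handsome, and funny!",
        "The plot was predictable, but the acting was AMAZING!!",
    ]:
        print(sentence, "->", analyzer.polarity_scores(sentence))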
|
|
|