| """ | |
| Author: Khanh Phan | |
| Date: 2024-12-04 | |
| """ | |
| import re | |
| import string | |
| from collections import Counter | |
| from difflib import SequenceMatcher | |
| from nltk.tokenize import ( | |
| sent_tokenize, | |
| word_tokenize, | |
| ) | |
| from nltk.util import ngrams | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from src.application.config import PREFIX | |


def clean_text(text: str) -> str:
    """
    Cleans and preprocesses a given text string.

    Args:
        text (str): The input text to be cleaned.

    Returns:
        str: The cleaned and preprocessed text, containing the first 18 words.
    """
    # Define a set of punctuation characters to exclude;
    # comma and period are kept because they appear in numbers.
    punctuations = r"""!"#$%&'()*+-/:;<=>?@[\]^_`{|}~"""

    # Lowercase the text.
    text = text.lower()

    # Remove punctuation.
    text = "".join([c for c in text if c not in punctuations])

    # Collapse whitespace and newlines.
    text = re.sub(r"\s+", " ", text)

    # Replace £ with * because Google search doesn't recognize £.
    # (str.replace returns a new string, so the result must be assigned.)
    text = text.replace("£", " * ")

    # Split the text into a list of words.
    words = text.split()

    # Join the first 18 words back into a string.
    text = " ".join(words[:18])  # TODO: consider another number

    return text
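
# Example usage (an illustrative sketch, not part of the original module):
#     >>> clean_text("Hello, World! It costs £5.")
#     'hello, world it costs * 5.'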


def remove_punctuation(text: str) -> str:
    """
    Removes all punctuation characters from a string, except for periods (.).

    Args:
        text (str): The input string.

    Returns:
        str: The string with all punctuation characters removed,
            except for periods.
    """
    # Build a string of all punctuation characters except the period.
    punctuation_without_dot = string.punctuation.replace(".", "")

    # Create a translation table that deletes those characters.
    translator = str.maketrans("", "", punctuation_without_dot)

    # Apply the translation table to the input text and return the result.
    return text.translate(translator)
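
# Example usage (an illustrative sketch):
#     >>> remove_punctuation("Hello, world! Ver. 2.0")
#     'Hello world Ver. 2.0'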


def get_keywords(text: str, num_keywords: int = 5) -> list[str]:
    """
    Extracts the top keywords from a document using the TF-IDF method.

    Args:
        text (str): The input text from which to extract keywords.
        num_keywords (int, optional): The number of top keywords to return.
            Defaults to 5.

    Returns:
        list: A list of the top keywords extracted from the text.
    """
    # Create a TF-IDF vectorizer that drops English stop words.
    vectorizer = TfidfVectorizer(stop_words="english")

    # Fit and transform the text.
    tfidf_matrix = vectorizer.fit_transform([text])

    # Get feature names (words).
    feature_names = vectorizer.get_feature_names_out()

    # Get TF-IDF scores.
    tfidf_scores = tfidf_matrix.toarray()[0]

    # Sort words by TF-IDF score in descending order.
    word_scores = list(zip(feature_names, tfidf_scores))
    word_scores.sort(key=lambda x: x[1], reverse=True)

    # Return the top keywords.
    return [word for word, score in word_scores[:num_keywords]]
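
# Example usage (an illustrative sketch; with a single document, TF-IDF
# reduces to term frequency, and ties keep alphabetical feature order
# because list.sort is stable):
#     >>> get_keywords("the cat sat on the mat with the cat", num_keywords=2)
#     ['cat', 'mat']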


def get_important_sentences(
    text: str,
    keywords: list[str],
    num_sentences: int = 3,
) -> list[str]:
    """
    Selects important sentences based on a list of keywords.

    Args:
        text (str): The input text.
        keywords (list[str]): List of important keywords.
        num_sentences (int): Number of sentences to return (default is 3).

    Returns:
        list: A list of important sentences.
    """
    # Split the text into sentences.
    sentences = [s for s in re.split(r"(?<=[.!?])\s+", text) if s]

    # Calculate the importance score for each sentence.
    sentence_scores = []
    for sentence in sentences:
        processed_sentence = clean_text(sentence)
        score = 0
        words = processed_sentence.lower().split()
        word_count = Counter(words)
        for keyword in keywords:
            if keyword.lower() in word_count:
                score += word_count[keyword.lower()]
        sentence_scores.append((sentence, score))

    # Sort sentences by their scores in descending order.
    sentence_scores.sort(key=lambda x: x[1], reverse=True)

    # Return the top N sentences.
    return [sentence for sentence, score in sentence_scores[:num_sentences]]
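
# Example usage (an illustrative sketch):
#     >>> get_important_sentences(
#     ...     "Cats sleep a lot. Dogs bark. Cats and dogs can be friends.",
#     ...     keywords=["cats"],
#     ...     num_sentences=1,
#     ... )
#     ['Cats sleep a lot.']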


def extract_important_phrases(
    text: str,
    keywords: list[str],
    phrase_length: int = 5,
) -> list[str]:
    """
    Extracts important phrases based on a list of keywords.
    The phrase length is auto-determined (the phrase_length argument is
    recalculated below), and selected phrases overlap by less than 20%.

    Args:
        text (str): The input text.
        keywords (list[str]): List of important keywords.
        phrase_length (int): Length of phrases to extract (default: 5 words).

    Returns:
        list: A list of important phrases.
    """
    # Tokenize the text into words.
    words = word_tokenize(text.lower())

    # Determine the phrase length (between 5 and 7 words).
    phrase_length = min(max(len(words) // 10, 5), 7)

    # Generate n-grams (phrases) from the text.
    phrases = list(ngrams(words, phrase_length))

    important_phrases = []
    used_indices = set()
    for i, phrase in enumerate(phrases):
        # Check whether the phrase contains any keyword.
        if any(keyword.lower() in phrase for keyword in keywords):
            # Keep the phrase only if it overlaps previously selected
            # phrases by less than 20% of the phrase length.
            if not any(abs(i - j) < phrase_length * 0.8 for j in used_indices):
                important_phrases.append(clean_text(" ".join(phrase)))
                used_indices.add(i)

    return important_phrases
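
# Example usage (an illustrative sketch; word_tokenize needs the NLTK
# "punkt" data, e.g. nltk.download("punkt")):
#     >>> extract_important_phrases(article_text, keywords=["climate"])
#     # Returns 5- to 7-word phrases from article_text (a hypothetical
#     # long string) that contain the keyword "climate".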


def extract_equal_text(
    text1: str,
    text2: str,
) -> tuple[list[dict], list[dict]]:
    """
    Extracts the indices of equal text segments between two strings.

    Args:
        text1 (str): The first input string.
        text2 (str): The second input string.

    Returns:
        tuple[list[dict], list[dict]]: Two lists of dictionaries,
            each with "start" and "end" keys:
            - the start and end word indices of equal segments in text1.
            - the start and end word indices of equal segments in text2.
    """
    def cleanup(text: str) -> str:
        """
        Cleans up a text string by converting it to lowercase
        and removing punctuation.

        Args:
            text (str): The input text.

        Returns:
            str: The cleaned text.
        """
        text = text.lower()
        text = text.translate(str.maketrans("", "", string.punctuation))
        return text

    # Clean and split the input texts into lists of words.
    split_text1 = cleanup(text1).split()
    split_text2 = cleanup(text2).split()

    # Create a SequenceMatcher object to compare the cleaned word lists.
    s = SequenceMatcher(None, split_text1, split_text2)

    equal_idx_1 = []
    equal_idx_2 = []
    for tag, i1, i2, j1, j2 in s.get_opcodes():
        if tag == "equal":
            # Append the start and end indices of the equal segment
            # to the respective lists.
            equal_idx_1.append({"start": i1, "end": i2})
            equal_idx_2.append({"start": j1, "end": j2})
            # Debugging aid:
            # subtext_1 = " ".join(text1.split()[i1:i2])
            # subtext_2 = " ".join(text2.split()[j1:j2])
            # print(f"{tag:7} a[{i1:2}:{i2:2}] --> b[{j1:2}:{j2:2}] "
            #       f"{subtext_1!r:>55} --> {subtext_2!r}")
    return equal_idx_1, equal_idx_2
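
# Example usage (an illustrative sketch):
#     >>> extract_equal_text("The quick brown fox.", "A quick brown dog.")
#     ([{'start': 1, 'end': 3}], [{'start': 1, 'end': 3}])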


def connect_consecutive_indexes(nums: list[int]) -> list[list[int]]:
    """
    Connects consecutive integers in a list.

    Args:
        nums (list): A list of integers.

    Returns:
        list: A list of [start, end] pairs,
            where each pair represents a consecutive range.
            For example: [1, 2, 3, 5, 6] becomes [[1, 3], [5, 6]].
    """
    if not nums:  # Handle empty input.
        return []

    result = []
    start = nums[0]
    end = nums[0]

    for i in range(1, len(nums)):
        # Check if the current number is consecutive to the previous end.
        if nums[i] == end + 1:
            end = nums[i]  # Extend the current range.
        else:
            # Add the current range to the result and start a new range.
            result.append([start, end])
            start = nums[i]
            end = nums[i]

    # Add the last range to the result.
    result.append([start, end])
    return result
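
# Example usage (taken from the docstring):
#     >>> connect_consecutive_indexes([1, 2, 3, 5, 6])
#     [[1, 3], [5, 6]]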


def postprocess_label(labels: list[str]) -> str:
    """
    Creates a label string with the format "[label1] and [label2] and ..."
    (used in messages such as "Partially generated by [label1] and
    [label2]"). Removes duplicate labels while preserving the original order.

    Args:
        labels: A list of strings representing labels.

    Returns:
        A string with the formatted label.
    """
    # Strip the configured prefix from each label.
    labels = [
        label.replace(PREFIX, "") if PREFIX in label else label
        for label in labels
    ]

    # Remove duplicates while preserving order
    # (a plain set() would lose the original order).
    labels = list(dict.fromkeys(labels))

    label = ""
    if len(labels) == 1:
        label += labels[0]
    elif len(labels) == 2:
        label += f"{labels[0]} and {labels[1]}"
    else:
        combination = ", ".join(labels[:-1])
        label += f"{combination}, and {labels[-1]}"
    return label
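
# Example usage (an illustrative sketch; PREFIX is imported from
# src.application.config and assumed here to be "MACHINE-"):
#     >>> postprocess_label(
#     ...     ["MACHINE-GPT-4o", "MACHINE-Gemini", "MACHINE-GPT-4o"])
#     'GPT-4o and Gemini'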


def split_into_sentences(input_text: str) -> list[str]:
    """
    Splits input text into paragraphs by newlines
    and then tokenizes each paragraph into sentences.

    Args:
        input_text (str): The input text as a string.

    Returns:
        list: A list of sentences.
            Returns an empty list if input is not a string.
    """
    if not isinstance(input_text, str):
        return []

    # Split the input text into paragraphs based on newline characters,
    # keeping the newline characters.
    paragraphs = input_text.splitlines(keepends=True)

    sentences = []
    for paragraph in paragraphs:
        # Remove leading/trailing whitespace; skip empty paragraphs.
        paragraph = paragraph.strip()
        if paragraph:
            # Tokenize the paragraph into sentences.
            sentences.extend(sent_tokenize(paragraph))
    return sentences
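
# Example usage (an illustrative sketch; sent_tokenize needs the NLTK
# "punkt" data):
#     >>> split_into_sentences("First sentence. Second one.\nNew paragraph.")
#     ['First sentence.', 'Second one.', 'New paragraph.']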


def split_into_paragraphs(input_text: str) -> list[str]:
    """
    Splits input text into paragraphs based on newline characters.

    Args:
        input_text (str): The input text as a string.

    Returns:
        list: A list of paragraphs.
            Returns an empty list if input is not a string.
    """
    if not isinstance(input_text, str):
        return []

    # Split the input text into paragraphs based on newline characters,
    # keeping the newline characters.
    paragraphs = input_text.splitlines(keepends=True)

    out_paragraphs = []
    for paragraph in paragraphs:
        # Keep the paragraph's original whitespace;
        # skip empty lines and bare newlines.
        if paragraph and paragraph != "\n":
            out_paragraphs.append(paragraph)
    return out_paragraphs
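
# Example usage (an illustrative sketch; trailing newlines are kept):
#     >>> split_into_paragraphs("Para one.\n\nPara two.\n")
#     ['Para one.\n', 'Para two.\n']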


def extract_starts_ends(
    colored_idx: list[dict],
) -> tuple[list[int], list[int]]:
    """
    Extracts start and end indices from a list of dictionaries.

    Args:
        colored_idx (list[dict]): A list of dictionaries,
            where each dictionary has 'start' and 'end' keys.

    Returns:
        tuple: A tuple containing two lists:
            - starts (list[int]): A list of start indices.
            - ends (list[int]): A list of end indices.
    """
    starts = []
    ends = []
    for index in colored_idx:
        starts.append(index["start"])
        ends.append(index["end"])
    return starts, ends
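
# Example usage (an illustrative sketch):
#     >>> extract_starts_ends([{"start": 0, "end": 3}, {"start": 5, "end": 7}])
#     ([0, 5], [3, 7])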


def filter_indices(
    starts: list[int],
    ends: list[int],
    ignore_indices: list[int],
) -> tuple[list[int], list[int]]:
    """
    Filters start and end index ranges to exclude any indices present
    in the ignore_indices list. End indices are exclusive.

    Args:
        starts (list[int]): A list of starting indices.
        ends (list[int]): A list of ending indices (exclusive).
            Must be the same length as starts.
        ignore_indices (list[int]): A list of indices to exclude.

    Returns:
        A tuple of two lists of integers:
            - filtered_starts
            - filtered_ends
        Returns empty lists if the input is invalid
        or if all ranges are filtered out.

    Examples:
        starts = [0, 5, 10]
        ends = [3, 7, 12]  # ends are exclusive: index 12 is not colored.
        ignore_indices = [1, 2, 12, 17]
        # Output:
        starts = [0, 5, 10]
        ends = [1, 7, 12]
    """
    if len(starts) != len(ends):
        print(
            "Error: The 'starts' & 'ends' lists must have the same length.",
        )
        return [], []

    filtered_starts = []
    filtered_ends = []
    for i in range(len(starts)):
        start = starts[i]
        end = ends[i]
        if end < start:
            print(
                f"Error: End index {end} < start index {start} at position {i}.",  # noqa: E501
            )
            return [], []

        # Split the range around the ignored indices.
        new_start, new_end = extract_new_startend(
            start,
            end,
            ignore_indices,
        )
        filtered_starts.extend(new_start)
        filtered_ends.extend(new_end)

    return filtered_starts, filtered_ends
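
# Example usage (matches the docstring example above):
#     >>> filter_indices([0, 5, 10], [3, 7, 12], [1, 2, 12, 17])
#     ([0, 5, 10], [1, 7, 12])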


def replace_leading_spaces(text: str) -> str | None:
    """
    Replaces leading spaces in a string with '&nbsp;'.

    Args:
        text: The input string.

    Returns:
        The string with leading spaces replaced by '&nbsp;',
        or None if the input is None.
    """
    if text is None:
        return None

    # Count the leading spaces.
    leading_spaces = len(text) - len(text.lstrip(" "))

    if leading_spaces > 0:
        # NOTE: the "&nbsp;" literal is an assumption (the original was
        # lost to HTML decoding); it keeps leading indentation visible
        # when the text is rendered as HTML.
        return "&nbsp;" * leading_spaces + text[leading_spaces:]
    return text
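
# Example usage (an illustrative sketch, assuming the "&nbsp;" literal):
#     >>> replace_leading_spaces("  indented line")
#     '&nbsp;&nbsp;indented line'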


def extract_new_startend(
    start: int,
    end: int,
    ignore_indices: list[int],
) -> tuple[list[int], list[int]]:
    """
    Extracts new start and end indices by splitting a range based on
    ignored indices.

    Args:
        start (int): The starting index of the range.
        end (int): The ending index of the range (exclusive).
        ignore_indices (list): Indices to ignore within the range.

    Returns:
        tuple: A tuple containing two lists:
            - new_starts (list): Starting indices for the sub-ranges.
            - new_ends (list): Ending indices for the sub-ranges (exclusive).
    """
    # Deduplicate and sort the ignored indices in ascending order.
    indexes = sorted(set(ignore_indices))

    new_starts = []
    new_ends = []

    # If there are no indices to ignore, return the original range.
    if not indexes:
        new_starts.append(start)
        new_ends.append(end)
        return new_starts, new_ends

    new_start = start
    for index in indexes:
        # Skip indices that are outside the range [start, end).
        if index < start or index >= end:
            continue
        # Close the current sub-range unless it would be empty.
        if index > new_start:
            new_starts.append(new_start)
            new_ends.append(index)
        new_start = index + 1

    # Add the final sub-range unless it would be empty.
    if end > new_start:
        new_starts.append(new_start)
        new_ends.append(end)
    return new_starts, new_ends
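
# Example usage (an illustrative sketch; ends are exclusive, and the
# adjacent ignored indices 3 and 4 merge into a single gap):
#     >>> extract_new_startend(0, 10, [3, 4, 7])
#     ([0, 5, 8], [3, 7, 10])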