| """ | |
| Author: Khanh Phan | |
| Date: 2024-12-04 | |
| """ | |
| import re | |
| import string | |
| from collections import Counter | |
| from difflib import SequenceMatcher | |
| from nltk.tokenize import ( | |
| sent_tokenize, | |
| word_tokenize, | |
| ) | |
| from nltk.util import ngrams | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from src.application.config import PREFIX | |


def clean_text(text: str) -> str:
    """
    Cleans and preprocesses a given text string.

    Args:
        text (str): The input text to be cleaned.

    Returns:
        str: The cleaned and preprocessed text, containing the first 18 words.
    """
    # Define a set of punctuation characters to exclude;
    # comma and period are kept because they appear in numbers.
    punctuations = r"""!"#$%&'()*+-/:;<=>?@[\]^_`{|}~"""

    # Lowercase the text.
    text = text.lower()

    # Remove punctuation.
    text = "".join([c for c in text if c not in punctuations])

    # Collapse whitespace and newlines.
    text = re.sub(r"\s+", " ", text)

    # Replace £ with * because Google search doesn't recognize £.
    # (str.replace returns a new string, so the result must be assigned.)
    text = text.replace("£", " * ")

    # Split the text into a list of words.
    words = text.split()

    # Join the first 18 words back into a string.
    text = " ".join(words[:18])  # TODO: consider another number

    return text
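
# Example usage (an illustrative sketch, not part of the original module):
#     >>> clean_text("Hello, World! It costs £5.")
#     'hello, world it costs * 5.'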


def remove_punctuation(text: str) -> str:
    """
    Removes all punctuation characters from a string, except for periods (.).

    Args:
        text (str): The input string.

    Returns:
        str: The string with all punctuation characters removed,
            except for periods.
    """
    # Build a string of all punctuation characters except the period.
    punctuation_without_dot = string.punctuation.replace(".", "")

    # Create a translation table that deletes those characters.
    translator = str.maketrans("", "", punctuation_without_dot)

    # Apply the translation table to the input text and return the result.
    return text.translate(translator)
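
# Example usage (an illustrative sketch):
#     >>> remove_punctuation("Hello, world! Ver. 2.0")
#     'Hello world Ver. 2.0'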


def get_keywords(text: str, num_keywords: int = 5) -> list[str]:
    """
    Extracts the top keywords from a document using the TF-IDF method.

    Args:
        text (str): The input text from which to extract keywords.
        num_keywords (int, optional): The number of top keywords to return.
            Defaults to 5.

    Returns:
        list: A list of the top keywords extracted from the text.
    """
    # Create a TF-IDF vectorizer that drops English stop words.
    vectorizer = TfidfVectorizer(stop_words="english")

    # Fit and transform the text.
    tfidf_matrix = vectorizer.fit_transform([text])

    # Get feature names (words).
    feature_names = vectorizer.get_feature_names_out()

    # Get TF-IDF scores.
    tfidf_scores = tfidf_matrix.toarray()[0]

    # Sort words by TF-IDF score in descending order.
    word_scores = list(zip(feature_names, tfidf_scores))
    word_scores.sort(key=lambda x: x[1], reverse=True)

    # Return the top keywords.
    return [word for word, score in word_scores[:num_keywords]]
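
# Example usage (an illustrative sketch; with a single document, TF-IDF
# reduces to term frequency, and ties keep alphabetical feature order
# because list.sort is stable):
#     >>> get_keywords("the cat sat on the mat with the cat", num_keywords=2)
#     ['cat', 'mat']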


def get_important_sentences(
    text: str,
    keywords: list[str],
    num_sentences: int = 3,
) -> list[str]:
    """
    Selects important sentences based on a list of keywords.

    Args:
        text (str): The input text.
        keywords (list[str]): List of important keywords.
        num_sentences (int): Number of sentences to return (default is 3).

    Returns:
        list: A list of important sentences.
    """
    # Split the text into sentences.
    sentences = [s for s in re.split(r"(?<=[.!?])\s+", text) if s]

    # Calculate the importance score for each sentence.
    sentence_scores = []
    for sentence in sentences:
        processed_sentence = clean_text(sentence)
        score = 0
        words = processed_sentence.lower().split()
        word_count = Counter(words)
        for keyword in keywords:
            if keyword.lower() in word_count:
                score += word_count[keyword.lower()]
        sentence_scores.append((sentence, score))

    # Sort sentences by their scores in descending order.
    sentence_scores.sort(key=lambda x: x[1], reverse=True)

    # Return the top N sentences.
    return [sentence for sentence, score in sentence_scores[:num_sentences]]
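
# Example usage (an illustrative sketch):
#     >>> get_important_sentences(
#     ...     "Cats sleep a lot. Dogs bark. Cats and dogs can be friends.",
#     ...     keywords=["cats"],
#     ...     num_sentences=1,
#     ... )
#     ['Cats sleep a lot.']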


def extract_important_phrases(
    text: str,
    keywords: list[str],
    phrase_length: int = 5,
) -> list[str]:
    """
    Extracts important phrases based on a list of keywords.
    The phrase length is auto-determined (the phrase_length argument is
    recalculated below), and selected phrases overlap by less than 20%.

    Args:
        text (str): The input text.
        keywords (list[str]): List of important keywords.
        phrase_length (int): Length of phrases to extract (default: 5 words).

    Returns:
        list: A list of important phrases.
    """
    # Tokenize the text into words.
    words = word_tokenize(text.lower())

    # Determine the phrase length (between 5 and 7 words).
    phrase_length = min(max(len(words) // 10, 5), 7)

    # Generate n-grams (phrases) from the text.
    phrases = list(ngrams(words, phrase_length))

    important_phrases = []
    used_indices = set()
    for i, phrase in enumerate(phrases):
        # Check whether the phrase contains any keyword.
        if any(keyword.lower() in phrase for keyword in keywords):
            # Keep the phrase only if it overlaps previously selected
            # phrases by less than 20% of the phrase length.
            if not any(abs(i - j) < phrase_length * 0.8 for j in used_indices):
                important_phrases.append(clean_text(" ".join(phrase)))
                used_indices.add(i)

    return important_phrases
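
# Example usage (an illustrative sketch; word_tokenize needs the NLTK
# "punkt" data, e.g. nltk.download("punkt")):
#     >>> extract_important_phrases(article_text, keywords=["climate"])
#     # Returns 5- to 7-word phrases from article_text (a hypothetical
#     # long string) that contain the keyword "climate".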


def extract_equal_text(
    text1: str,
    text2: str,
) -> tuple[list[dict], list[dict]]:
    """
    Extracts the indices of equal text segments between two strings.

    Args:
        text1 (str): The first input string.
        text2 (str): The second input string.

    Returns:
        tuple[list[dict], list[dict]]: Two lists of dictionaries,
            each with "start" and "end" keys:
            - the start and end word indices of equal segments in text1.
            - the start and end word indices of equal segments in text2.
    """
    def cleanup(text: str) -> str:
        """
        Cleans up a text string by converting it to lowercase
        and removing punctuation.

        Args:
            text (str): The input text.

        Returns:
            str: The cleaned text.
        """
        text = text.lower()
        text = text.translate(str.maketrans("", "", string.punctuation))
        return text

    # Clean and split the input texts into lists of words.
    split_text1 = cleanup(text1).split()
    split_text2 = cleanup(text2).split()

    # Create a SequenceMatcher object to compare the cleaned word lists.
    s = SequenceMatcher(None, split_text1, split_text2)

    equal_idx_1 = []
    equal_idx_2 = []
    for tag, i1, i2, j1, j2 in s.get_opcodes():
        if tag == "equal":
            # Append the start and end indices of the equal segment
            # to the respective lists.
            equal_idx_1.append({"start": i1, "end": i2})
            equal_idx_2.append({"start": j1, "end": j2})
            # Debugging aid:
            # subtext_1 = " ".join(text1.split()[i1:i2])
            # subtext_2 = " ".join(text2.split()[j1:j2])
            # print(f"{tag:7} a[{i1:2}:{i2:2}] --> b[{j1:2}:{j2:2}] "
            #       f"{subtext_1!r:>55} --> {subtext_2!r}")
    return equal_idx_1, equal_idx_2
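
# Example usage (an illustrative sketch):
#     >>> extract_equal_text("The quick brown fox.", "A quick brown dog.")
#     ([{'start': 1, 'end': 3}], [{'start': 1, 'end': 3}])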


def connect_consecutive_indexes(nums: list[int]) -> list[list[int]]:
    """
    Connects consecutive integers in a list.

    Args:
        nums (list): A list of integers.

    Returns:
        list: A list of [start, end] pairs,
            where each pair represents a consecutive range.
            For example: [1, 2, 3, 5, 6] becomes [[1, 3], [5, 6]].
    """
    if not nums:  # Handle empty input.
        return []

    result = []
    start = nums[0]
    end = nums[0]

    for i in range(1, len(nums)):
        # Check if the current number is consecutive to the previous end.
        if nums[i] == end + 1:
            end = nums[i]  # Extend the current range.
        else:
            # Add the current range to the result and start a new range.
            result.append([start, end])
            start = nums[i]
            end = nums[i]

    # Add the last range to the result.
    result.append([start, end])
    return result
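
# Example usage (taken from the docstring):
#     >>> connect_consecutive_indexes([1, 2, 3, 5, 6])
#     [[1, 3], [5, 6]]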


def postprocess_label(labels: list[str]) -> str:
    """
    Creates a label string with the format "[label1] and [label2] and ..."
    (used in messages such as "Partially generated by [label1] and
    [label2]"). Removes duplicate labels while preserving the original order.

    Args:
        labels: A list of strings representing labels.

    Returns:
        A string with the formatted label.
    """
    # Strip the configured prefix from each label.
    labels = [
        label.replace(PREFIX, "") if PREFIX in label else label
        for label in labels
    ]

    # Remove duplicates while preserving order
    # (a plain set() would lose the original order).
    labels = list(dict.fromkeys(labels))

    label = ""
    if len(labels) == 1:
        label += labels[0]
    elif len(labels) == 2:
        label += f"{labels[0]} and {labels[1]}"
    else:
        combination = ", ".join(labels[:-1])
        label += f"{combination}, and {labels[-1]}"
    return label
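
# Example usage (an illustrative sketch; PREFIX is imported from
# src.application.config and assumed here to be "MACHINE-"):
#     >>> postprocess_label(
#     ...     ["MACHINE-GPT-4o", "MACHINE-Gemini", "MACHINE-GPT-4o"])
#     'GPT-4o and Gemini'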


def split_into_sentences(input_text: str) -> list[str]:
    """
    Splits input text into paragraphs by newlines
    and then tokenizes each paragraph into sentences.

    Args:
        input_text (str): The input text as a string.

    Returns:
        list: A list of sentences.
            Returns an empty list if input is not a string.
    """
    if not isinstance(input_text, str):
        return []

    # Split the input text into paragraphs based on newline characters,
    # keeping the newline characters.
    paragraphs = input_text.splitlines(keepends=True)

    sentences = []
    for paragraph in paragraphs:
        # Remove leading/trailing whitespace; skip empty paragraphs.
        paragraph = paragraph.strip()
        if paragraph:
            # Tokenize the paragraph into sentences.
            sentences.extend(sent_tokenize(paragraph))
    return sentences
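
# Example usage (an illustrative sketch; sent_tokenize needs the NLTK
# "punkt" data):
#     >>> split_into_sentences("First sentence. Second one.\nNew paragraph.")
#     ['First sentence.', 'Second one.', 'New paragraph.']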


def split_into_paragraphs(input_text: str) -> list[str]:
    """
    Splits input text into paragraphs based on newline characters.

    Args:
        input_text (str): The input text as a string.

    Returns:
        list: A list of paragraphs.
            Returns an empty list if input is not a string.
    """
    if not isinstance(input_text, str):
        return []

    # Split the input text into paragraphs based on newline characters,
    # keeping the newline characters.
    paragraphs = input_text.splitlines(keepends=True)

    out_paragraphs = []
    for paragraph in paragraphs:
        # Keep the paragraph's original whitespace;
        # skip empty lines and bare newlines.
        if paragraph and paragraph != "\n":
            out_paragraphs.append(paragraph)
    return out_paragraphs
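
# Example usage (an illustrative sketch; trailing newlines are kept):
#     >>> split_into_paragraphs("Para one.\n\nPara two.\n")
#     ['Para one.\n', 'Para two.\n']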


def extract_starts_ends(
    colored_idx: list[dict],
) -> tuple[list[int], list[int]]:
    """
    Extracts start and end indices from a list of dictionaries.

    Args:
        colored_idx (list[dict]): A list of dictionaries,
            where each dictionary has 'start' and 'end' keys.

    Returns:
        tuple: A tuple containing two lists:
            - starts (list[int]): A list of start indices.
            - ends (list[int]): A list of end indices.
    """
    starts = []
    ends = []
    for index in colored_idx:
        starts.append(index["start"])
        ends.append(index["end"])
    return starts, ends
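
# Example usage (an illustrative sketch):
#     >>> extract_starts_ends([{"start": 0, "end": 3}, {"start": 5, "end": 7}])
#     ([0, 5], [3, 7])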


def filter_indices(
    starts: list[int],
    ends: list[int],
    ignore_indices: list[int],
) -> tuple[list[int], list[int]]:
    """
    Filters start and end index ranges to exclude any indices present
    in the ignore_indices list. End indices are exclusive.

    Args:
        starts (list[int]): A list of starting indices.
        ends (list[int]): A list of ending indices (exclusive).
            Must be the same length as starts.
        ignore_indices (list[int]): A list of indices to exclude.

    Returns:
        A tuple of two lists of integers:
            - filtered_starts
            - filtered_ends
        Returns empty lists if the input is invalid
        or if all ranges are filtered out.

    Examples:
        starts = [0, 5, 10]
        ends = [3, 7, 12]  # ends are exclusive: index 12 is not colored.
        ignore_indices = [1, 2, 12, 17]
        # Output:
        starts = [0, 5, 10]
        ends = [1, 7, 12]
    """
    if len(starts) != len(ends):
        print(
            "Error: The 'starts' & 'ends' lists must have the same length.",
        )
        return [], []

    filtered_starts = []
    filtered_ends = []
    for i in range(len(starts)):
        start = starts[i]
        end = ends[i]
        if end < start:
            print(
                f"Error: End index {end} < start index {start} at position {i}.",  # noqa: E501
            )
            return [], []

        # Split the range around the ignored indices.
        new_start, new_end = extract_new_startend(
            start,
            end,
            ignore_indices,
        )
        filtered_starts.extend(new_start)
        filtered_ends.extend(new_end)

    return filtered_starts, filtered_ends
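
# Example usage (matches the docstring example above):
#     >>> filter_indices([0, 5, 10], [3, 7, 12], [1, 2, 12, 17])
#     ([0, 5, 10], [1, 7, 12])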


def replace_leading_spaces(text: str) -> str | None:
    """
    Replaces leading spaces in a string with '&nbsp;'.

    Args:
        text: The input string.

    Returns:
        The string with leading spaces replaced by '&nbsp;',
        or None if the input is None.
    """
    if text is None:
        return None

    # Count the leading spaces.
    leading_spaces = len(text) - len(text.lstrip(" "))

    if leading_spaces > 0:
        # NOTE: the "&nbsp;" literal is an assumption (the original was
        # lost to HTML decoding); it keeps leading indentation visible
        # when the text is rendered as HTML.
        return "&nbsp;" * leading_spaces + text[leading_spaces:]
    return text
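
# Example usage (an illustrative sketch, assuming the "&nbsp;" literal):
#     >>> replace_leading_spaces("  indented line")
#     '&nbsp;&nbsp;indented line'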


def extract_new_startend(
    start: int,
    end: int,
    ignore_indices: list[int],
) -> tuple[list[int], list[int]]:
    """
    Extracts new start and end indices by splitting a range based on
    ignored indices.

    Args:
        start (int): The starting index of the range.
        end (int): The ending index of the range (exclusive).
        ignore_indices (list): Indices to ignore within the range.

    Returns:
        tuple: A tuple containing two lists:
            - new_starts (list): Starting indices for the sub-ranges.
            - new_ends (list): Ending indices for the sub-ranges (exclusive).
    """
    # Deduplicate and sort the ignored indices in ascending order.
    indexes = sorted(set(ignore_indices))

    new_starts = []
    new_ends = []

    # If there are no indices to ignore, return the original range.
    if not indexes:
        new_starts.append(start)
        new_ends.append(end)
        return new_starts, new_ends

    new_start = start
    for index in indexes:
        # Skip indices that are outside the range [start, end).
        if index < start or index >= end:
            continue
        # Close the current sub-range unless it would be empty.
        if index > new_start:
            new_starts.append(new_start)
            new_ends.append(index)
        new_start = index + 1

    # Add the final sub-range unless it would be empty.
    if end > new_start:
        new_starts.append(new_start)
        new_ends.append(end)
    return new_starts, new_ends
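
# Example usage (an illustrative sketch; ends are exclusive, and the
# adjacent ignored indices 3 and 4 merge into a single gap):
#     >>> extract_new_startend(0, 10, [3, 4, 7])
#     ([0, 5, 8], [3, 7, 10])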