| """ | |
| Author: Khanh Phan | |
| Date: 2024-12-04 | |
| """ | |
| import string | |
| from collections import Counter | |
| import requests | |
| from nltk.corpus import stopwords | |
| from nltk.tokenize import word_tokenize | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from src.application.config import ( | |
| CHUNK_SIZE, | |
| GOOGLE_API_KEY, | |
| GOOGLE_ENDPOINT_URL, | |
| NUM_CHUNKS, | |
| NUM_FREQUENT_WORDS, | |
| NUM_KEYWORDS, | |
| SEARCH_ENGINE_ID, | |
| STOPWORDS_LANG, | |
| TOP_SEARCH_RESUTLS, | |
| ) | |
| from src.application.text.entity import extract_entities | |

def search_by_google(
    query,
    num_results=TOP_SEARCH_RESUTLS,
    is_exact_terms=False,
) -> dict:
    """
    Performs a Google Custom Search API query.

    Args:
        query (str): The search query string.
        num_results (int, optional): The number of search results to return.
            Defaults to TOP_SEARCH_RESUTLS.
        is_exact_terms (bool, optional): Whether to run an exact-terms search.
            Defaults to False.

    Returns:
        dict: JSON response from the Google Custom Search API,
            or None if an error occurs.
    """
    params = {
        "key": GOOGLE_API_KEY,
        "cx": SEARCH_ENGINE_ID,
        "num": num_results,
    }
    if is_exact_terms:
        params["exactTerms"] = query
    else:
        params["q"] = query.replace('"', "")

    response = requests.get(GOOGLE_ENDPOINT_URL, params=params)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error: {response.status_code}, {response.text}")
        return None
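
# Usage sketch for search_by_google (illustrative only; assumes GOOGLE_API_KEY,
# SEARCH_ENGINE_ID, and GOOGLE_ENDPOINT_URL are configured in
# src.application.config; the query string is made up):
#
#     results = search_by_google("sea level rise report", num_results=5)
#     if results is not None:
#         for item in results.get("items", []):
#             print(item.get("title"), item.get("link"))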

def get_most_frequent_words(
    input_text: str,
    number_word: int = NUM_FREQUENT_WORDS,
) -> str:
    """
    Extracts the most frequent words from the input text
    and forms a search phrase.

    Args:
        input_text (str): The text from which to extract frequent words.
        number_word (int, optional): The number of frequent words to extract.

    Returns:
        str: A search phrase consisting of the most frequent words,
            or None if the input text is invalid.
    """
    # Check if the input text is valid
    if not isinstance(input_text, str) or not input_text:
        return None

    # Tokenize the input text into words and convert to lowercase
    words = word_tokenize(input_text.lower())

    # Get the set of stop words for the specified language
    stop_words = set(stopwords.words(STOPWORDS_LANG))

    # Get the set of punctuation characters
    punctuation = set(string.punctuation)

    # Filter out stop words, punctuation, and non-alphanumeric words
    filtered_words = [
        word
        for word in words
        if word.isalnum()
        and word not in stop_words
        and word not in punctuation
    ]

    # Count the frequency of each filtered word
    word_frequencies = Counter(filtered_words)

    # Get the most common words and their frequencies
    top_words = word_frequencies.most_common(number_word)

    # Keep only the words themselves, discarding their counts
    frequent_words = [word for word, _ in top_words]

    # Construct the search phrase from the most frequent words
    search_phrase = " ".join(frequent_words)

    return search_phrase
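
# Example for get_most_frequent_words (hypothetical output; requires the NLTK
# tokenizer and stopword data, and depends on STOPWORDS_LANG):
#
#     get_most_frequent_words(
#         "The cat sat on the mat. The cat slept on the mat.", number_word=2
#     )
#     # -> "cat mat"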

def get_chunk(
    input_text: str,
    chunk_size: int = CHUNK_SIZE,
    num_chunk: int = NUM_CHUNKS,
) -> list[str]:
    """
    Splits the input text into chunks of a specified size.

    Args:
        input_text (str): The text to be chunked.
        chunk_size (int, optional): The number of words per chunk.
        num_chunk (int, optional): The number of chunks to generate.

    Returns:
        list: A list of chunks of the input text.
    """
    if not isinstance(input_text, str):
        return []

    chunks = []
    input_words = input_text.split()  # Split by any whitespace

    for i in range(num_chunk):
        # Calculate the start and end indices for the current chunk
        start_index = i * chunk_size
        end_index = (i + 1) * chunk_size

        # Extract the words for the current chunk and join them into a string
        chunk = " ".join(input_words[start_index:end_index])
        if chunk:  # Only append non-empty chunks
            chunks.append(chunk)

    return chunks
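
# Example for get_chunk (illustrative; chunking is purely positional over the
# whitespace-split words, so trailing words beyond num_chunk * chunk_size are dropped):
#
#     get_chunk("one two three four five six seven", chunk_size=3, num_chunk=2)
#     # -> ["one two three", "four five six"]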

def get_keywords(text: str, num_keywords: int = NUM_KEYWORDS) -> list[str]:
    """
    Extracts the top keywords from a given text using the TF-IDF method.

    Args:
        text (str): The input text from which to extract keywords.
        num_keywords (int, optional): The number of top keywords to return.

    Returns:
        list: A list of strings representing the top keywords extracted
            from the text.
    """
    # Create a TF-IDF Vectorizer
    vectorizer = TfidfVectorizer(stop_words=STOPWORDS_LANG)

    # Fit and transform the text
    tfidf_matrix = vectorizer.fit_transform([text])

    # Get feature names (words)
    feature_names = vectorizer.get_feature_names_out()

    # Get TF-IDF scores
    tfidf_scores = tfidf_matrix.toarray()[0]

    # Sort words by TF-IDF score
    word_scores = list(zip(feature_names, tfidf_scores))
    word_scores.sort(key=lambda x: x[1], reverse=True)

    # Return top keywords
    return [word for word, score in word_scores[:num_keywords]]
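
# Usage sketch for get_keywords (illustrative; with a single document the TF-IDF
# scores reduce to term-frequency ranking after stop-word removal, so the most
# repeated content word comes first):
#
#     get_keywords("solar energy and solar panels convert sunlight into electricity", 3)
#     # -> ["solar", ...]  (remaining order depends on tie-breaking among equal scores)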

def generate_search_phrases(input_text: str) -> list[str]:
    """
    Generates different types of phrases for search purposes.

    Args:
        input_text: The input text.

    Returns:
        A list of search phrases containing:
            - A phrase built from the most frequent words.
            - The original input text.
            - The text split into chunks.
            - The text with entities removed.
    """
    if not isinstance(input_text, str):
        return []

    search_phrases = []

    # Method 1: Get most frequent words
    search_phrases.append(get_most_frequent_words(input_text))

    # Method 2: Get the whole text
    search_phrases.append(input_text)

    # Method 3: Split text by chunks
    search_phrases.extend(get_chunk(input_text))  # TODO: for demo purposes

    # Method 4: Remove entities from the text
    entities = extract_entities(input_text)
    text_without_entities = remove_identities_from_text(input_text, entities)
    search_phrases.append(text_without_entities)

    # keywords = get_keywords(input_text, 16)
    # search_phrase = " ".join(entities) + " " + " ".join(keywords)
    # search_phrases.append(search_phrase)  # TODO: for demo purposes

    return search_phrases
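
# Usage sketch for generate_search_phrases (illustrative; "article_text" is a
# placeholder for any input string, and the returned value is a flat list of strings):
#
#     phrases = generate_search_phrases(article_text)
#     # phrases[0]  -> phrase built from the most frequent words
#     # phrases[1]  -> the original text
#     # phrases[2:] -> text chunks, followed by the text with entities removed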

def remove_identities_from_text(input_text: str, entities: list[str]) -> str:
    """
    Removes entities from the input text.

    Args:
        input_text: The input text as a string.
        entities: A list of entities to be removed.

    Returns:
        The input text with every entity string removed.
    """
    for entity in entities:
        input_text = input_text.replace(entity, "")
    return input_text
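
# Example for remove_identities_from_text (illustrative; removal is plain string
# replacement, so the surrounding whitespace is left in place):
#
#     remove_identities_from_text("Alice met Bob in Paris.", ["Alice", "Bob", "Paris"])
#     # -> " met  in ."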