# Retrieval backends: embedding-based semantic search (with optional
# cross-encoder re-ranking) over Bible verses, plus a TF-IDF baseline.
import abc
from typing import List

import numpy as np
import pandas as pd
import sklearn
# Bare `import sklearn` does not import submodules; semantic_search uses
# sklearn.metrics.pairwise.cosine_similarity, so pull the subpackage in
# explicitly rather than relying on another import doing it as a side effect.
import sklearn.metrics
import streamlit as st
from sentence_transformers.cross_encoder import CrossEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sparse_dot_topn import awesome_cossim_topn

from src.models import Chapter
class Retriever:
    """Base interface for retrievers that map a text query to chapters."""

    def retrieve(self, query, n=10) -> List[Chapter]:
        """Return up to (roughly) *n* relevant ``Chapter`` objects for *query*.

        Subclasses must override this; the base implementation previously
        returned ``None`` silently, contradicting the annotated return type.
        """
        raise NotImplementedError
class SemanticRetriever(Retriever):
    """Retriever that ranks Bible chapters by semantic similarity to a query.

    Candidate verses are found with bi-encoder embeddings (cosine
    similarity), optionally re-ranked with a cross-encoder, then grouped
    into the chapters that contain them.

    Note: now subclasses ``Retriever`` so it shares the interface that
    ``TfIdfRetriever`` already declares.
    """

    def __init__(
        self,
        bible_df,
        embeddings_manager,
        threshold=0.4,
        cross_encoder_model="cross-encoder/ms-marco-MiniLM-L-12-v2",
    ):
        """
        Args:
            bible_df: one row per verse; must expose "text", "book" and
                "chapter" columns (see extract_chapters_from_verses).
            embeddings_manager: object providing get_embeddings(list[str]).
            threshold: minimum cosine similarity for a verse candidate.
            cross_encoder_model: model name for re-ranking; pass a falsy
                value to disable the cross-encoder entirely.
                Other model tried: 'cross-encoder/stsb-distilroberta-base'
        """
        self.bible_df = bible_df
        self.embeddings_manager = embeddings_manager
        self.threshold = threshold
        self.cross_encoder_model = (
            CrossEncoder(cross_encoder_model) if cross_encoder_model else None
        )

    def retrieve(self, query, n=10) -> List[Chapter]:
        """Return chapters containing the verses most similar to *query*."""
        # Over-fetch verse candidates (2n) so the re-ranker has headroom.
        verse_candidates_df = self.semantic_search(
            query=query,
            texts=self.bible_df["text"].tolist(),
            embeddings_manager=self.embeddings_manager,
            n=n * 2,
            threshold=self.threshold,
        )
        if len(verse_candidates_df) == 0:
            return []
        if self.cross_encoder_model is not None:
            verse_candidates_df = self.cross_encode(
                query, verse_candidates_df["text"].tolist()
            )
        # TODO: revisit this logic as some verses can have the same exact text
        # For now, workaround is to drop duplicates
        verse_candidates_df = verse_candidates_df.drop_duplicates(subset="text")
        # Join back verse metadata (book/chapter) lost during scoring.
        verse_candidates_df = pd.merge(
            verse_candidates_df, self.bible_df, how="left", on="text"
        )
        # DEBUG
        # st.write(verse_candidates_df)
        chapter_candidates = self.extract_chapters_from_verses(
            self.bible_df, verse_candidates_df
        )
        return chapter_candidates

    def cross_encode(self, query, texts):
        """Re-rank *texts* against *query* with the cross-encoder.

        Returns a DataFrame with "text" and "score" columns sorted by
        descending score; scores are min-max scaled onto [0, 1].
        """
        combinations = [[query, text] for text in texts]
        sim_scores = self.cross_encoder_model.predict(combinations)
        sim_scores = MinMaxScaler().fit_transform(sim_scores.reshape(-1, 1)).flatten()
        reranked_texts_scores = sorted(
            zip(texts, sim_scores), key=lambda x: x[1], reverse=True
        )
        df = pd.DataFrame(reranked_texts_scores, columns=["text", "score"])
        return df

    def semantic_search(self, query, texts, embeddings_manager, n=None, threshold=0):
        """Score *texts* against *query* by cosine similarity of embeddings.

        Returns a DataFrame with "text" and "score" columns, sorted by
        descending score, truncated to *n* rows (if given) and filtered to
        scores >= *threshold* (if non-zero; 0 means "disabled").
        """
        embeddings = embeddings_manager.get_embeddings(texts)
        query_embedding = embeddings_manager.get_embeddings([query])
        sim_scores = sklearn.metrics.pairwise.cosine_similarity(
            query_embedding, embeddings
        )[0]
        # Results is a list of tuples: [(text, score)]
        results = sorted(list(zip(texts, sim_scores)), key=lambda x: x[1], reverse=True)
        # Take top n only if specified
        if n:
            results = results[:n]
        # Apply a threshold to filter irrelevant results
        if threshold:
            results = [x for x in results if x[1] >= threshold]
        df = pd.DataFrame(results, columns=["text", "score"])
        return df

    def extract_chapters_from_verses(self, bible_df, verse_results_df) -> List[Chapter]:
        """Group top verse hits into unique ``Chapter`` objects.

        Simple, naive assumption now is to just follow order of first
        appearance, i.e. the per-verse scores dictate the chapter order.
        TODO: Revisit ranking.
        """
        verse_results_df = verse_results_df.copy()
        verse_results_df["book_chapter"] = (
            verse_results_df["book"] + " " + verse_results_df["chapter"].astype(str)
        )
        unique_chapters = verse_results_df["book_chapter"].unique()
        bible_df = bible_df.copy()
        bible_df["book_chapter"] = (
            bible_df["book"] + " " + bible_df["chapter"].astype(str)
        )
        chapters = []
        for unique_chapter in unique_chapters:
            chapter_verses_df = bible_df[bible_df["book_chapter"] == unique_chapter]
            book = chapter_verses_df["book"].tolist()[0]
            chapter_num = chapter_verses_df["chapter"].tolist()[0]
            # Keep track of the matched verses as highlight verses
            highlight_verses_df = pd.merge(
                chapter_verses_df,
                verse_results_df[["text", "score", "book", "chapter"]],
                how="inner",
                on=["text", "book", "chapter"],
            )
            # Previously the loop re-bound the `chapter` number variable to
            # the Chapter object; use a distinct name to avoid shadowing.
            chapters.append(
                Chapter(
                    book_name=book,
                    chapter_num=chapter_num,
                    verses_df=chapter_verses_df,
                    highlight_verses_df=highlight_verses_df,
                )
            )
        return chapters
class TfIdfRetriever(Retriever):
    """Sparse lexical retriever over TF-IDF vectors."""

    def __init__(self, texts, preprocessors=None) -> None:
        """Fit a TF-IDF vectorizer on *texts*.

        Args:
            texts: corpus of documents to index.
            preprocessors: optional list of text-preprocessing callables
                (currently unused; see TODO below). Defaults to an empty
                list — previously a mutable default argument, which is
                shared across all instances.
        """
        self.vectorizer = TfidfVectorizer(analyzer="word", stop_words="english")
        self.preprocessors = [] if preprocessors is None else preprocessors
        # TODO: pre-process the texts
        self.tfidf_vectors = self.vectorizer.fit_transform(texts)
        # Pre-transpose once so each query is a single sparse matrix product.
        self.tfidf_vectors_transposed = self.tfidf_vectors.transpose()

    def search(self, query, n=10):
        """Return a sparse matrix of the top-*n* matches for *query*.

        Scores below the 0.01 lower bound are dropped by awesome_cossim_topn.
        """
        query_tfidf_vector = self.vectorizer.transform([query])
        results = awesome_cossim_topn(
            query_tfidf_vector, self.tfidf_vectors_transposed, n, 0.01
        )
        return results