import os
import pickle

import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# The NLTK word tokenizer requires the 'punkt' data package.
nltk.download('punkt', quiet=True)


class QuestionSimilarityModel:
    def __init__(self, dataset_path, cache_path='embeddings_cache.pkl'):
        self.dataset_path = dataset_path
        self.cache_path = cache_path
        self.dataset = pd.read_csv(dataset_path)
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.embeddings = self._load_or_generate_embeddings()

    def _generate_embeddings(self, questions):
        # Embed each dataset row as "<title> Difficulty: <difficulty>".
        combined_text = questions.apply(
            lambda x: f"{x['title']} Difficulty: {x['difficulty']}", axis=1
        )
        # Return NumPy arrays so the embeddings pickle cleanly and can be fed
        # to scikit-learn's cosine_similarity directly.
        return self.model.encode(combined_text.tolist(), convert_to_numpy=True)

    def _load_or_generate_embeddings(self):
        # Reuse cached embeddings when available; otherwise encode the
        # dataset once and persist the result.
        if os.path.exists(self.cache_path):
            print("Loading cached embeddings...")
            with open(self.cache_path, 'rb') as f:
                return pickle.load(f)
        print("Generating new embeddings...")
        embeddings = self._generate_embeddings(self.dataset)
        with open(self.cache_path, 'wb') as f:
            pickle.dump(embeddings, f)
        return embeddings

    def _preprocess(self, text):
        # Lowercase and tokenize the incoming question before encoding.
        tokens = word_tokenize(text.lower())
        return ' '.join(tokens)

    def check_similarity(self, new_questions):
        results = []
        for question in new_questions:
            preprocessed = self._preprocess(question)
            new_embedding = self.model.encode(preprocessed, convert_to_numpy=True)
            # Compare the new question against every cached dataset embedding.
            similarities = cosine_similarity([new_embedding], self.embeddings)[0]
            max_score = np.max(similarities)
            max_index = int(np.argmax(similarities))
            # Indices whose similarity clears the threshold count as strong matches.
            matched_indices = np.where(similarities >= 0.7)[0]
            matched_sources = self.dataset.iloc[matched_indices][['title', 'difficulty']].to_dict('records')
            best_match = self.dataset.iloc[max_index]
            results.append({
                'input_question': question,
                'relevance_score': float(max_score),
                'matched_sources': matched_sources,
                'best_match': {
                    'index': max_index,
                    'title': best_match['title'],
                    'difficulty': best_match['difficulty'],
                },
            })
        return results
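

# Usage sketch, not part of the original module: it assumes a CSV named
# 'questions.csv' with 'title' and 'difficulty' columns; the file name and
# the example questions below are hypothetical placeholders.
if __name__ == "__main__":
    checker = QuestionSimilarityModel('questions.csv')
    new_questions = [
        "Find the longest palindromic substring in a string",
        "Implement a queue using two stacks",
    ]
    for result in checker.check_similarity(new_questions):
        print(result['input_question'])
        print(f"  relevance_score: {result['relevance_score']:.3f}")
        print(f"  best_match: {result['best_match']['title']} "
              f"({result['best_match']['difficulty']})")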