import os
import pickle

import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# The NLTK word tokenizer requires the 'punkt' data package.
nltk.download('punkt', quiet=True)


class QuestionSimilarityModel:
    def __init__(self, dataset_path, cache_path='embeddings_cache.pkl'):
        self.dataset_path = dataset_path
        self.cache_path = cache_path
        self.dataset = pd.read_csv(dataset_path)
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.embeddings = self._load_or_generate_embeddings()

    def _generate_embeddings(self, questions):
        # Embed each dataset row as "<title> Difficulty: <difficulty>".
        combined_text = questions.apply(
            lambda x: f"{x['title']} Difficulty: {x['difficulty']}", axis=1
        )
        # Return NumPy arrays so the embeddings pickle cleanly and can be fed
        # to scikit-learn's cosine_similarity directly.
        return self.model.encode(combined_text.tolist(), convert_to_numpy=True)

    def _load_or_generate_embeddings(self):
        # Reuse cached embeddings when available; otherwise encode the
        # dataset once and persist the result.
        if os.path.exists(self.cache_path):
            print("Loading cached embeddings...")
            with open(self.cache_path, 'rb') as f:
                return pickle.load(f)
        print("Generating new embeddings...")
        embeddings = self._generate_embeddings(self.dataset)
        with open(self.cache_path, 'wb') as f:
            pickle.dump(embeddings, f)
        return embeddings

    def _preprocess(self, text):
        # Lowercase and tokenize the incoming question before encoding.
        tokens = word_tokenize(text.lower())
        return ' '.join(tokens)

    def check_similarity(self, new_questions):
        results = []
        for question in new_questions:
            preprocessed = self._preprocess(question)
            new_embedding = self.model.encode(preprocessed, convert_to_numpy=True)
            # Compare the new question against every cached dataset embedding.
            similarities = cosine_similarity([new_embedding], self.embeddings)[0]
            max_score = np.max(similarities)
            max_index = int(np.argmax(similarities))
            # Indices whose similarity clears the threshold count as strong matches.
            matched_indices = np.where(similarities >= 0.7)[0]
            matched_sources = self.dataset.iloc[matched_indices][['title', 'difficulty']].to_dict('records')
            best_match = self.dataset.iloc[max_index]
            results.append({
                'input_question': question,
                'relevance_score': float(max_score),
                'matched_sources': matched_sources,
                'best_match': {
                    'index': max_index,
                    'title': best_match['title'],
                    'difficulty': best_match['difficulty'],
                },
            })
        return results
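

# Usage sketch, not part of the original module: it assumes a CSV named
# 'questions.csv' with 'title' and 'difficulty' columns; the file name and
# the example questions below are hypothetical placeholders.
if __name__ == "__main__":
    checker = QuestionSimilarityModel('questions.csv')
    new_questions = [
        "Find the longest palindromic substring in a string",
        "Implement a queue using two stacks",
    ]
    for result in checker.check_similarity(new_questions):
        print(result['input_question'])
        print(f"  relevance_score: {result['relevance_score']:.3f}")
        print(f"  best_match: {result['best_match']['title']} "
              f"({result['best_match']['difficulty']})")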