from collections import defaultdict

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
def generate_embeddings(df, content_column):
    """
    Generate embeddings for the content using SentenceTransformer.
    """
    print("🔢 Generating embeddings for clustering...")
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(df[content_column].tolist(), show_progress_bar=True)
    return embeddings
def determine_optimum_clusters(embeddings, min_clusters=2, max_clusters=10):
    """
    Determine the optimum number of clusters using silhouette analysis.
    """
    print("🔍 Determining the optimum number of clusters using silhouette analysis...")
    n_samples = len(embeddings)
    if n_samples < 2:
        raise ValueError("Not enough samples to perform clustering. At least 2 samples are required.")
    # Adjust max_clusters so it does not exceed n_samples - 1
    max_clusters = min(max_clusters, n_samples - 1)
    best_num_clusters = min_clusters
    best_score = -1
    for n_clusters in range(min_clusters, max_clusters + 1):
        try:
            kmeans = KMeans(n_clusters=n_clusters, random_state=42)
            cluster_labels = kmeans.fit_predict(embeddings)
            score = silhouette_score(embeddings, cluster_labels)
            print(f"Number of clusters: {n_clusters}, Silhouette Score: {score:.4f}")
            if score > best_score:
                best_score = score
                best_num_clusters = n_clusters
        except ValueError as e:
            print(f"Skipping {n_clusters} clusters due to error: {e}")
    print(f"✅ Optimum number of clusters determined: {best_num_clusters}")
    return best_num_clusters
def cluster_embeddings(embeddings, num_clusters):
    """
    Perform KMeans clustering on the embeddings.
    """
    print(f"🔍 Clustering articles into {num_clusters} clusters using KMeans...")
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(embeddings)
    return kmeans.labels_, kmeans
def extract_tfidf_labels(df, content_column, cluster_labels):
    """
    Extract top TF-IDF keywords for each cluster.
    """
    print("🔍 Extracting TF-IDF-based keywords for cluster labels...")
    grouped = defaultdict(list)
    for idx, label in enumerate(cluster_labels):
        grouped[label].append(df.iloc[idx][content_column])
    tfidf_labels = {}
    for cluster_id, texts in grouped.items():
        vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english", max_features=50)
        tfidf_matrix = vectorizer.fit_transform(texts)
        avg_tfidf = tfidf_matrix.mean(axis=0).A1
        top_indices = np.argsort(avg_tfidf)[::-1][:3]
        top_terms = [vectorizer.get_feature_names_out()[i] for i in top_indices]
        tfidf_labels[cluster_id] = ", ".join(top_terms)
    return tfidf_labels
def apply_topic_modeling(df, content_column, cluster_labels, num_topics=2):
    """
    Apply topic modeling (LDA) within each cluster to refine and describe topics.
    """
    print("🔍 Applying topic modeling within each cluster...")
    grouped = defaultdict(list)
    for idx, label in enumerate(cluster_labels):
        grouped[label].append(df.iloc[idx][content_column])
    topic_labels = {}
    for cluster_id, texts in grouped.items():
        vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english", max_features=5000)
        tfidf_matrix = vectorizer.fit_transform(texts)
        lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
        lda.fit(tfidf_matrix)
        # Extract the top three words for each topic
        feature_names = vectorizer.get_feature_names_out()
        topics = []
        for topic in lda.components_:
            top_indices = topic.argsort()[:-4:-1]
            topics.append(", ".join([feature_names[i] for i in top_indices]))
        topic_labels[cluster_id] = " | ".join(topics)
    return topic_labels
def filter_similar_topics(topic_keywords_list, threshold=0.75):
    """
    Filter out similar topics based on cosine similarity of their embeddings.

    topic_keywords_list is expected to be a list of keyword lists; each list
    is joined into a single string before embedding.
    """
    print("🔍 Filtering similar topics...")
    model = SentenceTransformer('all-MiniLM-L6-v2')
    topic_sentences = [", ".join(kw) for kw in topic_keywords_list]
    embeddings = model.encode(topic_sentences)
    unique_indices = []
    for i, emb in enumerate(embeddings):
        # Keep a topic only if it is dissimilar to every topic kept so far
        if all(cosine_similarity([emb], [embeddings[j]])[0][0] < threshold for j in unique_indices):
            unique_indices.append(i)
    return [topic_keywords_list[i] for i in unique_indices]
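# Hypothetical example: given keyword lists such as
#   [["interest rates", "inflation"], ["rate hike", "inflation"], ["football", "season"]]
# filter_similar_topics would typically drop the second list at threshold=0.75,
# since its embedding sits close to the first; the exact outcome depends on the model.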
def get_representative_summaries(df, summary_column, embeddings, cluster_labels, kmeans):
    """
    Get the most representative summary for each cluster based on proximity to the cluster centroid.
    """
    print("🔍 Refining cluster labels using representative summaries...")
    representatives = {}
    for i in range(kmeans.n_clusters):
        indices = [j for j, label in enumerate(cluster_labels) if label == i]
        if not indices:
            continue
        cluster_embeddings = embeddings[indices]
        centroid = kmeans.cluster_centers_[i]
        distances = np.linalg.norm(cluster_embeddings - centroid, axis=1)
        closest_idx = indices[np.argmin(distances)]
        representatives[i] = df.iloc[closest_idx][summary_column]
    return representatives
def cluster_and_label_articles(df, content_column="content", summary_column="summary", min_clusters=2, max_clusters=10, max_topics=3):
    """
    Cluster articles using SentenceTransformer embeddings and label clusters using TF-IDF and topic modeling.
    Returns the clustered DataFrame plus the detected topics per cluster, split into a primary focus and related topics.
    """
    if df.empty:
        print("No articles to cluster.")
        return None

    # Step 1: Generate embeddings
    embeddings = generate_embeddings(df, content_column)

    # Step 2: Determine the optimum number of clusters
    num_clusters = determine_optimum_clusters(embeddings, min_clusters, max_clusters)

    # Step 3: Perform clustering
    cluster_labels, kmeans = cluster_embeddings(embeddings, num_clusters)
    df['cluster_label'] = cluster_labels

    # Step 4: Extract the TF-IDF matrix
    print("🔍 Extracting TF-IDF matrix for clusters...")
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english", max_features=5000)
    tfidf_matrix = vectorizer.fit_transform(df[content_column].tolist())
    feature_names = vectorizer.get_feature_names_out()

    # Step 5: Process each cluster
    print("🔍 Processing clusters for TF-IDF and topic modeling...")
    grouped = defaultdict(list)
    for idx, label in enumerate(cluster_labels):
        grouped[label].append(idx)

    refined_labels = [""] * num_clusters  # Initialize refined_labels with empty strings
    detected_topics = {}
    for cluster_id, indices in grouped.items():
        cluster_texts = tfidf_matrix[indices]

        # Extract TF-IDF keywords
        avg_tfidf = cluster_texts.mean(axis=0).A1
        top_indices = np.argsort(avg_tfidf)[::-1][:3]
        tfidf_keywords = [feature_names[i] for i in top_indices]

        # Generate a cluster label from the top TF-IDF keywords
        cluster_label_tfidf = ", ".join(tfidf_keywords)

        # Apply topic modeling
        lda = LatentDirichletAllocation(n_components=min(max_topics, len(indices)), random_state=42)
        lda.fit(cluster_texts)

        topics = []
        topic_weights = []
        for topic in lda.components_:
            top_topic_indices = topic.argsort()[:-4:-1]
            topics.append(", ".join([feature_names[i] for i in top_topic_indices]))
            topic_weights.append(topic.sum())  # Sum of weights for ranking

        # Rank topics by importance
        ranked_topics = [x for _, x in sorted(zip(topic_weights, topics), reverse=True)]

        # Derive the primary focus and related topics
        primary_focus = ranked_topics[0] if ranked_topics else "N/A"
        related_topics = ranked_topics[1:] if len(ranked_topics) > 1 else []

        # Store detected topics for user display
        detected_topics[cluster_label_tfidf] = {
            "primary_focus": primary_focus,
            "related_topics": related_topics,
        }

        # Assign the TF-IDF keywords as the cluster label
        refined_labels[cluster_id] = cluster_label_tfidf

    # Assign refined labels to clusters
    df['cluster_label'] = [refined_labels[label] for label in cluster_labels]

    print("✅ Clustering and labeling complete!")
    return {
        "dataframe": df,
        "detected_topics": detected_topics,
        "number_of_clusters": num_clusters,
    }
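# --- Usage sketch ---------------------------------------------------------
# A minimal, hypothetical example of driving the pipeline end to end; it is
# not part of the original module. The sample articles, column names, and
# parameter values below are illustrative assumptions only.
if __name__ == "__main__":
    sample_df = pd.DataFrame({
        "content": [
            "The central bank raised interest rates again to curb inflation.",
            "Persistent inflation prompted another interest rate hike this quarter.",
            "A new transformer model tops the language understanding benchmark.",
            "Researchers released an open-source large language model this week.",
            "The football season opened with record-breaking stadium attendance.",
            "The championship final drew millions of football viewers worldwide.",
        ],
        "summary": [
            "Rate hike to fight inflation.",
            "Another quarterly rate increase.",
            "New model leads NLP benchmark.",
            "Open-source LLM released.",
            "Season opener sets attendance record.",
            "Final attracts a global audience.",
        ],
    })
    result = cluster_and_label_articles(sample_df, min_clusters=2, max_clusters=3)
    if result:
        print(result["dataframe"][["content", "cluster_label"]])
        for label, topics in result["detected_topics"].items():
            print(f"{label}: primary={topics['primary_focus']}, related={topics['related_topics']}")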