Spaces:
Runtime error
Runtime error
| # extractive.py | |
| import nltk | |
| from nltk.corpus import stopwords | |
| from nltk.tokenize import sent_tokenize | |
| import networkx as nx | |
| import numpy as np | |
| import torch | |
# Fetch the NLTK data this module needs at import time: the English stop-word
# list (read by generate_summary) and the Punkt sentence-tokenizer data (read
# by sent_tokenize in preprocess_text).
# NLTK 3.9+ loads the tokenizer from the 'punkt_tab' package instead of the
# pickled 'punkt' one, so both are requested; without 'punkt_tab',
# sent_tokenize raises LookupError on current NLTK releases.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
def preprocess_text(text):
    """Split *text* into sentences.

    Delegates to NLTK's ``sent_tokenize`` and returns its result unchanged.
    """
    return sent_tokenize(text)
def get_sentence_embeddings(sentences, model, tokenizer):
    """Embed each sentence as the mean of the model's last hidden states.

    Args:
        sentences: Iterable of sentence strings; each one is encoded on its own.
        model: Callable invoked as ``model(**inputs)`` whose result exposes a
            ``last_hidden_state`` tensor. Presumably a Hugging Face-style
            encoder returning (1, tokens, hidden) -- confirm against caller.
        tokenizer: Callable invoked with the sentence plus
            ``return_tensors="pt"``, ``padding=True``, ``truncation=True`` and
            ``max_length=512``; must return a mapping usable as keyword
            arguments for ``model``.

    Returns:
        ``np.ndarray`` holding one pooled vector per sentence, in input order.
        An empty input yields an empty array.
    """
    embeddings = []
    # Inference only: no autograd bookkeeping is needed for the forward passes.
    with torch.no_grad():
        for sentence in sentences:
            inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)
            outputs = model(**inputs)
            # Average over dim 1 (the token axis for a (1, tokens, hidden) output).
            pooled = torch.mean(outputs.last_hidden_state, dim=1)
            # squeeze(0) drops only the leading batch axis, so a hidden size of
            # 1 is not collapsed to a scalar; .cpu() makes the conversion work
            # when the model lives on an accelerator (Tensor.numpy() only
            # accepts CPU tensors).
            embeddings.append(pooled.squeeze(0).cpu().numpy())
    return np.array(embeddings)
def build_semantic_graph(embeddings, similarity_threshold=0.75):
    """Build an undirected sentence graph weighted by cosine similarity.

    Args:
        embeddings: Sized, indexable sequence of 1-D vectors, one per sentence.
        similarity_threshold: Minimum cosine similarity for two sentences to be
            linked by an edge.

    Returns:
        ``nx.Graph`` whose nodes are the indices ``0 .. len(embeddings) - 1``.
        Each edge carries the pair's cosine similarity in its ``weight``
        attribute.
    """
    graph = nx.Graph()
    # Register every index up front: a sentence with no sufficiently similar
    # neighbour must still exist as an isolated node, otherwise it silently
    # disappears from any ranking computed on this graph.
    graph.add_nodes_from(range(len(embeddings)))

    # Each norm is needed for many pairs; compute it once.
    norms = [np.linalg.norm(emb) for emb in embeddings]

    for i, emb1 in enumerate(embeddings):
        if norms[i] == 0:
            # Cosine similarity is undefined for a zero vector; skip rather
            # than divide by zero.
            continue
        # Similarity is symmetric and the graph undirected, so visiting each
        # unordered pair once (j > i) yields the same edges as the full grid.
        for j in range(i + 1, len(embeddings)):
            if norms[j] == 0:
                continue
            similarity = np.dot(emb1, embeddings[j]) / (norms[i] * norms[j])
            if similarity >= similarity_threshold:
                graph.add_edge(i, j, weight=similarity)
    return graph
def apply_textrank(graph, sentences, damping_factor=0.85, max_iter=100):
    """Rank sentences by running PageRank over the sentence graph.

    Args:
        graph: Graph whose nodes are sentence indices.
        sentences: The sentence list; only its length is used, to build a
            uniform personalization vector over indices ``0 .. len - 1``.
        damping_factor: PageRank damping parameter, forwarded as ``alpha``.
        max_iter: Maximum number of power iterations.

    Returns:
        List of ``(score, index)`` tuples sorted in descending order. Tuples
        are compared whole, so equal scores are ordered by higher index first.
    """
    num_nodes = len(sentences)
    personalization = {i: 1 / num_nodes for i in range(num_nodes)}
    # damping_factor was previously accepted but never handed to pagerank, so
    # callers could not actually change it; the default matches networkx's own
    # alpha default, so default behaviour is unchanged.
    scores = nx.pagerank(
        graph,
        alpha=damping_factor,
        personalization=personalization,
        max_iter=max_iter,
    )
    ranked_sentences = sorted(((score, idx) for idx, score in scores.items()), reverse=True)
    return ranked_sentences
def generate_summary(ranked_sentences, sentences, max_length_ratio=0.5):
    """Assemble a summary from the top-ranked sentences within a word budget.

    Args:
        ranked_sentences: Iterable of ``(score, index)`` pairs, best first.
        sentences: Sentence strings that the indices refer to.
        max_length_ratio: Fraction of the total whitespace-separated word count
            of ``sentences`` that the summary may use.

    Returns:
        The selected sentences, in ranking order, joined by single spaces.
        English stop words are removed from each selected sentence, but the
        budget is charged with the sentence's full word count.
    """
    stop_words = set(stopwords.words('english'))
    word_budget = int(sum(len(s.split()) for s in sentences) * max_length_ratio)

    picked = []
    used = 0
    for _score, idx in ranked_sentences:
        tokens = sentences[idx].split()
        # Stop at the first sentence that would overflow the budget; lower
        # ranked sentences are not tried even if they would fit.
        if used + len(tokens) > word_budget:
            break
        picked.append(" ".join(tok for tok in tokens if tok.lower() not in stop_words))
        used += len(tokens)
    return " ".join(picked)