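"""Streamlit app: extract keyphrases from a document with KeyBERT, paraphrase the
keyword-bearing sentences with a T5 model, and compare the keyphrases found in
the paraphrases against the originals."""
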
import time
import pandas as pd

import torch
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from keyphrase_vectorizers import KeyphraseCountVectorizer
from transformers import T5ForConditionalGeneration, T5Tokenizer

import nltk
from nltk.tokenize import sent_tokenize
nltk.download('stopwords')
nltk.download('punkt')

from huggingface_hub import HfFolder
import streamlit as st

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

HfFolder.save_token(st.secrets["hf-auth-token"])
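# The token is read from Streamlit secrets (e.g. .streamlit/secrets.toml, key
# "hf-auth-token"); the valurank checkpoints below are assumed to require it,
# hence use_auth_token=True.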

# Load KeyBERT model
tmp_model = SentenceTransformer('valurank/MiniLM-L6-Keyword-Extraction', use_auth_token=True)
kw_extractor = KeyBERT(tmp_model)

# Load T5 for Paraphrasing
t5_model = T5ForConditionalGeneration.from_pretrained('valurank/t5-paraphraser', use_auth_token=True)
t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')
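# The tokenizer is loaded from the base checkpoint; the fine-tuned repo is
# assumed to reuse t5-base's vocabulary.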
t5_model = t5_model.to(device)

def get_keybert_results_with_vectorizer(text, number_of_results=20):
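    # Returns a list of (keyphrase, score) tuples, highest-scoring first.
    # KeyphraseCountVectorizer proposes part-of-speech-based noun-phrase
    # candidates instead of fixed-length n-grams.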
    keywords = kw_extractor.extract_keywords(text, vectorizer=KeyphraseCountVectorizer(), stop_words=None, top_n=number_of_results)
    return keywords

def t5_paraphraser(text, number_of_results=5):
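    # Returns `number_of_results` sampled paraphrases of `text` as plain strings.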
    text = "paraphrase: " + text + " </s>"
    max_len = 2048
    encoding = t5_tokenizer.encode_plus(text, max_length=max_len, truncation=True, return_tensors="pt")
    input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

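    # Top-k/top-p (nucleus) sampling: with do_sample=True each returned sequence
    # is drawn independently, so the paraphrases vary between runs.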
    beam_outputs = t5_model.generate(
        input_ids=input_ids, attention_mask=attention_masks,
        do_sample=True,
        max_length=2048,
        top_k=50,
        top_p=0.95,
        early_stopping=True,
        num_return_sequences=number_of_results
    )
    
    final_outputs = []
    for beam_output in beam_outputs:
        sent = t5_tokenizer.decode(beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        final_outputs.append(sent)
    
    return final_outputs


#### Extract Sentences with Keywords -> Paraphrase multiple versions -> Extract Keywords again

def extract_paraphrased_sentences(article):
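    # Pipeline: extract keywords from the article, keep the sentences that
    # contain them, paraphrase those sentences with T5, then extract keywords again.
    # Returns (t5_keywords_df, original_keywords_df, unique_keywords_df, elapsed_seconds).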
     
    start1 = time.time()
    with st.spinner('Extracting Keywords from Original Document...'):
        original_keywords = [(i[0], i[1]) for i in get_keybert_results_with_vectorizer(article)]
    
        article_sentences = sent_tokenize(article)
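        # Keep only the sentences that contain at least one extracted keyphrase.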
        target_sentences = [sent for sent in article_sentences if any(kw[0] in sent for kw in original_keywords)]        
    st.success('Keyword Extraction from Original Document finished in {:.2f}s'.format(time.time() - start1))

    
    start2 = time.time()
    with st.spinner('Extracting Keywords from Paraphrased Target Sentences...'):
        t5_paraphrasing_keywords = []
    
        for sent in target_sentences:
            ### T5
            t5_paraphrased = t5_paraphraser(sent)
            t5_keywords = [get_keybert_results_with_vectorizer(i) for i in t5_paraphrased]
            t5_keywords = [(word[0], word[1]) for s in t5_keywords for word in s]
    
            t5_paraphrasing_keywords.extend(t5_keywords)
    st.success('Keyword Extraction from Paraphrased Target Sentences finished in {:.2f}s'.format(time.time() - start2))

    original_keywords_df = pd.DataFrame(original_keywords, columns=['Keyword', 'Score'])
    
    t5_keywords_df = (
        pd.DataFrame(t5_paraphrasing_keywords, columns=['Keyword', 'Score'])
        .sort_values(by='Score', ascending=False)
        .drop_duplicates(subset=['Keyword'], keep='first')
    )

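    # Keep paraphrase-derived keyphrases that never appear, even as substrings,
    # in the original keyword list.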
    unique_keywords = [kw for kw in t5_paraphrasing_keywords
                       if not original_keywords_df['Keyword'].str.contains(kw[0], regex=False).any()]
    unique_keywords_df = (
        pd.DataFrame(unique_keywords, columns=['Keyword', 'Score'])
        .sort_values(by='Score', ascending=False)
        .drop_duplicates(subset=['Keyword'], keep='first')
    )

    total_end = time.time() - start1

    return t5_keywords_df, original_keywords_df, unique_keywords_df, total_end
    

doc = st.text_area("Enter a custom document")

if doc:
    t5_keywords_df, original_keywords_df, unique_keywords_df, total_end = extract_paraphrased_sentences(doc)
    
    st.text(f'T5 PARAPHRASING RUNTIME: {total_end:.2f}s\n')
    
    st.subheader('Original Keywords Extracted:')
    st.dataframe(original_keywords_df)

    st.subheader('T5 Keywords Extracted:')
    st.dataframe(t5_keywords_df)

    st.subheader('T5 Unique New Keywords Extracted:')
    st.dataframe(unique_keywords_df)