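"""Streamlit demo comparing three sentiment classifiers on IMDB movie reviews:
a DistilBERT model fine-tuned on SST-2, a bidirectional LSTM over Word2Vec
embeddings, and a pickled TF-IDF baseline loaded from models/modeltfidf.sav."""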
import pickle
import time
import warnings

import numpy as np
import pandas as pd
import streamlit as st
import torch
import torch.nn as nn
import transformers
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from string import punctuation
from transformers import AutoModelForSequenceClassification

from data.rnn_preprocessing import (
    data_preprocessing,
    preprocess_single_string,
)

warnings.filterwarnings('ignore')
def main():
    device = 'cpu'

    # Load the IMDB reviews and binarise the labels (positive -> 1, negative -> 0).
    df = pd.read_csv('data/imdb.csv')
    df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
    reviews = df['review'].tolist()
    preprocessed = [data_preprocessing(review) for review in reviews]

    # Pre-trained Word2Vec embeddings (32-dimensional, per the model file name).
    wv = Word2Vec.load('models/word2vec32.model')
    # Build the vocabulary from the preprocessed reviews and map each unique
    # word to an integer index (index 0 is reserved for padding). Punctuation
    # handling is assumed to happen inside data_preprocessing.
    words_list = [word for review in preprocessed for word in review.lower().split()]
    unique_words = set(words_list)
    vocab_to_int = {word: idx + 1 for idx, word in enumerate(sorted(unique_words))}
    VOCAB_SIZE = len(vocab_to_int) + 1  # add 1 for the padding token
    EMBEDDING_DIM = 32
    HIDDEN_DIM = 64
    SEQ_LEN = 32
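    # Copy the Word2Vec vector for every vocabulary word into a lookup matrix
    # aligned with vocab_to_int; words missing from the Word2Vec vocabulary
    # keep the zero vector they were initialised with.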
    embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
    for word, i in vocab_to_int.items():
        try:
            embedding_matrix[i] = wv.wv[word]
        except KeyError:
            pass

    embedding_layer32 = torch.nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix))
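    # Bidirectional LSTM sentiment classifier: the frozen Word2Vec embedding
    # layer feeds a single bi-LSTM; the concatenated forward/backward hidden
    # states of the last time step (hence hidden_size * 2) go through a small
    # MLP head that emits one logit. Sigmoid is applied at prediction time.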
    class LSTMClassifierBi32(nn.Module):
        def __init__(self, embedding_dim: int, hidden_size: int = 32) -> None:
            super().__init__()
            self.embedding_dim = embedding_dim
            self.hidden_size = hidden_size
            self.embedding = embedding_layer32
            self.lstm = nn.LSTM(
                input_size=self.embedding_dim,
                hidden_size=self.hidden_size,
                batch_first=True,
                bidirectional=True,
            )
            self.clf = nn.Sequential(
                nn.Linear(self.hidden_size * 2, 128),
                nn.Dropout(),
                nn.Sigmoid(),
                nn.Linear(128, 64),
                nn.Dropout(),
                nn.Sigmoid(),
                nn.Linear(64, 1),
            )

        def forward(self, x):
            embeddings = self.embedding(x)
            out, (_, _) = self.lstm(embeddings)
            out = self.clf(out[:, -1, :])
            return out
    model = LSTMClassifierBi32(embedding_dim=EMBEDDING_DIM, hidden_size=HIDDEN_DIM)
    model.load_state_dict(torch.load('models/ltsm_bi1.pt', map_location=device))
    model.eval()
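    # preprocess_single_string (from data/rnn_preprocessing.py, not shown here)
    # is assumed to tokenise the text, map tokens through vocab_to_int, and
    # pad/truncate the result to seq_len before returning a tensor of indices.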
    def predict_sentence(text: str, model: nn.Module) -> str:
        tokens = preprocess_single_string(text, seq_len=SEQ_LEN, vocab_to_int=vocab_to_int)
        result = model.to(device)(tokens.unsqueeze(0)).sigmoid().round().item()
        return 'negative' if result == 0.0 else 'positive'
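    # Classic baseline: a TF-IDF vectoriser re-fitted on the corpus at startup,
    # feeding a pickled classifier (logistic regression, judging by the original
    # imports). A bag-of-words variant is kept below but disabled.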
    # bag_vectorizer = CountVectorizer(max_df=0.5, min_df=5, stop_words="english")
    # bvect = bag_vectorizer.fit(preprocessed)
    # X_bag = bvect.transform(preprocessed)
    tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=5)
    vect = tfidf_vectorizer.fit(preprocessed)
    tfidf_model = pickle.load(open('models/modeltfidf.sav', 'rb'))
    # bag_model = pickle.load(open('models/modelbag.sav', 'rb'))

    # def predictbag(text):
    #     result = bag_model.predict(vect.transform([text]))
    #     return 'negative' if result == [0] else 'positive'

    def predicttf(text):
        result = tfidf_model.predict(vect.transform([text]))
        return 'negative' if result == [0] else 'positive'
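    # Streamlit UI: run each model on the entered review and report timings.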
    review = st.text_input('Enter review')

    start1 = time.time()
    # DistilBERT fine-tuned on SST-2; from_pretrained loads the trained
    # classification head (from_config would give random, untrained weights).
    autotoken = transformers.AutoTokenizer.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')
    automodel = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')
    input_tokens = autotoken(
        review,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=512,
    )
    with torch.no_grad():
        outputs = automodel(**input_tokens)
    st.write('Sentiment Predictions')
    st.write(f'\nBERT: {[automodel.config.id2label[i.item()] for i in outputs.logits.argmax(-1)]}')
    end1 = time.time()
    st.write(f'{(end1 - start1):.2f} sec')
    start2 = time.time()
    st.write(f'LSTM: {predict_sentence(review, model)}')
    end2 = time.time()
    st.write(f'{(end2 - start2):.2f} sec')
    # start3 = time.time()
    # st.write(f'bag+log: {predictbag(review)}')
    # end3 = time.time()
    # st.write(f'{(end3 - start3):.2f} sec')
    start4 = time.time()
    st.write(f'tfidf+log: {predicttf(review)}')
    end4 = time.time()
    st.write(f'{(end4 - start4):.2f} sec')

if __name__ == '__main__':
    main()
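
# Launch locally (assuming this file is the Space's entry point, e.g. app.py):
#   streamlit run app.py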