In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [None]:
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch every NLTK resource the preprocessing step relies on in one place.
# "punkt_tab" is listed next to "punkt" because newer NLTK releases look up
# the tokenizer data under that name when nltk.word_tokenize is called.
for _resource in ("punkt", "punkt_tab", "stopwords", "wordnet"):
    nltk.download(_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Read the IMDB reviews CSV with the pure-Python parser; rows that cannot be
# parsed are skipped instead of raising.
df = pd.read_csv(
    "/content/IMDB Dataset.csv",
    engine="python",
    on_bad_lines="skip",
)

# Quick sanity check: table size and the first few rows.
print(df.shape)
print(df.head())



(50000, 2)
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [None]:
# Extra tokenizer data. Presumably required by nltk.word_tokenize on newer
# NLTK releases — TODO confirm against the installed NLTK version.
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:


stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Patterns used by preprocess_text, compiled once at definition time.
_HTML_TAG_RE = re.compile(r'<[^>]*>')      # any <...> markup, e.g. "<br />"
_NON_LETTER_RE = re.compile(r'[^a-z\s]')   # everything except a-z and whitespace


def preprocess_text(text):
    """Normalise one raw review into a space-separated string of lemmas.

    Steps: lowercase, strip HTML tags, drop punctuation and digits, tokenize,
    remove English stopwords, lemmatize.

    Args:
        text: The raw review. Anything that is not a ``str`` (for example a
            NaN from a missing cell) is treated as an empty review.

    Returns:
        The cleaned tokens joined by single spaces; ``""`` when nothing is left.
    """
    if not isinstance(text, str):
        return ""

    # 1. Lowercase
    text = text.lower()

    # 2. Remove HTML tags. They are replaced by a space, not an empty string,
    #    so that "production.<br /><br />the" does not collapse into the
    #    single bogus token "productionthe".
    text = _HTML_TAG_RE.sub(' ', text)

    # 3. Remove punctuation & numbers
    text = _NON_LETTER_RE.sub('', text)

    # 4. Tokenize
    tokens = nltk.word_tokenize(text)

    # 5 + 6. Drop stopwords, then lemmatize what remains.
    return " ".join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )



In [None]:
# Clean every review and store the result next to the raw text.
cleaned_reviews = df["review"].apply(preprocess_text)
df["cleaned_review"] = cleaned_reviews

# Spot-check the last rows: raw text, cleaned text and label side by side.
preview_columns = ["review", "cleaned_review", "sentiment"]
print(df[preview_columns].tail())

                                                  review  \
49995  I thought this movie did a down right good job...   
49996  Bad plot, bad dialogue, bad acting, idiotic di...   
49997  I am a Catholic taught in parochial elementary...   
49998  I'm going to have to disagree with the previou...   
49999  No one expects the Star Trek movies to be high...   

                                          cleaned_review sentiment  
49995  thought movie right good job wasnt creative or...  positive  
49996  bad plot bad dialogue bad acting idiotic direc...  negative  
49997  catholic taught parochial elementary school nu...  negative  
49998  im going disagree previous comment side maltin...  negative  
49999  one expects star trek movie high art fan expec...  negative  


In [None]:
# NOTE(review): no cell visible in this notebook imports transformers, torch or
# sentence-transformers — confirm this install is still needed.
!pip install transformers torch sentence-transformers




In [None]:
# Upgrade (-U) the PyTorch packages and the transformer libraries to the newest
# available releases. NOTE(review): versions are not pinned, so a later re-run
# may resolve to different versions.
!pip install -U torch torchvision torchaudio
!pip install -U transformers sentence-transformers




In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:

# Turn the cleaned reviews into TF-IDF features, capping the vocabulary at
# 5000 terms.
tfidf = TfidfVectorizer(max_features=5000)

# NOTE(review): the vectorizer is fitted on every row, i.e. before the
# train/test split done further down, so vocabulary and IDF weights also see
# the rows that later become the test set. Fitting on the training rows only
# would avoid that leakage.
X = tfidf.fit_transform(df["cleaned_review"])

# Binary target: positive -> 1, negative -> 0.
y = df["sentiment"].map({"positive": 1, "negative": 0}).values

# Summarise the sparse matrix instead of printing its stored entries.
print("TF-IDF matrix shape:", X.shape, "| stored values:", X.nnz)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 3894163 stored elements and shape (50000, 5000)>
  Coords	Values
  (0, 3086)	0.022306340769509683
  (0, 3687)	0.0716098594890523
  (0, 2778)	0.06997013472849029
  (0, 4832)	0.08220463762202762
  (0, 3137)	0.47260726598173247
  (0, 1467)	0.11279543497089747
  (0, 4990)	0.05792759900745007
  (0, 2119)	0.0884283468115394
  (0, 3707)	0.09072957775856476
  (0, 1513)	0.06118924586115793
  (0, 1993)	0.06126011378849219
  (0, 1697)	0.06827831217826244
  (0, 4468)	0.034484905907198556
  (0, 4258)	0.1771814513806625
  (0, 543)	0.09851073786514736
  (0, 3831)	0.03333634078441469
  (0, 4763)	0.25095990028835147
  (0, 3914)	0.04650048507467132
  (0, 4938)	0.05348789066286241
  (0, 1883)	0.07135314796729297
  (0, 4610)	0.07763035600809258
  (0, 3975)	0.15304406167950377
  (0, 3451)	0.06917492387135953
  (0, 3456)	0.08224792976382196
  (0, 3583)	0.08260003424873928
  :	:
  (49999, 4949)	0.09967306284736142
  (49999, 1608)	0.188954702264313

In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer


**MODEL**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix





In [None]:

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:

# Logistic-regression baseline, with the solver allowed up to 200 iterations.
clf = LogisticRegression(max_iter=200)

# Train on the TF-IDF features of the training split.
clf.fit(X=X_train, y=y_train)


In [None]:
# Predicted labels (0 = negative, 1 = positive) for the held-out rows.
y_pred = clf.predict(X=X_test)


In [None]:

# Overall accuracy on the held-out split.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

# Per-class precision, recall and F1.
report_text = classification_report(y_test, y_pred)
print(report_text)

# Counts of true labels versus predicted labels.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)


Accuracy: 0.8845
              precision    recall  f1-score   support

           0       0.89      0.87      0.88      4961
           1       0.88      0.90      0.89      5039

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000

Confusion Matrix:
 [[4318  643]
 [ 512 4527]]


In [None]:
# Uninstall conflicting packages
!pip uninstall -y shap scipy

# Install compatible versions
!pip install scipy==1.10.1
!pip install shap==0.42.1

Found existing installation: shap 0.42.1
Uninstalling shap-0.42.1:
  Successfully uninstalled shap-0.42.1
Found existing installation: scipy 1.16.2
Uninstalling scipy-1.16.2:
  Successfully uninstalled scipy-1.16.2
[31mERROR: Ignored the following yanked versions: 1.11.0, 1.14.0rc1[0m[31m
[0m[31mERROR: Ignored the following versions that require a different python version: 1.10.0 Requires-Python <3.12,>=3.8; 1.10.0rc1 Requires-Python <3.12,>=3.8; 1.10.0rc2 Requires-Python <3.12,>=3.8; 1.10.1 Requires-Python <3.12,>=3.8; 1.6.2 Requires-Python >=3.7,<3.10; 1.6.3 Requires-Python >=3.7,<3.10; 1.7.0 Requires-Python >=3.7,<3.10; 1.7.1 Requires-Python >=3.7,<3.10; 1.7.2 Requires-Python >=3.7,<3.11; 1.7.3 Requires-Python >=3.7,<3.11; 1.8.0 Requires-Python >=3.8,<3.11; 1.8.0rc1 Requires-Python >=3.8,<3.11; 1.8.0rc2 Requires-Python >=3.8,<3.11; 1.8.0rc3 Requires-Python >=3.8,<3.11; 1.8.0rc4 Requires-Python >=3.8,<3.11; 1.8.1 Requires-Python >=3.8,<3.11; 1.9.0 Requires-Python >=3.8,<3.12; 1.

In [None]:
# Pin NumPy and SHAP to fixed versions.
# NOTE(review): replacing NumPy under an already-running kernel usually needs a
# runtime restart before the new version is actually imported — confirm.
!pip install numpy==1.26.0 shap==0.42.1





In [None]:
import numpy as np


def _obj2sctype_compat(rep, default=None):
    """Stand-in for ``np.obj2sctype`` on NumPy builds that no longer ship it.

    Args:
        rep: A scalar type, an array, or anything ``np.dtype`` accepts.
        default: Value returned when ``rep`` cannot be mapped to a dtype.

    Returns:
        The NumPy scalar type for ``rep``, or ``default`` if there is none.
    """
    if isinstance(rep, type) and issubclass(rep, np.generic):
        return rep
    if isinstance(rep, np.ndarray):
        return rep.dtype.type
    try:
        return np.dtype(rep).type
    except Exception:
        return default


# Patch only when the attribute is really missing, so a NumPy that still
# provides obj2sctype keeps its own implementation.
if not hasattr(np, "obj2sctype"):
    np.obj2sctype = _obj2sctype_compat

# Imported after the shim so the name exists by the time SHAP is loaded.
import shap


**MODEL 2**


In [None]:


import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional

# Seed Python, NumPy and TensorFlow so weight initialisation and shuffling are
# repeatable from run to run.
tf.keras.utils.set_random_seed(42)

max_words = 10000  # vocabulary size kept by the tokenizer / embedding
max_len = 200      # every review is padded or truncated to this many tokens

# Binary target: positive -> 1, negative -> 0.
y = df["sentiment"].map({"positive": 1, "negative": 0}).values

# Split the texts BEFORE fitting the tokenizer so the vocabulary is learned
# from the training reviews only (no information from the test set leaks in).
# Same test_size / random_state as the split used for the baseline model.
train_texts, test_texts, y_train, y_test = train_test_split(
    df["cleaned_review"], y, test_size=0.2, random_state=42
)

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_texts)

# Integer-encode and pad each split. The TF-IDF matrix `X` from the baseline
# is intentionally not overwritten here.
X_train = pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=max_len)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=max_len)

# Embedding -> bidirectional LSTM -> dropout -> sigmoid output.
# An explicit Input layer replaces Embedding's deprecated `input_length`
# argument and lets summary() report real output shapes.
model = Sequential([
    Input(shape=(max_len,)),
    Embedding(input_dim=max_words, output_dim=128),
    Bidirectional(LSTM(64)),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# 10% of the training rows are held back for validation during fitting.
history = model.fit(
    X_train, y_train,
    epochs=5,
    batch_size=128,
    validation_split=0.1
)

loss, accuracy = model.evaluate(X_test, y_test)
print("LSTM Test Accuracy:", accuracy)




In [None]:

!pip uninstall -y shap scipy


!pip install scipy==1.10.1
!pip install shap==0.42.1

In [None]:
import shap
import numpy as np

# First 100 training rows, passed to DeepExplainer as its background data.
X_background = X_train[:100]

explainer = shap.DeepExplainer(model, X_background)

# Explain one held-out review; the slice keeps the 2-D (1, max_len) shape.
sample_review = X_test[0:1]
token_ids = sample_review[0]

shap_values = explainer.shap_values(sample_review)

# Depending on the SHAP version the result is either a list with one array per
# model output or a single array with a trailing output axis. Reduce both
# layouts to one attribution per token position.
if isinstance(shap_values, list):
    sample_attributions = np.asarray(shap_values[0])[0]
else:
    sample_attributions = np.asarray(shap_values)[0]
sample_attributions = sample_attributions.reshape(len(token_ids), -1)[:, 0]

# Index 0 is padding. Drop it from BOTH the words and the attributions so that
# each remaining SHAP value lines up with its own word.
is_token = token_ids != 0
index_word = {v: k for k, v in tokenizer.word_index.items()}
words = [index_word.get(i, '') for i in token_ids[is_token]]

# Base value as a plain float (expected_value may be an array or a tensor).
base_value = float(np.ravel(np.asarray(explainer.expected_value))[0])

shap.initjs()
shap.force_plot(base_value, sample_attributions[is_token], words)


In [None]:
import shap

In [None]:
Logistic Regression is a simple baseline model for sentiment analysis: it uses features such as bag-of-words or TF-IDF but ignores word order. The LSTM, on the other hand, is a deep learning model that captures word sequence and context, which can make it more accurate at understanding movie reviews.