Bhattaram Manojkumar
4 min read · May 24, 2021

Movie Reviews (Text) Classification Using NLTK

This post gives you a code walkthrough (suitable for beginners in NLP) of a text classification example using the movie reviews corpus from the NLTK library.

Since this is a coding example without much theory, I recommend copying the code step by step into your Jupyter notebook or Colab and running it alongside the post for better understanding.

Steps

  1. Downloads
  2. Imports
  3. Data loading
  4. Data Preprocessing
    a. Join the words of each review into a single string (just for better understanding) and then word-tokenize it
    b. remove non-alphabetic tokens
    c. convert to lower case
    d. remove stop words
    e. lemmatization/stemming
  5. Bag of words using CountVectorizer/TfidfVectorizer (or) Gensim → Word2Vec embeddings
  6. convert y into categorical
  7. create a data frame with X and y
  8. Train test Split
  9. Train an ML classification Model
    a. Logistic Regression
    b. Decision Tree
    c. KNN Classifier
    d. Random Forest
    e. SVM
    f. MultinomialNB
  10. Print accuracy, confusion matrix, classification report

#1. Downloads

import nltk

# Corpus with the labelled reviews (file ids are grouped under 'neg' / 'pos').
nltk.download('movie_reviews')

# Tokenizer models used by nltk.word_tokenize in the pre-processing step.
nltk.download('punkt')

# Recent NLTK releases look up the tokenizer data under 'punkt_tab' instead of
# 'punkt'; fetching both keeps the walkthrough working on old and new versions.
nltk.download('punkt_tab')

# English stop-word list used to filter tokens in the pre-processing step.
nltk.download('stopwords')

#2 Imports

import numpy as np

import pandas as pd

import nltk

from nltk.corpus import movie_reviews

from nltk.corpus import stopwords

from nltk import word_tokenize,sent_tokenize

from nltk.stem import PorterStemmer,LancasterStemmer, WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier

from sklearn.neighbors import KNeighborsClassifier

from sklearn.svm import SVC

from sklearn.naive_bayes import MultinomialNB,BernoulliNB,GaussianNB

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score,classification_report, confusion_matrix

#3 Data Loading

# Sentence-level view of the corpus; only used by the optional exploratory
# prints below.
mr = movie_reviews.sents()
# print(mr)
# print(len(mr))
# print(movie_reviews.categories())

# Each review lives in its own file. movie_reviews.words(fileid) returns the
# review as a sequence of tokens, which is joined back into one string per review.

# Negative reviews
nr = [" ".join(movie_reviews.words(fileid))
      for fileid in movie_reviews.fileids('neg')]

# Positive reviews
pr = [" ".join(movie_reviews.words(fileid))
      for fileid in movie_reviews.fileids('pos')]

print(len(nr), len(pr))

# total_reviews: positives first, then negatives. The label vector built in
# step 6 is concatenated in the same order, so do not reorder this.
tr = pr + nr
print(len(tr))

df = pd.DataFrame(data=tr, columns=['review'])
print(df.shape)
print(df.head())

#4.b remove non alphabetic
#4.c. convert to lower case
#4.d. remove stop words
#4.e. lemmatization/stemming

enriched_reviews = []
ps = PorterStemmer()

# A set gives constant-time membership tests; the lookup runs once per token.
stp = set(stopwords.words('english'))

for counter, sent in enumerate(df['review'], start=1):
    word_list = []
    for w in nltk.word_tokenize(sent):
        # Lower-case BEFORE the stop-word lookup so capitalised tokens such as
        # "The" are filtered as well.
        w = w.lower()
        # Keep alphabetic tokens longer than two characters that are not stop words.
        if w.isalpha() and len(w) > 2 and w not in stp:
            word_list.append(ps.stem(w))

    enriched_reviews.append(" ".join(word_list))

    # Progress indicator: running review number, with a line break every 100.
    print(counter, end=",")
    if counter % 100 == 0:
        print()

# One cleaned, stemmed, space-joined string per review, aligned with df['review'].
df['enriched_reviews'] = enriched_reviews

#5. Bag of words using CountVectorizer/TfidfVectorizer (or) Gensim → Word2Vec embeddings

# min_df=2 drops terms that appear in fewer than two reviews.
cv = CountVectorizer(min_df=2)
X_cv = cv.fit_transform(df['enriched_reviews'])

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement. It returns a NumPy array, so convert
# to a plain list of str to keep the printed output and column labels as before.
X_names = cv.get_feature_names_out().tolist()

#print(X_cv.toarray())
print(X_names)
print(' — — — — — — — — — — — — — — — — — — — — — — — — — ')

# Dense document-term matrix: one row per review, one column per vocabulary term.
X = pd.DataFrame(X_cv.toarray(), columns=X_names)
print(X.head())

#6. convert y into categorical

# Label 1 = positive, 0 = negative. The sizes come from the review lists
# themselves rather than a hard-coded 1000, so the labels always line up with
# tr = pr + nr (positives first, then negatives).
pflag = np.ones(len(pr), dtype=int)
nflag = np.zeros(len(nr), dtype=int)

tflag = np.concatenate((pflag, nflag), axis=0)
print(tflag.shape)

y = pd.Series(tflag)

#7. create a data frame with X and y
#   (no combined frame is built here; X and y are passed to the split separately)

#8. Train test Split

# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split repeatable between runs.
split_parts = train_test_split(X, y, test_size=0.3, random_state=5)
X_train, X_test, y_train, y_test = split_parts

print(X_train.shape, X_test.shape)

# 9. Train an ML classification Model
#    a. Logistic Regression
#    b. Decision Tree
#    c. KNN Classifier
#    d. Random Forest
#    e. SVM
#    f. MultinomialNB
# 10. Print accuracy, confusion matrix, classification report

# Classifiers compared in this walkthrough. Each keeps its own name so it can be
# inspected afterwards. (SVC is imported at the top of the file but is not
# trained here.)

# n_jobs is not passed to LogisticRegression: it only parallelises over classes,
# which does nothing for this two-class problem.
lr = LogisticRegression(solver='lbfgs', max_iter=500, verbose=1)
dt = DecisionTreeClassifier()
knn = KNeighborsClassifier()
# n_jobs=-1 uses every available core instead of requesting a fixed 100 workers.
rf = RandomForestClassifier(verbose=1, n_estimators=200, n_jobs=-1)
nb = MultinomialNB()

# Same fit / predict / report cycle for every model, on the same train/test split.
for model in (lr, dt, knn, rf, nb):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    # Name the model so the five result blocks can be told apart in the output.
    print(type(model).__name__)
    print('accuracy:{}'.format(acc))
    print(confusion_matrix(y_test, y_pred))
    # Per-class precision / recall / F1, as promised in step 10.
    print(classification_report(y_test, y_pred))

No responses yet