Movie Reviews (Text) Classification Using NLTK
This post gives a code walkthrough (suitable for beginners in NLP) of a text classification example using the movie reviews corpus from the NLTK library.
Since this is a coding example with little theory, I recommend copying the code step by step into your Jupyter notebook or Colab and running it alongside the post for better understanding.
Steps
- Downloads
- Imports
- Data loading
- Data Preprocessing
  a. Join the words of each review into a single string (just for easier handling) and then word-tokenize
  b. convert to lower case
  c. remove non-alphabetic tokens
  d. remove stop words
  e. lemmatization/stemming
- Bag of words using CountVectorizer/TfidfVectorizer (or) Gensim → Word2Vec embeddings
- Convert y into categorical
- Create a data frame with X and y
- Train test split
- Train an ML classification model
  a. Logistic Regression
  b. Decision Tree
  c. KNN Classifier
  d. Random Forest
  e. SVM
  f. MultinomialNB
- Print accuracy, confusion matrix, classification report
#1. Downloads
import nltk
nltk.download('movie_reviews')
nltk.download('punkt')
nltk.download('stopwords')
#2 Imports
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk import word_tokenize,sent_tokenize
from nltk.stem import PorterStemmer,LancasterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB,BernoulliNB,GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix
#3 Data Loading
mr = movie_reviews.sents()
#print(mr)
#print(len(mr))
#print(movie_reviews.categories())
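Before building features, it can help to peek at what the corpus contains. A small, optional sanity check along these lines (sample_id is just an illustrative variable name, not part of the walkthrough):

# Peek at the corpus: categories, number of files per class, and one raw review
print(movie_reviews.categories())           # ['neg', 'pos']
print(len(movie_reviews.fileids('pos')))    # 1000 positive reviews
print(len(movie_reviews.fileids('neg')))    # 1000 negative reviews
sample_id = movie_reviews.fileids('pos')[0]
print(movie_reviews.raw(sample_id)[:300])   # first 300 characters of one review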
#Negative Reviews
nr = []
for fileid in movie_reviews.fileids('neg'):
    words = movie_reviews.words(fileid)
    nr.append(" ".join(words))
#Positive Reviews
pr = []
for fileid in movie_reviews.fileids('pos'):
    words = movie_reviews.words(fileid)
    pr.append(" ".join(words))
print(len(nr), len(pr))
#total_reviews
tr = pr+nr
print(len(tr))
df = pd.DataFrame(data=tr, columns=['review'])
print(df.shape)
print(df.head())
#4.b convert to lower case
#4.c remove non alphabetic
#4.d remove stop words
#4.e lemmatization/stemming
enriched_reviews = []
ps = PorterStemmer()
stp = stopwords.words('english')
counter = 1
for sent in df['review']:
    # keep alphabetic tokens longer than 2 chars, drop stop words, lower-case and stem
    word_list = [ps.stem(w.lower()) for w in nltk.word_tokenize(sent)
                 if w.isalpha() and w.lower() not in stp and len(w) > 2]
    new_sent = " ".join(word_list)
    enriched_reviews.append(new_sent)
    print(counter, end=",")  # simple progress indicator
    if np.mod(counter, 100) == 0:
        print()
    counter += 1
df['enriched_reviews'] = enriched_reviews
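The imports in step 2 include WordNetLemmatizer, but the loop above uses PorterStemmer. If you would rather lemmatize than stem, a minimal variant of the same loop could look like the sketch below (it additionally needs the WordNet data; this is an optional alternative, not part of the original walkthrough):

# Optional: lemmatization instead of stemming (requires the WordNet data)
nltk.download('wordnet')
wnl = WordNetLemmatizer()
lemmatized_reviews = []
for sent in df['review']:
    word_list = [wnl.lemmatize(w.lower()) for w in nltk.word_tokenize(sent)
                 if w.isalpha() and w.lower() not in stp and len(w) > 2]
    lemmatized_reviews.append(" ".join(word_list))
#df['enriched_reviews'] = lemmatized_reviews  # uncomment to use lemmas instead of stems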
#5. Bag of words using CountVectorizer/TfidfVectorizer (or) Gensim → Word2Vec embeddings
cv = CountVectorizer(min_df=2)
X_cv = cv.fit_transform(df['enriched_reviews'])
X_names = cv.get_feature_names_out()  # on older scikit-learn versions use get_feature_names()
#print(X_cv.toarray())
print(X_names)
print('-' * 50)
X = pd.DataFrame(X_cv.toarray(), columns=X_names)
print(X.head())
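Step 5 also mentions TfidfVectorizer and Gensim Word2Vec as alternatives to plain counts, but only CountVectorizer is shown above. Two brief sketches follow; Gensim is not imported in step 2, so it would need to be installed separately, and the parameter values and variable names here are just illustrative (vector_size is the gensim 4.x name; older versions call it size).

# Alternative 1: TF-IDF weighted bag of words (drop-in replacement for CountVectorizer)
tfidf = TfidfVectorizer(min_df=2)
X_tfidf = tfidf.fit_transform(df['enriched_reviews'])
X_tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf.get_feature_names_out())

# Alternative 2: average Word2Vec embeddings per review (needs: pip install gensim)
from gensim.models import Word2Vec
tokenized = [r.split() for r in df['enriched_reviews']]
w2v = Word2Vec(sentences=tokenized, vector_size=100, window=5, min_count=2)
X_w2v = np.array([
    np.mean([w2v.wv[w] for w in tokens if w in w2v.wv] or [np.zeros(100)], axis=0)
    for tokens in tokenized
])

Either X_tfidf_df or X_w2v could replace the bag-of-words X used below; note that MultinomialNB expects non-negative features, so it is not a good fit for the dense Word2Vec vectors.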
#6. convert y into categorical
pflag = np.ones(1000,dtype=int)
nflag = np.zeros(1000,dtype= int)
tflag = np.concatenate((pflag,nflag),axis=0)
print(tflag.shape)
y = pd.Series(tflag)
#7. create a data frame with X and y (X is already a DataFrame and y a Series, so nothing more is needed here)
#8. Train test Split
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=5,test_size=0.3)
print(X_train.shape,X_test.shape)
"""
9. Train an ML classification Model
   a. Logistic Regression
   b. Decision Tree
   c. KNN Classifier
   d. Random Forest
   e. Multinomial Naive Bayes
10. Print accuracy, confusion matrix, classification report
"""
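The five blocks below repeat the same fit/predict/print pattern, and step 10 also asks for a classification report, which the blocks never print. If you prefer less repetition, a small helper along these lines would cover it (evaluate is a hypothetical name, not part of the original code); the walkthrough keeps the explicit per-model blocks for clarity.

# Optional helper: fit a model, then print accuracy, confusion matrix and classification report
def evaluate(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print('accuracy:{}'.format(accuracy_score(y_test, y_pred)))
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

# Example usage: evaluate(LogisticRegression(max_iter=500), X_train, X_test, y_train, y_test)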
#9.a Logistic Regression
lr = LogisticRegression(solver='lbfgs', max_iter=500, n_jobs=10, verbose=1)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print('accuracy:{}'.format(acc))
print(confusion_matrix(y_test, y_pred))
#9.b Decision Tree
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print('accuracy:{}'.format(acc))
print(confusion_matrix(y_test, y_pred))
#9.c KNN Classifier
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print('accuracy:{}'.format(acc))
print(confusion_matrix(y_test, y_pred))
#9.d Random Forest
rf = RandomForestClassifier(verbose=1, n_estimators=200, n_jobs=100)  # n_jobs=-1 (all cores) is usually enough
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print('accuracy:{}'.format(acc))
print(confusion_matrix(y_test, y_pred))
#9.e Multinomial Naive Bayes
nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print('accuracy:{}'.format(acc))
print(confusion_matrix(y_test, y_pred))
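SVC is imported in step 2 and SVM appears in the steps list, but no SVM model is trained above. A minimal sketch of how it would slot into the same pattern (the linear kernel is just a common default for high-dimensional bag-of-words features, not something the post specifies), with the classification report from step 10 included:

# SVM (not trained in the blocks above; kernel choice is illustrative)
svm = SVC(kernel='linear')
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print('accuracy:{}'.format(acc))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))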