# Text Classification with spaCy



In [1]:
from google.colab import drive
drive.mount('/content/drive')  # Add My Drive/<>

import os
os.chdir('drive/My Drive')
os.chdir('Books_Writings/NLPBook/')

Mounted at /content/drive


In [2]:
%%capture
%pylab inline
import pandas as pd
import os

## Using spaCy

[spaCy](https://spacy.io) has an excellent pipeline for doing text classification. We will learn about this pipeline here.

We will also use scikit learn. https://scikit-learn.org/stable/

In [3]:
%pylab inline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
import pandas as pd

Populating the interactive namespace from numpy and matplotlib


In [4]:
# Loading CSV file
df = pd.read_csv("NLP_data/movie_review.csv")
# View data information
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         5000 non-null   object
 1   sentiment  5000 non-null   int64 
 2   review     5000 non-null   object
dtypes: int64(1), object(2)
memory usage: 117.3+ KB
None


Unnamed: 0,id,sentiment,review
0,10000_8,1,Homelessness (or Houselessness as George Carli...
1,10001_4,0,This film lacked something I couldn't put my f...
2,10004_3,0,"\""It appears that many critics find the idea o..."
3,10004_8,1,"This isn't the comedic Robin Williams, nor is ..."
4,10006_4,0,"I don't know who to blame, the timid writers o..."


In [5]:
# Feedback Value count
df.sentiment.value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
1,2517
0,2483


In [6]:
!pip install spacy --quiet
!python -m spacy download en_core_web_sm
!python -m spacy download en

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m49.4 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the
full pipeline package name 'en_core_web_sm' instead.[0m
Collecting en-core-web-sm==3.7.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and ins

In [7]:
# Set up various spaCY stuff
import spacy
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load('en_core_web_sm')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = nlp(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

In [8]:
# Custom transformer using spaCy
class prepare_data(TransformerMixin):
    def transform(self, X, **transform_params):
        # Cleaning Text
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

# Basic function to clean the text
def clean_text(text):
    # Removing spaces and converting text into lowercase
    return text.strip().lower()

In [9]:
bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [10]:
df.columns

Index(['id', 'sentiment', 'review'], dtype='object')

In [11]:
from sklearn.model_selection import train_test_split

X = df['review'] # the features we want to analyze
ylabels = df['sentiment'] # the labels, or answers, we want to test against

X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3)

In [12]:
%%time
# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(penalty=None, max_iter=1000, tol=0.001)

CPU times: user 197 ms, sys: 23.1 ms, total: 220 ms
Wall time: 359 ms


In [13]:
%%time
# Create pipeline
pipe = Pipeline([("cleaner", prepare_data()),
                 ('vectorizer', tfidf_vector),   # replace with tf_idf, or bow to try
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)



CPU times: user 3min 16s, sys: 3.5 s, total: 3min 19s
Wall time: 3min 36s


In [14]:
%%time
from sklearn import metrics
# Predicting with a test dataset
predicted = pipe.predict(X_test)

# Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

Logistic Regression Accuracy: 0.838
Logistic Regression Precision: 0.8333333333333334
Logistic Regression Recall: 0.8560311284046692
CPU times: user 1min 20s, sys: 264 ms, total: 1min 20s
Wall time: 1min 28s


In [15]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, predicted)
acc = sum(diag(cm))/sum(cm)
print("acc =",acc)
print(cm)

acc = 0.838
[[597 132]
 [111 660]]
