Natural Language Processing (NLP) with Python

 In [1]:

import nltk #package for nlp
nltk.download('punkt')
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\neerajshinde\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Out[1]:
True
In [2]:
from nltk.tokenize import sent_tokenize
In [3]:
text = """Hello, Mr. Neeraj! How are you doing? The weather is Great and city is awesome.
The sky is pinkish-blue. You should not eat a mango"""
In [4]:
tokenized_sent = sent_tokenize(text)
tokenized_sent
Out[4]:
['Hello, Mr. Neeraj!',
 'How are you doing?',
 'The weather is Great and city is awesome.',
 'The sky is pinkish-blue.',
 'You should not eat a mango']
In [5]:
from nltk.tokenize import word_tokenize
word_tokens = word_tokenize(text)
word_tokens
Out[5]:
['Hello',
 ',',
 'Mr.',
 'Neeraj',
 '!',
 'How',
 'are',
 'you',
 'doing',
 '?',
 'The',
 'weather',
 'is',
 'Great',
 'and',
 'city',
 'is',
 'awesome',
 '.',
 'The',
 'sky',
 'is',
 'pinkish-blue',
 '.',
 'You',
 'should',
 'not',
 'eat',
 'a',
 'mango']
In [6]:
import gc
gc.collect() #helps us speed up the process 
Out[6]:
44
In [7]:
import inspect
inspect.getsource(word_tokenize)
Out[7]:
'def word_tokenize(text, language="english", preserve_line=False):\n    """\n    Return a tokenized copy of *text*,\n    using NLTK\'s recommended word tokenizer\n    (currently an improved :class:`.TreebankWordTokenizer`\n    along with :class:`.PunktSentenceTokenizer`\n    for the specified language).\n\n    :param text: text to split into words\n    :type text: str\n    :param language: the model name in the Punkt corpus\n    :type language: str\n    :param preserve_line: An option to keep the preserve the sentence and not sentence tokenize it.\n    :type preserve_line: bool\n    """\n    sentences = [text] if preserve_line else sent_tokenize(text, language)\n    return [\n        token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)\n    ]\n'
In [8]:
print(inspect.getsource(word_tokenize))
def word_tokenize(text, language="english", preserve_line=False):
    """
    Return a tokenized copy of *text*,
    using NLTK's recommended word tokenizer
    (currently an improved :class:`.TreebankWordTokenizer`
    along with :class:`.PunktSentenceTokenizer`
    for the specified language).

    :param text: text to split into words
    :type text: str
    :param language: the model name in the Punkt corpus
    :type language: str
    :param preserve_line: An option to keep the preserve the sentence and not sentence tokenize it.
    :type preserve_line: bool
    """
    sentences = [text] if preserve_line else sent_tokenize(text, language)
    return [
        token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)
    ]

In [9]:
from nltk.probability import FreqDist
fdist = FreqDist(word_tokens)
print(fdist)
<FreqDist with 26 samples and 30 outcomes>
In [10]:
fdist.most_common(3)
Out[10]:
[('is', 3), ('The', 2), ('.', 2)]
In [11]:
import matplotlib.pyplot as plt
fdist.plot(30, cumulative=False)
plt.show()
In [12]:
fdist.plot(30,cumulative=True)
plt.show()
In [13]:
len(word_tokens)
Out[13]:
30
In [14]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
print(stop_words)
{'yourself', 'this', 'does', 'any', "aren't", "couldn't", 'and', 'shouldn', 'of', "hasn't", 'where', 've', 'that', 'for', 'so', 'these', 'do', 'which', 'will', 'my', 'while', 'a', "you're", 'm', "shouldn't", 'each', 'down', 't', 'after', 'then', 'won', 'by', 'his', 'needn', 'if', 'over', 'aren', 'isn', 'into', 'o', 'shan', "don't", 'being', 'most', 'very', "didn't", 'doesn', "it's", 'couldn', "you'd", 'just', 'll', "hadn't", 'or', 'to', 'him', 'on', 'further', 'am', 'during', 'up', 'not', 'wasn', 'now', 'our', 'it', 'their', 'against', 'ourselves', 'hadn', 'only', 'the', "needn't", 'didn', 'i', 'too', "isn't", 'been', "shan't", 'theirs', 'ma', 'some', 'from', 'can', 'at', 'yours', 'myself', 'had', 'doing', 'are', 'more', 'herself', 'having', 'whom', 'here', 'ours', 'out', 'be', 'both', "she's", 's', "haven't", 'she', 'them', 'again', "wasn't", 'before', 'because', 'weren', 'mightn', 'is', 're', "doesn't", "that'll", 'they', 'such', 'themselves', "should've", 'about', 'same', 'nor', 'but', 'other', 'no', 'haven', 'hers', 'all', 'those', 'when', 'should', "you'll", 'has', 'until', 'we', 'between', 'above', 'why', 'what', 'he', 'below', 'her', 'was', 'have', 'were', 'off', 'mustn', 'through', "weren't", 'don', "mustn't", 'as', 'itself', 'himself', 'your', 'how', 'few', "wouldn't", 'who', 'did', 'in', 'you', 'with', 'wouldn', 'its', 'own', 'me', 'an', 'y', 'hasn', 'd', "you've", 'ain', 'once', 'yourselves', 'under', "won't", "mightn't", 'than', 'there'}
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ns45237\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
In [15]:
filtered_words = []
for w in word_tokens:
    if w.lower not in stop_words:
        filtered_words.append(w)
        
print("word token:", word_tokens)
print("filtered token:", filtered_words)
word token: ['Hello', ',', 'Mr.', 'Neeraj', '!', 'How', 'are', 'you', 'doing', '?', 'The', 'weather', 'is', 'Great', 'and', 'city', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', 'not', 'eat', 'a', 'mango']
filtered token: ['Hello', ',', 'Mr.', 'Neeraj', '!', 'How', 'are', 'you', 'doing', '?', 'The', 'weather', 'is', 'Great', 'and', 'city', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', 'not', 'eat', 'a', 'mango']
In [16]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
stemmed_word = []
for w in filtered_words:
    stemmed_word.append(ps.stem(w))
    
print(stemmed_word)
['hello', ',', 'mr.', 'neeraj', '!', 'how', 'are', 'you', 'do', '?', 'the', 'weather', 'is', 'great', 'and', 'citi', 'is', 'awesom', '.', 'the', 'sky', 'is', 'pinkish-blu', '.', 'you', 'should', 'not', 'eat', 'a', 'mango']
In [17]:
from nltk.stem.wordnet import WordNetLemmatizer #used to operate on words with different adjctives, tenses etc.
nltk.download('wordnet')
lem = WordNetLemmatizer()
word = 'ran' #ran gets transformed to run
lem.lemmatize(word, 'v')
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ns45237\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Out[17]:
'run'
In [18]:
lem.lemmatize('better','a') # better gets transformed to good - a=adjective
Out[18]:
'good'
In [19]:
lem.lemmatize('worse','a') #worse gets transformed to bad - a=adjective
Out[19]:
'bad'
In [20]:
lem.lemmatize('flew','v') #flew gets transformed to fly v=verb
Out[20]:
'fly'
In [21]:
#Consider another example
sent = "Albert Einstein was born in Ulm, Germanyin 1879."
tokens = word_tokenize(sent)
print(tokens)
['Albert', 'Einstein', 'was', 'born', 'in', 'Ulm', ',', 'Germanyin', '1879', '.']
In [22]:
nltk.download('averaged_perceptron_tagger')
nltk.pos_tag(tokens) #tells whether a word is a noun, verb
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ns45237\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
Out[22]:
[('Albert', 'NNP'),
 ('Einstein', 'NNP'),
 ('was', 'VBD'),
 ('born', 'VBN'),
 ('in', 'IN'),
 ('Ulm', 'NNP'),
 (',', ','),
 ('Germanyin', 'NNP'),
 ('1879', 'CD'),
 ('.', '.')]
In [23]:
import pandas as pd
import numpy as np
In [24]:
data = pd.read_csv('C:/Users/ns45237/working_neeraj/nlp_example/train.tsv', sep='\t') 
In [25]:
data.shape
Out[25]:
(156060, 4)
In [26]:
data.head()
Out[26]:
PhraseIdSentenceIdPhraseSentiment
011A series of escapades demonstrating the adage ...1
121A series of escapades demonstrating the adage ...2
231A series2
341A2
451series2
In [27]:
data.Sentiment.value_counts() #review/phase sentiments 1 is negative 2 is neutral 3 is slightly positive 4 is positive
Out[27]:
2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64
In [28]:
Sentiment_count = data.groupby('Sentiment').count()
Sentiment_count.head()
Out[28]:
PhraseIdSentenceIdPhrase
Sentiment
0707270727072
1272732727327273
2795827958279582
3329273292732927
4920692069206
In [29]:
plt.bar(Sentiment_count.index, Sentiment_count['Phrase'])
plt.xlabel('Sentiment')
plt.ylabel('No. of reviews')
plt.show()
In [30]:
plt.bar(['negative','somewhat negative','neutral','somewhat positive','positive'], Sentiment_count['Phrase'])
plt.xlabel('Sentiment')
plt.ylabel('No. of reviews')
plt.show()
In [31]:
data['Phrase'].iloc[0]
Out[31]:
'A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .'

Bag of Words Algorithm

In [41]:
#bag of words will interpret the documents/statements into the word counts
from sklearn.feature_extraction.text import CountVectorizer #used to create a vector for word counts
from nltk.tokenize import RegexpTokenizer #used to remove any non-alphanumeric characters from the token
token = RegexpTokenizer(r'[a-zA-Z0-9]+') #removed all special chars . , % etc. (r is raw string)
cv = CountVectorizer(lowercase=True, stop_words='english', ngram_range=(1,1), tokenizer=token.tokenize)
text_counts = cv.fit_transform(data['Phrase'])
In [40]:
type(text_counts) #output of Bag of Words is a sparse Matrix which we cannot use hence, we go for confusion matrix
Out[40]:
scipy.sparse.csr.csr_matrix
In [34]:
text_counts.shape #this is the data which we will feed to our algorithm to consume
Out[34]:
(156060, 14988)

Solving a Classification Problem using NLP (Confusion Matrix)

In [35]:
#Let us classify the fruit name (Sentiment) based on its probabilities of being sweet, big size etc.
#First let us test the Model and try to split it into training & test datasets
In [36]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(text_counts, data['Sentiment'], test_size = 0.3, random_state = 42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
(109242, 14988)
(46818, 14988)
(109242,)
(46818,)
In [37]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
In [39]:
from sklearn.metrics import accuracy_score, confusion_matrix
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(accuracy_score(y_pred=y_pred, y_true=y_test)) #shows that our Model is 60.64% accurate
[[  588  1032   430    57     6]
 [  505  3417  3826   446    34]
 [  169  2093 18609  2525   192]
 [   28   406  3985  4977   645]
 [    3    46   467  1528   804]]
0.6064975009611688
In [42]:
y_test
Out[42]:
95722     2
147312    4
36991     2
150211    2
140655    1
         ..
39479     2
136980    2
50777     3
75758     3
2424      1
Name: Sentiment, Length: 46818, dtype: int64

Sentiment Analysis: In various documents, certain words may repeat multiple times, also certain words may repeat multiple times in multiple documents. The sentiment and significant of these words will depend on their common occurances in documents.

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer #inverse document feature 
tf = TfidfVectorizer()
text_tf = tf.fit_transform(data['Phrase'])
In [46]:
type(text_tf) #output is a spare matrix i.e. cannot be directly used
Out[46]:
scipy.sparse.csr.csr_matrix
In [47]:
text_tf.shape
Out[47]:
(156060, 15240)
In [48]:
X_train, X_test, y_train, y_test = train_test_split(text_tf, data['Sentiment'], test_size = 0.3, random_state = 42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
(109242, 15240)
(46818, 15240)
(109242,)
(46818,)
In [49]:
clf1 = MultinomialNB()
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)
In [52]:
print(confusion_matrix(y_pred=y_pred, y_true=y_test)) #confusion matrix provides usable output
print(accuracy_score(y_pred=y_pred, y_true=y_test)) 
#accuracy is gone down to 58.15% which means that TfidfVectorizer is not better than MultinomialNB
[[   60   845  1177    31     0]
 [   29  1986  6011   201     1]
 [    9   884 21286  1390    19]
 [    0   114  6109  3768    50]
 [    0    12  1034  1676   126]]
0.5815284719552309
In [ ]:
 

Comments

Popular posts from this blog

Types of Data Analysis

MovieLens Case Study with Python

Data Pre-processing for Machine Learning