import nltk #package for nlp
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\neerajshinde\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!

True

from nltk.tokenize import sent_tokenize

text = """Hello, Mr. Neeraj! How are you doing? The weather is Great and city is awesome.
The sky is pinkish-blue. You should not eat a mango"""

tokenized_sent = sent_tokenize(text)
tokenized_sent

['Hello, Mr. Neeraj!',
 'How are you doing?',
 'The weather is Great and city is awesome.',
 'The sky is pinkish-blue.',
 'You should not eat a mango']

from nltk.tokenize import word_tokenize
word_tokens = word_tokenize(text)
word_tokens

['Hello',
 ',',
 'Mr.',
 'Neeraj',
 '!',
 'How',
 'are',
 'you',
 'doing',
 '?',
 'The',
 'weather',
 'is',
 'Great',
 'and',
 'city',
 'is',
 'awesome',
 '.',
 'The',
 'sky',
 'is',
 'pinkish-blue',
 '.',
 'You',
 'should',
 'not',
 'eat',
 'a',
 'mango']

import gc
gc.collect() #helps us speed up the process

44

import inspect
inspect.getsource(word_tokenize)

'def word_tokenize(text, language="english", preserve_line=False):\n    """\n    Return a tokenized copy of *text*,\n    using NLTK\'s recommended word tokenizer\n    (currently an improved :class:`.TreebankWordTokenizer`\n    along with :class:`.PunktSentenceTokenizer`\n    for the specified language).\n\n    :param text: text to split into words\n    :type text: str\n    :param language: the model name in the Punkt corpus\n    :type language: str\n    :param preserve_line: An option to keep the preserve the sentence and not sentence tokenize it.\n    :type preserve_line: bool\n    """\n    sentences = [text] if preserve_line else sent_tokenize(text, language)\n    return [\n        token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)\n    ]\n'

print(inspect.getsource(word_tokenize))

def word_tokenize(text, language="english", preserve_line=False):
    """
    Return a tokenized copy of *text*,
    using NLTK's recommended word tokenizer
    (currently an improved :class:`.TreebankWordTokenizer`
    along with :class:`.PunktSentenceTokenizer`
    for the specified language).

    :param text: text to split into words
    :type text: str
    :param language: the model name in the Punkt corpus
    :type language: str
    :param preserve_line: An option to keep the preserve the sentence and not sentence tokenize it.
    :type preserve_line: bool
    """
    sentences = [text] if preserve_line else sent_tokenize(text, language)
    return [
        token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)
    ]

from nltk.probability import FreqDist
fdist = FreqDist(word_tokens)
print(fdist)

<FreqDist with 26 samples and 30 outcomes>

fdist.most_common(3)

[('is', 3), ('The', 2), ('.', 2)]

import matplotlib.pyplot as plt
fdist.plot(30, cumulative=False)
plt.show()

fdist.plot(30,cumulative=True)
plt.show()

len(word_tokens)

30

from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
print(stop_words)

{'yourself', 'this', 'does', 'any', "aren't", "couldn't", 'and', 'shouldn', 'of', "hasn't", 'where', 've', 'that', 'for', 'so', 'these', 'do', 'which', 'will', 'my', 'while', 'a', "you're", 'm', "shouldn't", 'each', 'down', 't', 'after', 'then', 'won', 'by', 'his', 'needn', 'if', 'over', 'aren', 'isn', 'into', 'o', 'shan', "don't", 'being', 'most', 'very', "didn't", 'doesn', "it's", 'couldn', "you'd", 'just', 'll', "hadn't", 'or', 'to', 'him', 'on', 'further', 'am', 'during', 'up', 'not', 'wasn', 'now', 'our', 'it', 'their', 'against', 'ourselves', 'hadn', 'only', 'the', "needn't", 'didn', 'i', 'too', "isn't", 'been', "shan't", 'theirs', 'ma', 'some', 'from', 'can', 'at', 'yours', 'myself', 'had', 'doing', 'are', 'more', 'herself', 'having', 'whom', 'here', 'ours', 'out', 'be', 'both', "she's", 's', "haven't", 'she', 'them', 'again', "wasn't", 'before', 'because', 'weren', 'mightn', 'is', 're', "doesn't", "that'll", 'they', 'such', 'themselves', "should've", 'about', 'same', 'nor', 'but', 'other', 'no', 'haven', 'hers', 'all', 'those', 'when', 'should', "you'll", 'has', 'until', 'we', 'between', 'above', 'why', 'what', 'he', 'below', 'her', 'was', 'have', 'were', 'off', 'mustn', 'through', "weren't", 'don', "mustn't", 'as', 'itself', 'himself', 'your', 'how', 'few', "wouldn't", 'who', 'did', 'in', 'you', 'with', 'wouldn', 'its', 'own', 'me', 'an', 'y', 'hasn', 'd', "you've", 'ain', 'once', 'yourselves', 'under', "won't", "mightn't", 'than', 'there'}

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ns45237\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

filtered_words = []
for w in word_tokens:
    if w.lower not in stop_words:
        filtered_words.append(w)
        
print("word token:", word_tokens)
print("filtered token:", filtered_words)

word token: ['Hello', ',', 'Mr.', 'Neeraj', '!', 'How', 'are', 'you', 'doing', '?', 'The', 'weather', 'is', 'Great', 'and', 'city', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', 'not', 'eat', 'a', 'mango']
filtered token: ['Hello', ',', 'Mr.', 'Neeraj', '!', 'How', 'are', 'you', 'doing', '?', 'The', 'weather', 'is', 'Great', 'and', 'city', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', 'not', 'eat', 'a', 'mango']

from nltk.stem import PorterStemmer
ps = PorterStemmer()
stemmed_word = []
for w in filtered_words:
    stemmed_word.append(ps.stem(w))
    
print(stemmed_word)

['hello', ',', 'mr.', 'neeraj', '!', 'how', 'are', 'you', 'do', '?', 'the', 'weather', 'is', 'great', 'and', 'citi', 'is', 'awesom', '.', 'the', 'sky', 'is', 'pinkish-blu', '.', 'you', 'should', 'not', 'eat', 'a', 'mango']

from nltk.stem.wordnet import WordNetLemmatizer #used to operate on words with different adjctives, tenses etc.
nltk.download('wordnet')
lem = WordNetLemmatizer()
word = 'ran' #ran gets transformed to run
lem.lemmatize(word, 'v')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ns45237\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!

'run'

lem.lemmatize('better','a') # better gets transformed to good - a=adjective

'good'

lem.lemmatize('worse','a') #worse gets transformed to bad - a=adjective

'bad'

lem.lemmatize('flew','v') #flew gets transformed to fly v=verb

'fly'

#Consider another example
sent = "Albert Einstein was born in Ulm, Germanyin 1879."
tokens = word_tokenize(sent)
print(tokens)

['Albert', 'Einstein', 'was', 'born', 'in', 'Ulm', ',', 'Germanyin', '1879', '.']

nltk.download('averaged_perceptron_tagger')
nltk.pos_tag(tokens) #tells whether a word is a noun, verb

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ns45237\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!

[('Albert', 'NNP'),
 ('Einstein', 'NNP'),
 ('was', 'VBD'),
 ('born', 'VBN'),
 ('in', 'IN'),
 ('Ulm', 'NNP'),
 (',', ','),
 ('Germanyin', 'NNP'),
 ('1879', 'CD'),
 ('.', '.')]

import pandas as pd
import numpy as np

data = pd.read_csv('C:/Users/ns45237/working_neeraj/nlp_example/train.tsv', sep='\t')

data.shape

(156060, 4)

data.head()

data.Sentiment.value_counts() #review/phase sentiments 1 is negative 2 is neutral 3 is slightly positive 4 is positive

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

Sentiment_count = data.groupby('Sentiment').count()
Sentiment_count.head()

plt.bar(Sentiment_count.index, Sentiment_count['Phrase'])
plt.xlabel('Sentiment')
plt.ylabel('No. of reviews')
plt.show()

plt.bar(['negative','somewhat negative','neutral','somewhat positive','positive'], Sentiment_count['Phrase'])
plt.xlabel('Sentiment')
plt.ylabel('No. of reviews')
plt.show()

data['Phrase'].iloc[0]

'A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .'

Bag of Words Algorithm

#bag of words will interpret the documents/statements into the word counts
from sklearn.feature_extraction.text import CountVectorizer #used to create a vector for word counts
from nltk.tokenize import RegexpTokenizer #used to remove any non-alphanumeric characters from the token
token = RegexpTokenizer(r'[a-zA-Z0-9]+') #removed all special chars . , % etc. (r is raw string)
cv = CountVectorizer(lowercase=True, stop_words='english', ngram_range=(1,1), tokenizer=token.tokenize)
text_counts = cv.fit_transform(data['Phrase'])

type(text_counts) #output of Bag of Words is a sparse Matrix which we cannot use hence, we go for confusion matrix

scipy.sparse.csr.csr_matrix

text_counts.shape #this is the data which we will feed to our algorithm to consume

(156060, 14988)

Solving a Classification Problem using NLP (Confusion Matrix)

#Let us classify the fruit name (Sentiment) based on its probabilities of being sweet, big size etc.
#First let us test the Model and try to split it into training & test datasets

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(text_counts, data['Sentiment'], test_size = 0.3, random_state = 42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(109242, 14988)
(46818, 14988)
(109242,)
(46818,)

from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

from sklearn.metrics import accuracy_score, confusion_matrix
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(accuracy_score(y_pred=y_pred, y_true=y_test)) #shows that our Model is 60.64% accurate

[[  588  1032   430    57     6]
 [  505  3417  3826   446    34]
 [  169  2093 18609  2525   192]
 [   28   406  3985  4977   645]
 [    3    46   467  1528   804]]
0.6064975009611688

y_test

95722     2
147312    4
36991     2
150211    2
140655    1
         ..
39479     2
136980    2
50777     3
75758     3
2424      1
Name: Sentiment, Length: 46818, dtype: int64

Sentiment Analysis: In various documents, certain words may repeat multiple times, also certain words may repeat multiple times in multiple documents. The sentiment and significant of these words will depend on their common occurances in documents.

from sklearn.feature_extraction.text import TfidfVectorizer #inverse document feature 
tf = TfidfVectorizer()
text_tf = tf.fit_transform(data['Phrase'])

type(text_tf) #output is a spare matrix i.e. cannot be directly used

scipy.sparse.csr.csr_matrix

text_tf.shape

(156060, 15240)

X_train, X_test, y_train, y_test = train_test_split(text_tf, data['Sentiment'], test_size = 0.3, random_state = 42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(109242, 15240)
(46818, 15240)
(109242,)
(46818,)

clf1 = MultinomialNB()
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)

print(confusion_matrix(y_pred=y_pred, y_true=y_test)) #confusion matrix provides usable output
print(accuracy_score(y_pred=y_pred, y_true=y_test)) 
#accuracy is gone down to 58.15% which means that TfidfVectorizer is not better than MultinomialNB

[[   60   845  1177    31     0]
 [   29  1986  6011   201     1]
 [    9   884 21286  1390    19]
 [    0   114  6109  3768    50]
 [    0    12  1034  1676   126]]
0.5815284719552309

Search This Blog

Data Science Experiments

Natural Language Processing (NLP) with Python

Comments

Post a Comment

Popular posts from this blog

MovieLens Case Study with Python

Types of Data Analysis

Data Pre-processing for Machine Learning

	PhraseId	SentenceId	Phrase	Sentiment
0	1	1	A series of escapades demonstrating the adage ...	1
1	2	1	A series of escapades demonstrating the adage ...	2
2	3	1	A series	2
3	4	1	A	2
4	5	1	series	2

	PhraseId	SentenceId	Phrase
Sentiment
0	7072	7072	7072
1	27273	27273	27273
2	79582	79582	79582
3	32927	32927	32927
4	9206	9206	9206