Natural Language Processing (NLP) with Python
In [1]:
import nltk #package for nlp
nltk.download('punkt')
Out[1]:
In [2]:
from nltk.tokenize import sent_tokenize
In [3]:
text = """Hello, Mr. Neeraj! How are you doing? The weather is Great and city is awesome.
The sky is pinkish-blue. You should not eat a mango"""
In [4]:
tokenized_sent = sent_tokenize(text)
tokenized_sent
Out[4]:
In [5]:
from nltk.tokenize import word_tokenize
word_tokens = word_tokenize(text)
word_tokens
Out[5]:
In [6]:
import gc
gc.collect() #force a garbage-collection pass to free unreferenced memory
Out[6]:
In [7]:
import inspect
inspect.getsource(word_tokenize)
Out[7]:
In [8]:
print(inspect.getsource(word_tokenize))
In [9]:
from nltk.probability import FreqDist
fdist = FreqDist(word_tokens)
print(fdist)
In [10]:
fdist.most_common(3)
Out[10]:
In [11]:
import matplotlib.pyplot as plt
fdist.plot(30, cumulative=False)
plt.show()
In [12]:
fdist.plot(30,cumulative=True)
plt.show()
In [13]:
len(word_tokens)
Out[13]:
In [14]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
print(stop_words)
In [15]:
filtered_words = []
for w in word_tokens:
    if w.lower() not in stop_words:
        filtered_words.append(w)
print("word tokens:", word_tokens)
print("filtered tokens:", filtered_words)
In [16]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
stemmed_word = []
for w in filtered_words:
    stemmed_word.append(ps.stem(w))
print(stemmed_word)
In [17]:
from nltk.stem.wordnet import WordNetLemmatizer #reduces words with different tenses, adjective forms etc. to their dictionary form
nltk.download('wordnet')
lem = WordNetLemmatizer()
word = 'ran' #ran gets transformed to run
lem.lemmatize(word, 'v')
Out[17]:
In [18]:
lem.lemmatize('better','a') # better gets transformed to good - a=adjective
Out[18]:
In [19]:
lem.lemmatize('worse','a') #worse gets transformed to bad - a=adjective
Out[19]:
In [20]:
lem.lemmatize('flew','v') #flew gets transformed to fly v=verb
Out[20]:
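For contrast, here is a minimal sketch (assuming `ps` and `lem` from the cells above) comparing the stemmer and the lemmatizer on a few verbs: lemmatization returns dictionary forms, while stemming only strips suffixes.
In [ ]:
# Compare PorterStemmer and WordNetLemmatizer on the same words (treated as verbs)
for w in ['ran', 'flew', 'studies', 'running']:
    print(w, '-> stem:', ps.stem(w), '| lemma:', lem.lemmatize(w, 'v'))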
In [21]:
#Consider another example
sent = "Albert Einstein was born in Ulm, Germanyin 1879."
tokens = word_tokenize(sent)
print(tokens)
In [22]:
nltk.download('averaged_perceptron_tagger')
nltk.pos_tag(tokens) #part-of-speech tagging: labels each word as a noun, verb, etc.
Out[22]:
In [23]:
import pandas as pd
import numpy as np
In [24]:
data = pd.read_csv('C:/Users/ns45237/working_neeraj/nlp_example/train.tsv', sep='\t')
In [25]:
data.shape
Out[25]:
In [26]:
data.head()
Out[26]:
In [27]:
data.Sentiment.value_counts() #phrase sentiment labels: 0 = negative, 1 = somewhat negative, 2 = neutral, 3 = somewhat positive, 4 = positive
Out[27]:
In [28]:
Sentiment_count = data.groupby('Sentiment').count()
Sentiment_count.head()
Out[28]:
In [29]:
plt.bar(Sentiment_count.index, Sentiment_count['Phrase'])
plt.xlabel('Sentiment')
plt.ylabel('No. of reviews')
plt.show()
In [30]:
plt.bar(['negative','somewhat negative','neutral','somewhat positive','positive'], Sentiment_count['Phrase'])
plt.xlabel('Sentiment')
plt.ylabel('No. of reviews')
plt.show()
In [31]:
data['Phrase'].iloc[0]
Out[31]:
Bag of Words Algorithm
In [41]:
#Bag of Words represents each document/phrase as a vector of word counts
from sklearn.feature_extraction.text import CountVectorizer #used to create a vector for word counts
from nltk.tokenize import RegexpTokenizer #used to remove any non-alphanumeric characters from the token
token = RegexpTokenizer(r'[a-zA-Z0-9]+') #keeps only alphanumeric tokens, dropping special chars like . , % (r marks a raw string)
cv = CountVectorizer(lowercase=True, stop_words='english', ngram_range=(1,1), tokenizer=token.tokenize)
text_counts = cv.fit_transform(data['Phrase'])
In [40]:
type(text_counts) #the output of CountVectorizer is a SciPy sparse matrix; scikit-learn models can consume it directly
Out[40]:
In [34]:
text_counts.shape #this is the feature matrix we will feed to the classifier
Out[34]:
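To see what the sparse matrix actually contains, here is a minimal sketch (assuming `cv` and `text_counts` from the cells above) that prints the vocabulary size and the non-zero counts of the first phrase. Note that `get_feature_names_out` is the scikit-learn ≥ 1.0 name; older versions expose `get_feature_names` instead.
In [ ]:
# Inspect the bag-of-words output: vocabulary size and the first row's non-zero counts
vocab = cv.get_feature_names_out()       # array of feature (word) names
print(len(vocab))                        # number of distinct tokens kept
row = text_counts[0].toarray().ravel()   # densify a single row only (cheap)
nonzero = row.nonzero()[0]
print([(vocab[i], row[i]) for i in nonzero])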
Solving a Classification Problem using NLP (Confusion Matrix)
In [35]:
#Analogy: Naive Bayes could classify a fruit from features such as sweetness and size;
#here we classify each phrase's Sentiment from its word-count features.
#First, split the data into training and test sets so we can evaluate the model.
In [36]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(text_counts, data['Sentiment'], test_size = 0.3, random_state = 42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
In [37]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
In [39]:
from sklearn.metrics import accuracy_score, confusion_matrix
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(accuracy_score(y_pred=y_pred, y_true=y_test)) #shows that our Model is 60.64% accurate
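Overall accuracy hides per-class behaviour; a minimal sketch (assuming `y_test` and `y_pred` from the cell above) that prints precision, recall and F1 for each sentiment class:
In [ ]:
# Per-class precision/recall/F1 to complement the overall accuracy
from sklearn.metrics import classification_report
print(classification_report(y_true=y_test, y_pred=y_pred))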
In [42]:
y_test
Out[42]:
Sentiment Analysis with TF-IDF: A word may repeat many times within one document, and it may also appear across many documents. The significance (and sentiment contribution) of such a word depends on how often it occurs within a document relative to how common it is across the whole collection, which is what TF-IDF (term frequency - inverse document frequency) measures.
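As a small illustration of that weighting (a toy corpus, not the movie-review data), the word shared by every document gets a lower TF-IDF weight than the words unique to a single document:
In [ ]:
# Toy example: 'movie' appears in every document, so IDF pushes its weight down
from sklearn.feature_extraction.text import TfidfVectorizer
toy = ["good movie", "bad movie", "great plot movie"]
toy_tf = TfidfVectorizer()
weights = toy_tf.fit_transform(toy)
print(toy_tf.get_feature_names_out())
print(weights.toarray().round(2))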
In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer #term frequency - inverse document frequency (TF-IDF)
tf = TfidfVectorizer()
text_tf = tf.fit_transform(data['Phrase'])
In [46]:
type(text_tf) #output is again a SciPy sparse matrix, which the classifier can consume directly
Out[46]:
In [47]:
text_tf.shape
Out[47]:
In [48]:
X_train, X_test, y_train, y_test = train_test_split(text_tf, data['Sentiment'], test_size = 0.3, random_state = 42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
In [49]:
clf1 = MultinomialNB()
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)
In [52]:
print(confusion_matrix(y_pred=y_pred, y_true=y_test)) #confusion matrix: rows are actual classes, columns are predicted classes
print(accuracy_score(y_pred=y_pred, y_true=y_test))
#accuracy drops to about 58.1%, so the TF-IDF features do not outperform the plain CountVectorizer features for this MultinomialNB model
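To make the comparison with the count-based features fairer, one could give TfidfVectorizer the same tokenizer and stop-word settings used for CountVectorizer above. A sketch (assuming `token`, `data`, `train_test_split`, `MultinomialNB` and `accuracy_score` from earlier cells), not a definitive tuning:
In [ ]:
# Re-run TF-IDF with the same preprocessing choices as the CountVectorizer cell,
# so any accuracy difference reflects the weighting scheme rather than preprocessing
tf2 = TfidfVectorizer(lowercase=True, stop_words='english', ngram_range=(1, 1),
                      tokenizer=token.tokenize)
text_tf2 = tf2.fit_transform(data['Phrase'])
X_train, X_test, y_train, y_test = train_test_split(text_tf2, data['Sentiment'],
                                                    test_size=0.3, random_state=42)
clf2 = MultinomialNB().fit(X_train, y_train)
print(accuracy_score(y_true=y_test, y_pred=clf2.predict(X_test)))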
In [ ]: