MovieLens Case Study with Python
Problem Objective :
Here, we ask you to perform the analysis using the Exploratory Data Analysis technique. You need to find features affecting the ratings of any particular movie and build a model to predict the movie ratings. Domain: Entertainment
Dataset Description :
These files contain 1,000,209 anonymous ratings of approximately 3,900 movies made by 6,040 MovieLens users who joined MovieLens in 2000.
Analysis Tasks to be performed:
1. Import the three datasets
import numpy as np #import the numpy library for numerical operations
import pandas as pd #import the pandas library for data wrangling operations
# movies.dat ships without a header row, so the column names are supplied here.
colnames_movies = ['MovieID', 'Title', 'Genres']
# Fields are separated by the two-character token '::'. pandas only supports
# multi-character separators with the Python parsing engine, hence engine='python'.
# The MovieLens 1M movies.dat is distributed in Latin-1 (ISO-8859-1): some titles
# contain accented characters that are not valid UTF-8, so the default decoding
# fails with UnicodeDecodeError on Python 3. The encoding is therefore explicit.
df_movies = pd.read_csv(
    'movies.dat',
    sep='::',
    engine='python',
    names=colnames_movies,
    encoding='latin-1',
)
df_movies.head()  # preview the first rows
df_movies.shape  # (rows, columns); the original notebook reported 3882 observations & 3 variables
# ratings.dat has no header row; these names label its four '::'-separated fields.
colnames_ratings = ['UserID', 'MovieID', 'Rating', 'Timestamp']
# A multi-character separator requires the Python parsing engine.
df_ratings = pd.read_csv(
    'ratings.dat',
    sep='::',
    names=colnames_ratings,
    engine='python',
)
df_ratings.head()  # preview the first rows
df_ratings.shape  # (rows, columns)
# users.dat has no header row; these names label its five '::'-separated fields.
colnames_users = ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code']
# A multi-character separator requires the Python parsing engine.
df_users = pd.read_csv(
    'users.dat',
    sep='::',
    names=colnames_users,
    engine='python',
)
df_users.head()  # preview the first rows
df_users.shape  # (rows, columns)
# Attach each rating to its movie's title and genres by joining on MovieID.
# The inner join keeps only ratings whose MovieID also occurs in df_movies;
# sort=True orders the result by the join key. Every other merge option is
# left at its default value.
df_ratings_movies = df_ratings.merge(
    df_movies,
    how='inner',
    on='MovieID',
    sort=True,
)
df_ratings_movies.shape
df_ratings_movies.head()
# Add the user attributes to every rating/movie row by joining on UserID.
# The inner join keeps only rows whose UserID also occurs in df_users;
# sort=True orders the result by the join key. Every other merge option is
# left at its default value.
df_ratings_movies_users = df_ratings_movies.merge(
    df_users,
    how='inner',
    on='UserID',
    sort=True,
)
df_ratings_movies_users.shape
df_ratings_movies_users.head()
# Columns kept for the analysis, in display order; the merged frame's other
# columns (Timestamp, Zip-code) are dropped.
master_columns = [
    'MovieID', 'Title',
    'UserID', 'Age', 'Gender', 'Occupation',
    'Rating', 'Genres',
]
df_master_data = df_ratings_movies_users[master_columns]
df_master_data.head()
df_master_data.shape
Explore the datasets using visual representations (graphs or tables), and include your comments on the following:
1. User Age Distribution
from matplotlib import pyplot as plt  # import the matplotlib pyplot subpackage

# 'Age' is read here as a coded age group rather than an exact age (the note
# below treats the value 25 as "25-34 years"). plt.hist(..., bins=7) cuts the
# min..max range into 7 equal-width intervals, which do not have to line up with
# those codes: two codes can fall into the same bin while another bin stays
# empty. Counting the rows per code and drawing one bar per code avoids that.
age_group_counts = df_master_data['Age'].value_counts().sort_index()
age_group_counts.plot(kind='bar')
plt.xlabel('Age group (code)')
plt.ylabel('Number of ratings')  # every row of df_master_data is one rating
plt.show()
# the chart above shows that age group 25 (i.e. 25-34 years) has voted the maximum number of times
2. User rating of the movie “Toy Story”
# Group the master data by movie title so one movie's rows can be pulled out.
df_bytitle = df_master_data.groupby('Title')
toy_story_title = 'Toy Story (1995)'
df_bytitle.get_group(toy_story_title)  # every rating row for Toy Story (1995)
# Average rating of every movie, indexed by title
df_master_data.groupby('Title')['Rating'].mean()
# Average user rating for the movie - Toy Story (1995)
toy_story_ratings = df_bytitle.get_group(toy_story_title)['Rating']
np.average(toy_story_ratings.to_numpy())
# shows that Toy Story (1995) has an average user rating of 4.14
3. Top 25 movies by viewership rating
# Mean rating per (MovieID, Title), highest first, truncated to the first 25
# entries so the result really is a "top 25" rather than the full ranking.
# NOTE(review): a plain mean ranks a movie with a single 5-star rating above
# widely rated ones; consider requiring a minimum number of ratings.
Top25_Movies = (
    df_master_data.groupby(['MovieID', 'Title'])['Rating']
    .mean()
    .sort_values(ascending=False)
    .head(25)
)
Top25_Movies
4. Find the ratings for all the movies reviewed by a particular user (user id = 2696)
# First approach: group by user and pull out the group for UserID 2696.
df_byuserID = df_master_data.groupby('UserID')
df_byuserID.get_group(2696)  # output is not very informative (all columns); an alternative follows
# Alternative: boolean mask on UserID, keeping only the title and the rating.
is_user_2696 = df_master_data['UserID'] == 2696
df_master_data[is_user_2696][['Title', 'Rating']]
Feature Engineering
Find out all the unique genres (Hint: split the data in column genre making a list and then process the data to find out only the unique categories of genres)
df_master_data.head(n=5)  # preview the first five rows before deriving genre features
# Hard-coded list of the 18 genre labels (an inefficient, manual way of getting
# the genres). The label containing an apostrophe is written with double quotes:
# the earlier spelling 'Children''s' was two adjacent string literals, which
# Python concatenates into 'Childrens' - the apostrophe was silently lost.
genres = [
    'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
Create a separate column for each genre category with a one-hot encoding ( 1 and 0) whether or not the movie belongs to that genre.
#below is an efficient way by python. It will add dummy columns
df_genres = df_master_data['Genres'].str.get_dummies("|")
df_genres
df_master_data = pd.merge(df_master_data, df_genres, how='inner', left_on=None, right_on=None,
left_index=True, right_index=True, sort=True,
suffixes=('_x', '_y'), copy=True, indicator=False) #Merge by Index; however it is not a good practice
df_master_data.head()
# This is a classification problem: the goal is to predict the Rating of a
# movie and to find out which variables are significant.
df_master_data.Rating.hasnans  # check the response column for missing values
# Convert Rating, Age and Occupation to integer, one column at a time.
for int_column in ('Rating', 'Age', 'Occupation'):
    df_master_data[int_column] = df_master_data[int_column].astype('int')
Model Selection Process
from sklearn.model_selection import train_test_split #used to train and test existing dataset
Perform Exploratory Data Analysis (EDA) for the Master Data Set
# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# machine learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
#Visualize user age distribution
# Horizontal bar chart: number of rows (ratings) per distinct Age value.
df_master_data['Age'].value_counts().plot(kind='barh',alpha=0.7,figsize=(10,10))
plt.show()
# Histogram of the Age column over all rows of the master data.
df_master_data.Age.plot.hist(bins=25)
plt.title("Distribution of users' ages")
# Each row of df_master_data is one rating, so the bars count ratings, not users.
plt.ylabel('count of ratings')
plt.xlabel('Age')
# Finish this figure before the next plot; a pandas Series plot otherwise draws
# onto the current axes when a figure is still open.
plt.show()
#Visualize overall rating by users
df_master_data['Rating'].value_counts().plot(kind='bar',alpha=0.7,figsize=(10,10))
plt.show()
Apply Machine Learning Algorithms
#Use the following features: movie id, age, occupation
features = df_master_data[['MovieID','Age','Occupation']].values
#Use rating as label i.e. response variable
# Select the column as a Series (single brackets) so the labels form a 1-D
# array. The double-bracket form produced an (n, 1) column vector, which makes
# scikit-learn classifiers emit a DataConversionWarning on every fit.
labels = df_master_data['Rating'].values
#Create train and test data set: 75% train / 25% test, fixed seed for repeatability
train, test, train_labels, test_labels = train_test_split(features,labels,test_size=0.25,random_state=1)
#Create a histogram for each model feature: movie, age and occupation
# The earlier version plotted the Age column three times under three different
# titles; each feature now gets its own histogram, on its own figure, with axis
# labels that match what is drawn (x: feature value, y: number of rating rows).
for feature_name in ['MovieID', 'Age', 'Occupation']:
    df_master_data[feature_name].plot.hist(bins=25)
    plt.title("Distribution of ratings by " + feature_name)
    plt.xlabel(feature_name)
    plt.ylabel('Number of ratings')
    plt.show()  # close out this figure so the next histogram is not overlaid on it
# Train five classifiers on the training split and compare them.
#
# acc_*      : accuracy (%) on the TRAINING data - unchanged from before. It
#              rewards memorisation and is kept for reference only.
# test_acc_* : accuracy (%) on the held-out TEST data - the figure that
#              estimates how well a model generalises. The comparison table is
#              ranked by this value.


def _accuracy_pct(model, X, y):
    """Return the model's mean accuracy on (X, y) as a percentage, rounded to 2 decimals."""
    return round(model.score(X, y) * 100, 2)


# scikit-learn expects 1-D label arrays; ravel() is a no-op if they already are.
y_train = np.ravel(train_labels)
y_test = np.ravel(test_labels)

# Logistic Regression
logreg = LogisticRegression()
logreg.fit(train, y_train)
Y_pred = logreg.predict(test)
acc_log = _accuracy_pct(logreg, train, y_train)
test_acc_log = _accuracy_pct(logreg, test, y_test)
acc_log, test_acc_log

# K Nearest Neighbors Classifier
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(train, y_train)
Y_pred = knn.predict(test)
acc_knn = _accuracy_pct(knn, train, y_train)
test_acc_knn = _accuracy_pct(knn, test, y_test)
acc_knn, test_acc_knn

# Gaussian Naive Bayes
gaussian = GaussianNB()
gaussian.fit(train, y_train)
Y_pred = gaussian.predict(test)
acc_gaussian = _accuracy_pct(gaussian, train, y_train)
test_acc_gaussian = _accuracy_pct(gaussian, test, y_test)
acc_gaussian, test_acc_gaussian

# Perceptron
perceptron = Perceptron()
perceptron.fit(train, y_train)
Y_pred = perceptron.predict(test)
acc_perceptron = _accuracy_pct(perceptron, train, y_train)
test_acc_perceptron = _accuracy_pct(perceptron, test, y_test)
acc_perceptron, test_acc_perceptron

# Decision Tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(train, y_train)
Y_pred = decision_tree.predict(test)
acc_decision_tree = _accuracy_pct(decision_tree, train, y_train)
test_acc_decision_tree = _accuracy_pct(decision_tree, test, y_test)
acc_decision_tree, test_acc_decision_tree

# Comparison table: 'Score' is the training accuracy (as before), 'Test Score'
# the held-out accuracy. Rows are ordered by the held-out figure.
models = pd.DataFrame({
    'Model': ['KNN', 'Logistic Regression',
              'Naive Bayes', 'Perceptron',
              'Decision Tree'],
    'Score': [acc_knn, acc_log,
              acc_gaussian, acc_perceptron, acc_decision_tree],
    'Test Score': [test_acc_knn, test_acc_log,
                   test_acc_gaussian, test_acc_perceptron, test_acc_decision_tree]})
models.sort_values(by='Test Score', ascending=False)
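From the above accuracy scores, the Decision Tree appears to be the most suitable model, with 55.68% accuracy. Note, however, that this score is measured on the training data, which a decision tree can largely memorize; the ranking should be confirmed with accuracy on the held-out test set before a model is chosen.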
Comments
Post a Comment