Data Pre-processing for Machine Learning
Machine Learning (ML) is a subset of Artificial Intelligence (AI).
In [3]:
import os

os.getcwd()  # current working directory of the running kernel
Out[3]:
In [38]:
# Data acquisition: load the CSV file into a pandas DataFrame.
import pandas as pd  # pandas is Python's data-wrangling package

df = pd.read_csv("BostonHousing.csv")

# Alternatives, kept disabled for reference:
# df.to_csv("/home/neerajshinde/Data/BostonHousing.csv")     # write the DataFrame out to a CSV file
# df = pd.read_excel('BostonHousing.xlsx')                   # read an XLSX file into a DataFrame
# df.to_excel("/home/neerajshinde/Data/BostonHousing.xlsx")  # write the DataFrame out to an XLSX file
In [21]:
df.head()  # first five observations (head() defaults to n=5)
Out[21]:
In [13]:
df.shape  # (number of rows, number of columns), i.e. (observations, variables)
Out[13]:
In [20]:
df.tail()  # last five observations (tail() defaults to n=5)
Out[20]:
In [22]:
df.columns  # the DataFrame's column labels
Out[22]:
In [25]:
# Select a single row from the DataFrame by integer position.
df.iloc[0]  # first row; iloc positions are 0-based
Out[25]:
In [30]:
df.iloc[-1]  # last row of the DataFrame (negative positions count from the end)
Out[30]:
In [31]:
df.iloc[:,-1]  # all rows of the last column
Out[31]:
In [32]:
df.iloc[:, :2]  # all rows of the first two columns (positions 0 and 1)
Out[32]:
Let us read another dataset, SalaryGender.csv.
In [41]:
# Load SalaryGender.csv into its own DataFrame, then show it as the cell output.
df_sg = pd.read_csv('SalaryGender.csv')
df_sg
Out[41]:
In [43]:
df_sg.shape  # shape of the DataFrame as (rows, columns)
Out[43]:
In [48]:
# Data type of one specific column (Salary), looked up in the per-column dtype table.
df_sg.dtypes['Salary']
Out[48]:
In [47]:
# Data types of all columns at once
df_sg.dtypes
Out[47]:
In [53]:
# Unique values per column.
# A cell only displays its last expression, so evaluating the two lookups on
# separate lines silently dropped the 'Age' result. Collect both into one dict
# so that both are shown.
{column: df_sg[column].unique() for column in ('Age', 'Gender')}
Out[53]:
In [55]:
# All values of one particular column, as an array
df_sg['Gender'].values
Out[55]:
In [59]:
# Summary statistics for a numeric (continuous) column: mean
df_sg['Salary'].mean()
Out[59]:
In [60]:
df_sg['Salary'].median()  # median of the Salary column
Out[60]:
In [63]:
df_sg['Salary'].mode()  # most frequent Salary value(s); a Series, since several values can tie
Out[63]:
In [66]:
# Mean of every numeric column.
# numeric_only=True makes the "numeric columns only" intent explicit: a mean is
# only meaningful for numeric data, and newer pandas versions raise on
# non-numeric columns instead of silently skipping them.
df_sg.mean(numeric_only=True)
Out[66]:
In [70]:
df_sg['Salary'].mean(axis=0)  # a Series only has axis 0; on a DataFrame, axis=0 gives one mean per column and axis=1 one mean per row
Out[70]:
Using Seaborn (a plotting library built on top of Matplotlib) to plot a heatmap of the correlations between the variables in the data
In [71]:
import matplotlib.pyplot as plt
import seaborn as sns
# Pairwise correlation matrix of the DataFrame's columns (Pearson is the pandas default).
correlations = df.corr(method="pearson")
In [87]:
correlations  # display the correlation matrix
# The correlation between two variables (columns) measures how strongly they are linearly related.
# Close to +1: strong positive relationship (both rise together).
# Close to -1: strong negative relationship (one rises as the other falls), not "minimum" correlation.
# Close to 0: little or no linear relationship.
Out[87]:
In [88]:
# Plot the correlation matrix as a heatmap.
# vmin/vmax pin the colour scale to the full correlation range. Without them
# seaborn scales the colours to the data's own min/max, so blue would not mean
# -1 and white would not mean 0.
sns.heatmap(data=correlations, square=True, cmap="bwr", vmin=-1, vmax=1)
# blue  (-1): strong negative correlation
# white ( 0): no linear correlation
# red   (+1): strong positive correlation
Out[88]:
In [76]:
# Tick rotation has to happen in the same cell that draws the plot: with the
# default inline backend each cell starts a fresh figure, so calling
# plt.xticks/plt.yticks on their own only affects a new, empty figure.
sns.heatmap(data = correlations,square = True, cmap = "bwr")
plt.yticks(rotation=0)   # keep row labels horizontal
plt.xticks(rotation=90)  # turn column labels vertical so they do not overlap
Out[76]:
In [77]:
df_school = pd.read_csv('middle_tn_schools.csv')  # load middle_tn_schools.csv into a DataFrame
In [78]:
df_school.shape  # (rows, columns)
Out[78]:
In [79]:
df_school.head()  # first five rows (default n=5)
Out[79]:
In [84]:
# Correlation between the 'reduced_lunch' and 'school_rating' columns only
# (Pearson, the pandas default).
correlation_school = df_school[["reduced_lunch", "school_rating"]].corr(method="pearson")
In [85]:
correlation_school  # display the correlation matrix of the two selected columns
Out[85]:
In [86]:
# Plot the school correlation matrix as a heatmap.
# The colour scale is pinned to [-1, 1]. Otherwise seaborn stretches it between
# the matrix's own min and max, and in a two-variable matrix the off-diagonal
# value would always get the extreme colour whatever its size or sign.
sns.heatmap(data=correlation_school, square=True, cmap="bwr", vmin=-1, vmax=1)
Out[86]:
Data Exploration
In [89]:
df_mtcars = pd.read_csv('mtcars.csv')  # load mtcars.csv into a DataFrame
In [90]:
df_mtcars.head()  # first five rows (default n=5)
Out[90]:
In [92]:
df_mtcars.shape  # (rows, columns) of the dataset
Out[92]:
In [93]:
df_mtcars.dtypes  # data type of every column
Out[93]:
In [124]:
# Mean horsepower ('hp') for each distinct value of 'model'.
# NOTE(review): if every row has its own model name this just lists each car's
# hp; the overall average would be df_mtcars['hp'].mean(). Confirm the intent.
df_mtcars.groupby('model')['hp'].mean()
Out[124]:
In [108]:
# Correlation between every pair of numeric variables.
# Non-numeric columns are excluded explicitly (presumably the 'model' label
# column is text; verify with df_mtcars.dtypes). Newer pandas versions raise on
# them in corr() instead of dropping them silently.
correlation_mtcars = df_mtcars.select_dtypes(include='number').corr()
correlation_mtcars  # display the correlation table
Out[108]:
In [113]:
# Heatmap of the mtcars correlation table, drawn with square cells and the
# "Oranges" colour map.
sns.heatmap(correlation_mtcars, cmap="Oranges", square=True)
Out[113]:
Data Wrangling
In [147]:
#load the load_diabetes dataset from SKLEARN
from sklearn.datasets import load_diabetes
In [148]:
# Import the loader under an alias so it is not overwritten by its own result.
# With `load_diabetes = load_diabetes()` the function was replaced by the
# returned Bunch, so running the cell a second time failed ("'Bunch' object is
# not callable").
from sklearn.datasets import load_diabetes as fetch_diabetes

# The name `load_diabetes` is kept for the dataset because later cells read
# `load_diabetes.data`. Type = sklearn.utils.Bunch
load_diabetes = fetch_diabetes()
print(load_diabetes.DESCR)  # describe the dataset
In [139]:
# Wrap the dataset's feature matrix in a DataFrame (columns are unnamed at this point).
df_diabetes = pd.DataFrame(data=load_diabetes.data)
In [142]:
df_diabetes.head()  # first five rows of the DataFrame (default n=5)
Out[142]:
In [172]:
# Name the ten columns 'Column1' ... 'Column10'.
df_diabetes.columns = [f'Column{number}' for number in range(1, 11)]
df_diabetes.head(2)  # show the first two rows with the new column names
Out[172]:
In [152]:
df_diabetes.shape  # (rows, columns) of the data
Out[152]:
In [151]:
# Per column: True if the column contains at least one missing value.
df_diabetes.isnull().any()
Out[151]:
In [167]:
# Detect outliers in a DataFrame column with a box plot
sns.boxplot(x=df_diabetes.iloc[:,2])  # third column (position 2)
# author's reading of the rendered plot: 3 outliers
Out[167]:
In [165]:
sns.boxplot(x=df_diabetes.iloc[:,0])  # first column (position 0)
# author's reading of the rendered plot: no outliers
Out[165]:
In [174]:
# Box plot of Column5, selected by column name.
# The Series is passed as x= like the other box plots in this notebook: the
# meaning of the first positional argument differs between seaborn versions,
# which changes the plot's orientation or triggers a warning.
sns.boxplot(x=df_diabetes['Column5'])
Out[174]:
In [179]:
# Treat the outliers in Column5 by filtering them out.
# Keep the rows at or below the 0.13 cut-off. The previous mask (`> 0.13`) was
# inverted: it selected only the outliers and discarded the rest of the data.
# The mask also no longer reuses the name `filter`, which shadowed the Python
# builtin for the rest of the notebook.
column5_within_cutoff = df_diabetes['Column5'] <= 0.13
df1_out_rem = df_diabetes[column5_within_cutoff]  # DataFrame with the outlier rows removed
sns.boxplot(x=df1_out_rem['Column5'])  # box plot of the filtered data
Out[179]:
In [181]:
df_north = pd.read_csv('north_america_2000_2010.csv')  # load the first file into a DataFrame
In [197]:
df_north.shape  # (rows, columns)
Out[197]:
In [198]:
df_north.head()  # first five rows (default n=5)
Out[198]:
In [182]:
df_south = pd.read_csv('south_america_2000_2010.csv')  # load the second file into a DataFrame
In [194]:
df_south.shape  # (rows, columns)
Out[194]:
In [195]:
df_south.head()  # first five rows (default n=5)
Out[195]:
In [192]:
# Stack the two DataFrames on top of each other (axis=0 appends rows).
# ignore_index=True renumbers the rows 0..n-1. Without it both inputs keep
# their own row labels, so the result has duplicate labels and a lookup such as
# df_america.loc[0] returns one row from each source.
df_america = pd.concat([df_north, df_south], axis=0, ignore_index=True)
In [199]:
df_america.shape  # (rows, columns) of the concatenated DataFrame
Out[199]:
In [201]:
df_america.head()  # first five rows of the concatenated output (default n=5)
Out[201]:
In [204]:
# Sign-off message printed at the end of the notebook
print('Thank You! Neeraj Shinde: 18-Oct-2020 11:26 PM IST')
In [ ]:
Comments
Post a Comment