Mahdi Salmani
Author
This section trains a convolutional neural network in PyTorch to classify handwritten digits from the MNIST dataset.
import torch
import numpy as np
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim
import torchvision
from tqdm.auto import tqdm, trange
from torch.utils.data import random_split
%matplotlib inline
import matplotlib.pyplot as plt
import torch.nn.functional as F
if torch.cuda.is_available():
device = torch.device('cuda')
else:
device = torch.device('cpu')
print('runing on ',device)
runing on cuda
batch_size = 32
transform=transforms.ToTensor()
train_val_data= datasets.MNIST('./data',
train=True,
download=True,
transform=transform)
test_data= datasets.MNIST('./data',
train=False,
transform=transform)
test_data.classes
['0 - zero', '1 - one', '2 - two', '3 - three', '4 - four', '5 - five', '6 - six', '7 - seven', '8 - eight', '9 - nine']
train_size=int(0.9*(len(train_val_data)))
val_size=len(train_val_data)-train_size
train_data,val_data=random_split(train_val_data,[train_size,val_size])
train_loader=torch.utils.data.DataLoader(train_data, batch_size=32, shuffle=True, drop_last = True)
test_loader=torch.utils.data.DataLoader(test_data, batch_size=32, shuffle=True, drop_last = True)
val_loader=torch.utils.data.DataLoader(val_data, batch_size=32, shuffle=True, drop_last = True)
print('number of batches in train data , test data and validation data are:')
print(len(train_loader),len(test_loader),len(val_loader))#number of batches in train data and test data and val data
number of batches in train data , test data and validation data are: 1687 312 187
images,labels=next(iter(train_loader))
print(images.shape,labels.shape)
print(test_data)
torch.Size([32, 1, 28, 28]) torch.Size([32])
Dataset MNIST
Number of datapoints: 10000
Root location: ./data
Split: Test
StandardTransform
Transform: ToTensor()
x = next(iter(test_loader))[0][:10].squeeze(1) # Get a batch and choose 10 of images
fig = plt.figure(figsize=(20, 20)) # figure size in inches
for i in range(len(x)):
ax = fig.add_subplot(1,20, i + 1, xticks=[], yticks=[])
ax.imshow(x[i].numpy(),cmap='gray', interpolation='nearest')
class CNN(nn.Module):
def __init__(self,BN,Dropout):
super(CNN, self).__init__()
self.conv1 = nn.Conv2d(1, 32, 3, 1)
self.bn1 = nn.BatchNorm2d(32)
self.conv2 = nn.Conv2d(32, 64, 3, 1)
self.bn2 = nn.BatchNorm2d(64)
self.dropout1 = nn.Dropout(0.25)
self.dropout2 = nn.Dropout(0.5)
self.fc1 = nn.Linear(64*24*24, 128)
self.fc2 = nn.Linear(128, 10)
self.relu = nn.ReLU()
self.Dropout=Dropout
self.BN=BN
def forward(self, x):
x = self.conv1(x)
if self.BN:
x=self.bn1(x)
x = self.relu(x)
x = self.conv2(x)
if self.BN:
x=self.bn2(x)
x = self.relu(x)
if self.Dropout:
x = self.dropout1(x)
x = torch.flatten(x, 1)
x = self.fc1(x)
x = self.relu(x)
if self.Dropout:
x = self.dropout2(x)
x = self.fc2(x)
m=torch.nn.Softmax(dim=1)
return m(x)
#Training on val and train data
def train(net):
from tqdm.notebook import tqdm
costFunc = torch.nn.CrossEntropyLoss()
optimizer=torch.optim.SGD(net.parameters(),lr=1e-4,momentum=0.9)
val_loss,train_loss,train_acc,val_acc=[],[],[],[]
for epoch in tqdm(range(41)):
bchloss = 0
net.train()
correct=0
total=0
for i,batch in enumerate(train_loader,0):
data,output=batch
data,output = data.to(device),output.to(device)
prediction = net(data)
loss = costFunc(prediction,output)
bchloss += int(loss.item()*1000)
optimizer.zero_grad()
loss.backward()
optimizer.step()
pred=torch.argmax(prediction,dim=1)
total += output.size(0)#batch_size
correct += (pred==output).sum().item()
train_loss.append(bchloss/(len(train_loader)*32))
train_acc.append((correct/total)*100)
if epoch%5==0:
print('***************** epoch',epoch,'*****************')
print('train loss = ',bchloss/(len(train_loader)*32))
print('train accuracy ','= ',str((correct/total)*100),'%')
correct=0
total=0
bchloss=0
for data,output in val_loader:
data,output = data.to(device),output.to(device)
prediction = net(data)
loss = costFunc(prediction,output)
bchloss += int(loss.item()*1000)
pred=torch.argmax(prediction,dim=1)
total += output.size(0)
correct += (pred==output).sum().item()
val_acc.append((correct/total)*100)
val_loss.append(bchloss/(len(val_loader)*32))
if epoch%5==0:
print('validation loss = ',bchloss/(len(val_loader)*32))
print('vaidation accuracy ','= ',str((correct/total)*100),'%')
return val_loss,train_loss,train_acc,val_acc
The next experiments train three variants to compare batch normalization and dropout:
(Batch_Normalization=False, Dropout=False)(Batch_Normalization=True, Dropout=False)(Batch_Normalization=False, Dropout=True)model1 = CNN(BN=False,Dropout=False).to(device)
val1_loss,train1_loss,train1_acc,val1_acc=train(model1)
HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))
***************** epoch 0 ***************** train loss = 71.87635225251927 train accuracy = 19.229771784232366 % validation loss = 71.80314171122994 vaidation accuracy = 33.1216577540107 % ***************** epoch 5 ***************** train loss = 52.3941908713693 train accuracy = 81.63529934795494 % validation loss = 51.66794786096256 vaidation accuracy = 83.23863636363636 % ***************** epoch 10 ***************** train loss = 50.93153526970954 train accuracy = 83.83224659158269 % validation loss = 50.70755347593583 vaidation accuracy = 84.6590909090909 % ***************** epoch 15 ***************** train loss = 50.625 train accuracy = 84.54727326615293 % validation loss = 50.405080213903744 vaidation accuracy = 85.17713903743316 % ***************** epoch 20 ***************** train loss = 50.44565056312982 train accuracy = 84.98258743331357 % validation loss = 50.248495989304814 vaidation accuracy = 85.46122994652407 % ***************** epoch 25 ***************** train loss = 50.300255631298164 train accuracy = 85.4253112033195 % validation loss = 50.11380347593583 vaidation accuracy = 85.86229946524064 % ***************** epoch 30 ***************** train loss = 50.17634854771784 train accuracy = 85.82913455838766 % validation loss = 50.00835561497326 vaidation accuracy = 86.38034759358288 % ***************** epoch 35 ***************** train loss = 50.06216656787196 train accuracy = 86.11810906935389 % validation loss = 49.90524732620321 vaidation accuracy = 86.6644385026738 % ***************** epoch 40 ***************** train loss = 49.959228660343804 train accuracy = 86.49599881446355 % validation loss = 49.81751336898396 vaidation accuracy = 86.99866310160428 %
plt.figure()
plt.subplot(2,1,1)
plt.title('accuracy')
plt.plot(val1_acc,label='validation')
plt.plot(train1_acc,label='train')
plt.legend()
plt.show()
plt.figure()
plt.subplot(2,1,1)
plt.title('loss validation')
plt.plot(val1_loss,label='validation')
plt.title('loss train ')
plt.plot(train1_loss,label='train')
plt.legend()
plt.show()
correct=0
total=0
model1.eval()
for data,output in test_loader:
total += output.size(0)
data,output = data.to(device),output.to(device)
prediction = model1(data)
pred=torch.argmax(prediction,dim=1)
correct += (pred==output).sum().item()
print('test Accuracy on epoch ',30,'= ',str((correct/total)*100),'%')
test Accuracy on epoch 30 = 93.64983974358975 %
How does batch normalization change the training process of a CNN?
Batch normalization helps the network converge faster. In the plots below, the loss of the network with batch normalization falls much faster than the network without batch normalization.
model2 = CNN(BN=True,Dropout=False).to(device)
val2_loss,train2_loss,train2_acc,val2_acc=train(model2)
HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))
***************** epoch 0 ***************** train loss = 57.30516449318316 train accuracy = 73.44027860106699 % validation loss = 50.4884692513369 vaidation accuracy = 90.9090909090909 % ***************** epoch 5 ***************** train loss = 47.38783713692946 train accuracy = 95.91730883224659 % validation loss = 47.25300802139037 vaidation accuracy = 96.2566844919786 % ***************** epoch 10 ***************** train loss = 46.75913233550682 train accuracy = 97.38811499703615 % validation loss = 46.77991310160428 vaidation accuracy = 97.14237967914438 % ***************** epoch 15 ***************** train loss = 46.50611292234736 train accuracy = 97.97903082394784 % validation loss = 46.587901069518715 vaidation accuracy = 97.7105614973262 % ***************** epoch 20 ***************** train loss = 46.35755038529935 train accuracy = 98.34210136336692 % validation loss = 46.474598930481285 vaidation accuracy = 97.96122994652407 % ***************** epoch 25 ***************** train loss = 46.25653897451097 train accuracy = 98.61625666864256 % validation loss = 46.3793449197861 vaidation accuracy = 98.21189839572193 % ***************** epoch 30 ***************** train loss = 46.18835210432721 train accuracy = 98.74407231772378 % validation loss = 46.35360962566845 vaidation accuracy = 98.34558823529412 %
plt.title('loss')
plt.plot(val1_loss, 'r', label='without BN')
plt.plot(val2_loss, 'g', label='with BN')
plt.legend()
plt.show()
plt.title('accuarcy')
plt.plot(val1_acc, 'm', label='without BN')
plt.plot(val2_acc, 'b', label='with BN')
plt.legend()
plt.show()
Dropout Dropout is a technique used in neural networks to prevent overfitting the training data by dropping out neurons with probability p>0 It forces the model to avoid relying too much on particular sets of features..
Dropout helps reduce overfitting and generalization error by randomly deactivating a subset of neurons during each training step.
model3 = CNN(BN=False,Dropout=True).to(device)
val3_loss,train3_loss,train3_acc,val3_acc=train(model3)
HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))
***************** epoch 0 ***************** train loss = 71.8227252519265 train accuracy = 16.458580320094843 % validation loss = 71.64388368983957 vaidation accuracy = 26.15307486631016 % ***************** epoch 5 ***************** train loss = 54.058054238292826 train accuracy = 75.42234736218138 % validation loss = 53.05347593582888 vaidation accuracy = 78.72660427807486 % ***************** epoch 10 ***************** train loss = 49.96578615886188 train accuracy = 87.73895969176053 % validation loss = 49.65524732620321 vaidation accuracy = 88.58622994652407 % ***************** epoch 15 ***************** train loss = 49.13205764671014 train accuracy = 89.92849733254297 % validation loss = 48.90992647058823 vaidation accuracy = 90.57486631016043 % ***************** epoch 20 ***************** train loss = 48.728919679905154 train accuracy = 91.03623295791346 % validation loss = 48.5076871657754 vaidation accuracy = 91.5942513368984 % ***************** epoch 25 ***************** train loss = 48.427941612329576 train accuracy = 91.87722288085358 % validation loss = 48.17613636363637 vaidation accuracy = 92.59692513368985 % ***************** epoch 30 ***************** train loss = 48.2403304682869 train accuracy = 92.3606994665086 % validation loss = 48.02139037433155 vaidation accuracy = 93.09826203208557 % ***************** epoch 35 ***************** train loss = 48.052663752222884 train accuracy = 92.86640486069948 % validation loss = 47.98495989304813 vaidation accuracy = 93.09826203208557 % ***************** epoch 40 ***************** train loss = 47.920365293420275 train accuracy = 93.32024303497333 % validation loss = 47.84876336898396 vaidation accuracy = 93.54946524064172 %
plt.title('loss')
plt.plot(val1_loss, 'r', label='without dropout')
plt.plot(val3_loss, 'g', label='with dropout')
plt.legend()
plt.show()
plt.title('accuracy')
plt.plot(val1_acc, 'm', label='without dropout')
plt.plot(val3_acc, 'b', label='with dropout')
plt.legend()
plt.show()
In convolutional neural networks, filters are the learned weights of convolutional layers. These filters determine which local patterns the network responds to as it processes the input image.
model1.parameters
<bound method Module.parameters of CNN( (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1)) (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1)) (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (dropout1): Dropout(p=0.25, inplace=False) (dropout2): Dropout(p=0.5, inplace=False) (fc1): Linear(in_features=36864, out_features=128, bias=True) (fc2): Linear(in_features=128, out_features=10, bias=True) (relu): ReLU() )>
# load the model
model_weights = []
conv_layers = []
model_children = list(model1.children())
counter = 0
for i in range(len(model_children)):
if type(model_children[i]) == nn.Conv2d:
counter += 1
model_weights.append(model_children[i].weight)
conv_layers.append(model_children[i])
elif type(model_children[i]) == nn.Sequential:
for j in range(len(model_children[i])):
for child in model_children[i][j].children():
if type(child) == nn.Conv2d:
counter += 1
model_weights.append(child.weight)
conv_layers.append(child)
print(f"Total convolutional layers: {counter}")
Total convolutional layers: 2
# the first conv layer filters visualization
plt.figure(figsize=(20, 17))
for i, filter in enumerate(model_weights[0]):
plt.subplot(8, 8, i+1)
plt.imshow(filter[0, :, :].detach().cpu()
, cmap='gray')
plt.axis('off')
plt.show()
Feature maps are the activations produced after filters pass over an input image. They show what the convolutional layer sees at different stages of the network.
images,labels=next(iter(train_loader))
results = [conv_layers[0](images.cuda())]
for i in range(1, len(conv_layers)):
results.append(conv_layers[i](results[-1]))
outputs = results
# visualizing features
for num_layer in range(len(outputs)):
plt.figure(figsize=(30, 30))
layer_viz = outputs[num_layer][0, :, :, :]
layer_viz = layer_viz.data
print(layer_viz.size())
for i, filter in enumerate(layer_viz):
if i == 64:
break
plt.subplot(8, 8, i + 1)
plt.imshow(filter.cpu(), cmap='gray')
plt.axis("off")
print(f" layer {num_layer} feature maps...")
plt.show()
plt.close()
torch.Size([32, 26, 26]) Saving layer 0 feature maps...
torch.Size([64, 24, 24]) Saving layer 1 feature maps...
In this jupyter file, several linear regression methods are implemented and as a result, the performance and accuracies of different algorithms was reported. The main dataset is collected data about Toyota Corolla cars information with different option and features.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.metrics import mean_squared_error
from collections import Counter
data_df = pd.read_csv("./ToyotaCorolla.csv")
data_df.head()
| Price | Age | KM | FuelType | HP | MetColor | Automatic | CC | Doors | Weight | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 13500 | 23 | 46986 | Diesel | 90 | 1 | 0 | 2000 | 3 | 1165 |
| 1 | 13750 | 23 | 72937 | Diesel | 90 | 1 | 0 | 2000 | 3 | 1165 |
| 2 | 13950 | 24 | 41711 | Diesel | 90 | 1 | 0 | 2000 | 3 | 1165 |
| 3 | 14950 | 26 | 48000 | Diesel | 90 | 0 | 0 | 2000 | 3 | 1165 |
| 4 | 13750 | 30 | 38500 | Diesel | 90 | 0 | 0 | 2000 | 3 | 1170 |
data_df.count()
Price 1436 Age 1436 KM 1436 FuelType 1436 HP 1436 MetColor 1436 Automatic 1436 CC 1436 Doors 1436 Weight 1436 dtype: int64
data_df.describe()
| Price | Age | KM | HP | MetColor | Automatic | CC | Doors | Weight | |
|---|---|---|---|---|---|---|---|---|---|
| count | 1436.000000 | 1436.000000 | 1436.000000 | 1436.000000 | 1436.000000 | 1436.000000 | 1436.000000 | 1436.000000 | 1436.00000 |
| mean | 10730.824513 | 55.947075 | 68533.259749 | 101.502089 | 0.674791 | 0.055710 | 1566.827994 | 4.033426 | 1072.45961 |
| std | 3626.964585 | 18.599988 | 37506.448872 | 14.981080 | 0.468616 | 0.229441 | 187.182436 | 0.952677 | 52.64112 |
| min | 4350.000000 | 1.000000 | 1.000000 | 69.000000 | 0.000000 | 0.000000 | 1300.000000 | 2.000000 | 1000.00000 |
| 25% | 8450.000000 | 44.000000 | 43000.000000 | 90.000000 | 0.000000 | 0.000000 | 1400.000000 | 3.000000 | 1040.00000 |
| 50% | 9900.000000 | 61.000000 | 63389.500000 | 110.000000 | 1.000000 | 0.000000 | 1600.000000 | 4.000000 | 1070.00000 |
| 75% | 11950.000000 | 70.000000 | 87020.750000 | 110.000000 | 1.000000 | 0.000000 | 1600.000000 | 5.000000 | 1085.00000 |
| max | 32500.000000 | 80.000000 | 243000.000000 | 192.000000 | 1.000000 | 1.000000 | 2000.000000 | 5.000000 | 1615.00000 |
data_df.isnull().sum()
Price 0 Age 0 KM 0 FuelType 0 HP 0 MetColor 0 Automatic 0 CC 0 Doors 0 Weight 0 dtype: int64
# Check Correlation amoung parameters
corr = data_df.corr()
fig, ax = plt.subplots(figsize=(8,8))
# Generate a heatmap
sns.heatmap(corr, cmap = 'magma', annot = True, fmt = ".2f")
plt.xticks(range(len(corr.columns)), corr.columns)
plt.yticks(range(len(corr.columns)), corr.columns)
plt.show()
# plot regplots for Age, KM, CC & HP against Price
f, axes = plt.subplots(2,2, figsize=(12,8))
# Age Vs Price
sns.regplot(x = 'Price', y = 'Age', data = data_df, ax = axes[0,0], scatter_kws={'alpha':0.6})
axes[0,0].set_xlabel('Price', fontsize = 14)
axes[0,0].set_ylabel('Age', fontsize=14)
axes[0,0].yaxis.tick_left()
# KM Vs Price
sns.regplot(x = 'Price', y = 'KM', data = data_df, ax = axes[0,1], scatter_kws={'alpha':0.6})
axes[0,1].set_xlabel('Price', fontsize = 14)
axes[0,1].set_ylabel('KM', fontsize=14)
axes[0,1].yaxis.set_label_position("right")
axes[0,1].yaxis.tick_right()
# CC Vs Price
sns.regplot(x = 'Price', y = 'CC', data = data_df, ax = axes[1,0], scatter_kws={'alpha':0.6})
axes[1,0].set_xlabel('Price', fontsize = 14)
axes[1,0].set_ylabel('CC', fontsize=14)
axes[1,0].yaxis.tick_left()
# Weight Vs Price
sns.regplot(x = 'Price', y = 'Weight', data = data_df, ax = axes[1,1], scatter_kws={'alpha':0.6})
axes[1,1].set_xlabel('Price', fontsize = 14)
axes[1,1].set_ylabel('Weight', fontsize=14)
axes[1,1].yaxis.set_label_position("right")
axes[1,1].yaxis.tick_right()
plt.show()
# Create the clasiification.
data_df = pd.get_dummies(data_df)
data_df.head()
| Price | Age | KM | HP | MetColor | Automatic | CC | Doors | Weight | FuelType_CNG | FuelType_Diesel | FuelType_Petrol | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 13500 | 23 | 46986 | 90 | 1 | 0 | 2000 | 3 | 1165 | 0 | 1 | 0 |
| 1 | 13750 | 23 | 72937 | 90 | 1 | 0 | 2000 | 3 | 1165 | 0 | 1 | 0 |
| 2 | 13950 | 24 | 41711 | 90 | 1 | 0 | 2000 | 3 | 1165 | 0 | 1 | 0 |
| 3 | 14950 | 26 | 48000 | 90 | 0 | 0 | 2000 | 3 | 1165 | 0 | 1 | 0 |
| 4 | 13750 | 30 | 38500 | 90 | 0 | 0 | 2000 | 3 | 1170 | 0 | 1 | 0 |
In statistics, simple linear regression is a linear regression model with a single explanatory variable. That is, it concerns two-dimensional sample points with one independent variable and one dependent variable (conventionally, the x and y coordinates in a Cartesian coordinate system) and finds a linear function (a non-vertical straight line) that, as accurately as possible, predicts the dependent variable values as a function of the independent variable. The adjective simple refers to the fact that the outcome variable is related to a single predictor.
from sklearn.linear_model import LinearRegression
Let us see how the model performs when using only one independent variable, age, to predict the price.
X_simple_lreg = data_df[["Age"]].values
y_simple_lreg = data_df["Price"].values
print(X_simple_lreg[0:5])
print(y_simple_lreg[0:5])
[[23] [23] [24] [26] [30]] [13500 13750 13950 14950 13750]
# Create train test dataset
from sklearn.model_selection import train_test_split
X_train_slreg, X_test_slreg, y_train_slreg, y_test_slreg = train_test_split(X_simple_lreg,y_simple_lreg, test_size = 0.25, random_state = 4)
print('Train Dataset : ', X_train_slreg.shape, y_train_slreg.shape)
print('Test Dataset : ', X_test_slreg.shape, y_test_slreg.shape)
Train Dataset : (1077, 1) (1077,) Test Dataset : (359, 1) (359,)
simple_lreg = LinearRegression()
simple_lreg.fit(X_train_slreg, y_train_slreg)
print('Intercept : ', simple_lreg.intercept_)
print('Slope : ', simple_lreg.coef_)
Intercept : 20137.410273159752 Slope : [-169.09157285]
As we can see, the slope is -169.09, which means that price of the vehicle is highly impacted by the age of the vehicle. However, it is negatively proportional to Price.
# Use the model to predict the test dataset.
y_simplelreg_pred_test = simple_lreg.predict(X_test_slreg)
# Use the model to predict the train dataset.
y_simplelreg_pred_train = simple_lreg.predict(X_train_slreg)
# Calculate the eualuation metrics of the model.
from sklearn.metrics import r2_score
r2_score_slreg_train = r2_score(y_simplelreg_pred_train, y_train_slreg)
r2_score_slreg_test = r2_score(y_simplelreg_pred_test, y_test_slreg)
rmse_slreg = np.sqrt(mean_squared_error(y_simplelreg_pred_test, y_test_slreg)**2)
print('r2_ score for train dataset for simple linear reg : ', r2_score_slreg_train)
print('r2_ score for test dataset for simple linear reg : ', r2_score_slreg_test)
print('root mean squared error for simple linear reg : ', rmse_slreg)
r2_ score for train dataset for simple linear reg : 0.6978153650611345 r2_ score for test dataset for simple linear reg : 0.6734388905656996 root mean squared error for simple linear reg : 3438902.3311535786
Multiple linear regression (MLR), also known simply as multiple regression, is a statistical technique that uses several explanatory variables to predict the outcome of a response variable. The goal of multiple linear regression (MLR) is to model the linear relationship between the explanatory (independent) variables and response (dependent) variable.
Let us include some more independent variables to predict the price of the vehicle.
# Separating the independent and dependent variable.
X_multi_lreg = data_df.drop('Price', axis = 1).values
y_multi_lreg = data_df["Price"].values.reshape(-1,1)
# Create train test dataset
from sklearn.model_selection import train_test_split
X_train_mlreg, X_test_mlreg, y_train_mlreg, y_test_mlreg = train_test_split(X_multi_lreg,y_multi_lreg, test_size = 0.25, random_state = 4)
print('Train Dataset : ', X_train_mlreg.shape, y_train_mlreg.shape)
print('Test Dataset : ', X_test_mlreg.shape, y_test_mlreg.shape)
Train Dataset : (1077, 11) (1077, 1) Test Dataset : (359, 11) (359, 1)
multi_lreg = LinearRegression()
multi_lreg.fit(X_train_mlreg, y_train_mlreg)
print('Intercept : ', multi_lreg.intercept_)
print('Slope : ', multi_lreg.coef_)
Intercept : [-3502.88149804] Slope : [[-1.20452220e+02 -1.55550211e-02 6.03642847e+01 2.06335655e+01 3.04043156e+02 -4.36662302e+00 -1.69942436e+01 2.12472911e+01 -1.74877821e+03 1.98227868e+03 -2.33500476e+02]]
# Use the model to predict the test dataset.
y_mlreg_pred_test = multi_lreg.predict(X_test_mlreg)
# Use the model to predict the train dataset.
y_mlreg_pred_train = multi_lreg.predict(X_train_mlreg)
# Have a look at the predicted & actual values.
print(y_mlreg_pred_test[0:5])
# print(y_test[0:5])
print(y_mlreg_pred_train[0:5])
# print(y_train[0:5])
[[ 7903.20434738] [10249.8764368 ] [ 9573.1006555 ] [11689.26155808] [ 8921.60255708]] [[10138.88623699] [ 6324.52345301] [11437.15043997] [13059.85723899] [ 8895.51258966]]
# Calculate the eualuation metrics of the model.
from sklearn.metrics import r2_score
r2_score_mlreg_train = r2_score(y_mlreg_pred_train, y_train_mlreg)
r2_score_mlreg_test = r2_score(y_mlreg_pred_test, y_test_mlreg)
rmse_mlreg = np.sqrt(mean_squared_error(y_mlreg_pred_test, y_test_mlreg)**2)
print('r2_ score for train dataset for multi linear reg : ', r2_score_mlreg_train)
print('r2_ score for test dataset for multi linear reg : ', r2_score_mlreg_test)
print('root mean squared error for multi linear reg : ', rmse_mlreg)
r2_ score for train dataset for multi linear reg : 0.8453913190051008 r2_ score for test dataset for multi linear reg : 0.854121832445731 root mean squared error for multi linear reg : 1836109.732041979
As we can see that using multiple independent variables we can improve the accuracy of the model.
Ridge regression is a way to create a parsimonious model when the number of predictor variables in a set exceeds the number of observations, or when a data set has multicollinearity (correlations between predictor variables).
Let us look at a 2nd degree polynomial regression.
# Separating the independent and dependent variable.
X_ridge_reg = data_df.drop('Price', axis = 1).values
y_ridge_reg = data_df["Price"].values.reshape(-1,1)
# Create train test dataset
from sklearn.model_selection import train_test_split
X_train_ridge_reg, X_test_ridge_reg, y_train_ridge_reg, y_test_ridge_reg = train_test_split(X_ridge_reg,y_ridge_reg, test_size = 0.25, random_state = 4)
print('Train Dataset : ', X_train_ridge_reg.shape, y_train_ridge_reg.shape)
print('Test Dataset : ', X_test_ridge_reg.shape, y_test_ridge_reg.shape)
Train Dataset : (1077, 11) (1077, 1) Test Dataset : (359, 11) (359, 1)
from sklearn.linear_model import Ridge
## training the model
ridgeReg = Ridge(alpha=0.05, normalize=True)
ridgeReg.fit(X_train_ridge_reg,y_train_ridge_reg)
# Use the model to predict the test dataset.
y_ridgereg_pred_test = ridgeReg.predict(X_test_ridge_reg)
# Use the model to predict the train dataset.
y_ridgereg_pred_train = ridgeReg.predict(X_train_ridge_reg)
# Calculate the eualuation metrics of the model.
from sklearn.metrics import r2_score
r2_score_ridgereg_train = r2_score(y_ridgereg_pred_train, y_train_ridge_reg)
r2_score_ridgereg_test = r2_score(y_ridgereg_pred_test, y_test_ridge_reg)
rmse_ridgereg = np.sqrt(mean_squared_error(y_ridgereg_pred_test, y_test_ridge_reg)**2)
print('r2_ score for train dataset for multi linear reg : ', r2_score_ridgereg_train)
print('r2_ score for test dataset for multi linear reg : ', r2_score_ridgereg_test)
print('root mean squared error for multi linear reg : ', rmse_ridgereg)
r2_ score for train dataset for multi linear reg : 0.831886671104807 r2_ score for test dataset for multi linear reg : 0.8384733768946311 root mean squared error for multi linear reg : 1879140.3375855063
Lasso regression is a type of linear regression that uses shrinkage. Shrinkage is where data values are shrunk towards a central point, like the mean. The lasso procedure encourages simple, sparse models (i.e. models with fewer parameters). This particular type of regression is well-suited for models showing high levels of muticollinearity or when you want to automate certain parts of model selection, like variable selection/parameter elimination.
from sklearn.linear_model import Lasso
## training the model
lassoReg = Lasso(alpha=0.3, normalize=True)
lassoReg.fit(X_train_ridge_reg,y_train_ridge_reg)
# Use the model to predict the test dataset.
y_lassoreg_pred_test = lassoReg.predict(X_test_ridge_reg)
# Use the model to predict the train dataset.
y_lassoreg_pred_train = lassoReg.predict(X_train_ridge_reg)
# Calculate the eualuation metrics of the model.
from sklearn.metrics import r2_score
r2_score_lassoreg_train = r2_score(y_lassoreg_pred_train, y_train_ridge_reg)
r2_score_lassoreg_test = r2_score(y_lassoreg_pred_test, y_test_ridge_reg)
rmse_lassoreg = np.sqrt(mean_squared_error(y_lassoreg_pred_test, y_test_ridge_reg)**2)
print('r2_ score for train dataset for multi linear reg : ', r2_score_lassoreg_train)
print('r2_ score for test dataset for multi linear reg : ', r2_score_lassoreg_test)
print('root mean squared error for multi linear reg : ', rmse_lassoreg)
r2_ score for train dataset for multi linear reg : 0.8428580545349307 r2_ score for test dataset for multi linear reg : 0.8502844251108707 root mean squared error for multi linear reg : 1846994.3544547232
from sklearn.linear_model import ElasticNet
## training the model
elasticNetReg = ElasticNet(alpha=1, l1_ratio=0.5, normalize=True)
elasticNetReg.fit(X_train_ridge_reg,y_train_ridge_reg)
# Use the model to predict the test dataset.
y_elasticNetReg_pred_test = elasticNetReg.predict(X_test_ridge_reg)
# Use the model to predict the train dataset.
y_elasticNetReg_pred_train = elasticNetReg.predict(X_train_ridge_reg)
# Calculate the eualuation metrics of the model.
from sklearn.metrics import r2_score
r2_score_elasticNetReg_train = r2_score(y_elasticNetReg_pred_train, y_train_ridge_reg)
r2_score_elasticNetReg_test = r2_score(y_elasticNetReg_pred_test, y_test_ridge_reg)
rmse_elasticNetReg = np.sqrt(mean_squared_error(y_lassoreg_pred_test, y_test_ridge_reg)**2)
print('r2_ score for train dataset for multi linear reg : ', r2_score_elasticNetReg_train)
print('r2_ score for test dataset for multi linear reg : ', r2_score_elasticNetReg_test)
print('root mean squared error for multi linear reg : ', rmse_elasticNetReg)
r2_ score for train dataset for multi linear reg : -97076.91299047269 r2_ score for test dataset for multi linear reg : -103047.22391574454 root mean squared error for multi linear reg : 1846994.3544547232
Models = [('Simple Linear Regression', r2_score_slreg_train, r2_score_slreg_test, rmse_slreg),
('Multiplt Linear Regression', r2_score_mlreg_train, r2_score_mlreg_test, rmse_mlreg),
('Ridge Regression', r2_score_ridgereg_train, r2_score_ridgereg_test, rmse_ridgereg),
('Lasso Regression', r2_score_lassoreg_train, r2_score_lassoreg_test, rmse_lassoreg),]
predict = pd.DataFrame(data = Models, columns = ['Models', 'r2_score Training', 'r2_score Testing', 'RMSE'])
predict
| Models | r2_score Training | r2_score Testing | RMSE | |
|---|---|---|---|---|
| 0 | Simple Linear Regression | 0.697815 | 0.673439 | 3.438902e+06 |
| 1 | Multiplt Linear Regression | 0.845391 | 0.854122 | 1.836110e+06 |
| 2 | Ridge Regression | 0.831887 | 0.838473 | 1.879140e+06 |
| 3 | Lasso Regression | 0.842858 | 0.850284 | 1.846994e+06 |
The performance of each algorithm is visualized as below:
f, axes = plt.subplots(3,1, figsize=(18,8))
sns.barplot(x='Models', y='r2_score Training', data = predict, ax = axes[0])
axes[0].set_xlabel('Models')
axes[0].set_ylabel('r2_score Training')
axes[0].set_ylim(0,1.0)
sns.barplot(x='Models', y='r2_score Testing', data = predict, ax = axes[1])
axes[0].set_xlabel('Models')
axes[0].set_ylabel('r2_score Testing')
axes[0].set_ylim(0,1.0)
sns.barplot(x='Models', y='RMSE', data = predict, ax = axes[2])
axes[0].set_xlabel('Models')
axes[0].set_ylabel('RMSE')
axes[0].set_ylim(0,1.0)
(0.0, 1.0)
This section introduces neural networks and multilayer perceptrons (MLPs), then trains a simple model on Fashion MNIST.
Neural networks: Neural networks are modeled loosely on the human brain. A neural network can contain many connected units that transform input data into increasingly useful representations.
Multilayer perceptrons: MLPs are feedforward neural networks with one or more hidden layers between the input and output layers. They are a core building block for supervised learning tasks.
Below we have the steps to learn the Fashion MNIST database with neural networks and a bunch of built-in libraries.
First we have to import needed libraries:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
We import the Fashion MNIST dataset, a dataset which contains 70,000 grayscale images in 10 categories. The images show individual articles of clothing at low resolution (28 by 28 pixels).
Here, 60,000 images are used to train the network and 10,000 images to evaluate how accurately the network learned to classify images. We can access the Fashion MNIST directly from TensorFlow.
fashion_mnist = tf.keras.datasets.fashion_mnist
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz 32768/29515 [=================================] - 0s 0us/step Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz 26427392/26421880 [==============================] - 0s 0us/step Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz 8192/5148 [===============================================] - 0s 0us/step Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz 4423680/4422102 [==============================] - 0s 0us/step
The labels are an array of integers, ranging from 0 to 9. These correspond to the class of clothing the image represents:
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
If you inspect the first image in the training set, you will see that the pixel values fall in the range of 0 to 255, we scale these values to a range of 0 to 1 before feeding them to the neural network model. To do so, divide the values by 255.
train_images = train_images / 255.0
test_images = test_images / 255.0
In order to see if the scaling worked correctly, and to get a sense from the pictures obtain an output.
plt.figure(figsize=(10,10))
for i in range(25):
plt.subplot(5,5,i+1)
plt.xticks([])
plt.yticks([])
plt.grid(False)
plt.imshow(train_images[i], cmap=plt.cm.binary)
plt.xlabel(class_names[train_labels[i]])
plt.show()
Now we have to create our neural network. The basic building block of a neural network is the layer. Layers extract representations from the data fed into them. Hopefully, these representations are meaningful for the problem at hand.
Most of deep learning consists of chaining together simple layers. Most layers, such as tf.keras.layers.Dense, have parameters that are learned during training.
The first layer in this network, tf.keras.layers.Flatten, transforms the format of the images from a two-dimensional array (of 28 by 28 pixels) to a one-dimensional array (of 28 * 28 = 784 pixels). In other words, this layer flattens the data by unstacking the rows of pixels in the image and lining them up. This layer has no parameters to learn; it only reformats the data.
After the pixels are flattened, the network consists of a sequence of two tf.keras.layers.Dense layers. These are densely connected, or fully connected, neural layers. The first Dense layer has 128 nodes (or neurons). The second (and last) layer returns a logits array with length of 10. Each node contains a score that indicates the current image belongs to one of the 10 classes.
model = tf.keras.Sequential([
tf.keras.layers.Flatten(input_shape=(28, 28)),
tf.keras.layers.Dense(128, activation='relu'),
tf.keras.layers.Dense(10)
])
#compiling the model
model.compile(optimizer='adam',
loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
metrics=['accuracy'])
Training the neural network model requires the following steps:
train_images and train_labels arrays.test_labels array.model.fit(train_images, train_labels, epochs=10)
Epoch 1/10 1875/1875 [==============================] - 5s 2ms/step - loss: 0.4943 - accuracy: 0.8267 Epoch 2/10 1875/1875 [==============================] - 4s 2ms/step - loss: 0.3721 - accuracy: 0.8671 Epoch 3/10 1875/1875 [==============================] - 4s 2ms/step - loss: 0.3369 - accuracy: 0.8756 Epoch 4/10 1875/1875 [==============================] - 4s 2ms/step - loss: 0.3126 - accuracy: 0.8849 Epoch 5/10 1875/1875 [==============================] - 4s 2ms/step - loss: 0.2944 - accuracy: 0.8913 Epoch 6/10 1875/1875 [==============================] - 4s 2ms/step - loss: 0.2788 - accuracy: 0.8969 Epoch 7/10 1875/1875 [==============================] - 4s 2ms/step - loss: 0.2670 - accuracy: 0.9014 Epoch 8/10 1875/1875 [==============================] - 4s 2ms/step - loss: 0.2563 - accuracy: 0.9044 Epoch 9/10 1875/1875 [==============================] - 3s 2ms/step - loss: 0.2457 - accuracy: 0.9075 Epoch 10/10 1875/1875 [==============================] - 3s 2ms/step - loss: 0.2366 - accuracy: 0.9110
<tensorflow.python.keras.callbacks.History at 0x7f4a7bc54f90>
As the model trains, the loss and accuracy metrics are displayed. This model reaches an accuracy of about 0.91 (or 91%) on the training data.
Next, we have to test the trained model against the test data.
test_loss, test_acc = model.evaluate(test_images, test_labels, verbose=2)
print('\nTest accuracy:', test_acc)
313/313 - 0s - loss: 0.3595 - accuracy: 0.8737 Test accuracy: 0.8737000226974487
It turns out that the accuracy on the test dataset is a little less than the accuracy on the training dataset. This gap between training accuracy and test accuracy represents overfitting. Overfitting happens when a machine learning model performs worse on new, previously unseen inputs than it does on the training data. An overfitted model "memorizes" the noise and details in the training dataset to a point where it negatively impacts the performance of the model on the new data.
With the model trained, you can use it to make predictions about some images.
Let's plot several images with their predictions. Note that the model can be wrong even when very confident.
probability_model = tf.keras.Sequential([model,
tf.keras.layers.Softmax()])
predictions = probability_model.predict(test_images)
def plot_image(i, predictions_array, true_label, img):
true_label, img = true_label[i], img[i]
plt.grid(False)
plt.xticks([])
plt.yticks([])
plt.imshow(img, cmap=plt.cm.binary)
predicted_label = np.argmax(predictions_array)
if predicted_label == true_label:
color = 'blue'
else:
color = 'red'
plt.xlabel("{} {:2.0f}% ({})".format(class_names[predicted_label],
100*np.max(predictions_array),
class_names[true_label]),
color=color)
def plot_value_array(i, predictions_array, true_label):
true_label = true_label[i]
plt.grid(False)
plt.xticks(range(10))
plt.yticks([])
thisplot = plt.bar(range(10), predictions_array, color="#777777")
plt.ylim([0, 1])
predicted_label = np.argmax(predictions_array)
thisplot[predicted_label].set_color('red')
thisplot[true_label].set_color('blue')
# Plot the first X test images, their predicted labels, and the true labels.
# Color correct predictions in blue and incorrect predictions in red.
num_rows = 5
num_cols = 3
num_images = num_rows*num_cols
plt.figure(figsize=(2*2*num_cols, 2*num_rows))
for i in range(num_images):
plt.subplot(num_rows, 2*num_cols, 2*i+1)
plot_image(i, predictions[i], test_labels, test_images)
plt.subplot(num_rows, 2*num_cols, 2*i+2)
plot_value_array(i, predictions[i], test_labels)
plt.tight_layout()
plt.show()
We also can use the model to classify a single photo:
img = test_images[20]
img = (np.expand_dims(img,0))
predictions_single = probability_model.predict(img)
plot_value_array(1, predictions_single[0], test_labels)
_ = plt.xticks(range(10), class_names, rotation=45)