Convolutional Neural Networks on MNIST

This section trains a convolutional neural network in PyTorch to classify handwritten digits from the MNIST dataset.

Classification with CNNs

Importing Libraries

In [ ]:

import torch
import numpy as np
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim
import torchvision
from tqdm.auto import tqdm, trange
from torch.utils.data import random_split
%matplotlib inline
import matplotlib.pyplot as plt 
import torch.nn.functional as F

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('runing on ',device)

runing on  cuda

Loading the Dataset

In [ ]:

batch_size = 32
transform=transforms.ToTensor()
train_val_data= datasets.MNIST('./data', 
                               train=True, 
                               download=True, 
                               transform=transform)

test_data= datasets.MNIST('./data', 
                                    train=False, 
                                    transform=transform)

In [ ]:

test_data.classes

Out[ ]:

['0 - zero',
 '1 - one',
 '2 - two',
 '3 - three',
 '4 - four',
 '5 - five',
 '6 - six',
 '7 - seven',
 '8 - eight',
 '9 - nine']

In [ ]:

train_size=int(0.9*(len(train_val_data)))
val_size=len(train_val_data)-train_size
train_data,val_data=random_split(train_val_data,[train_size,val_size])
train_loader=torch.utils.data.DataLoader(train_data, batch_size=32, shuffle=True, drop_last = True)
test_loader=torch.utils.data.DataLoader(test_data, batch_size=32, shuffle=True, drop_last = True)
val_loader=torch.utils.data.DataLoader(val_data, batch_size=32, shuffle=True, drop_last = True)

print('number of batches in train data , test data and validation data are:')
print(len(train_loader),len(test_loader),len(val_loader))#number of batches in train data and test data and val data

number of batches in train data , test data and validation data are:
1687 312 187

In [ ]:

images,labels=next(iter(train_loader))
print(images.shape,labels.shape)
print(test_data)

torch.Size([32, 1, 28, 28]) torch.Size([32])
Dataset MNIST
    Number of datapoints: 10000
    Root location: ./data
    Split: Test
    StandardTransform
Transform: ToTensor()

Visualizing Samples

In [ ]:

x = next(iter(test_loader))[0][:10].squeeze(1) # Get a batch and choose 10 of images
    
fig = plt.figure(figsize=(20, 20))  # figure size in inches
for i in range(len(x)):
        ax = fig.add_subplot(1,20, i + 1, xticks=[], yticks=[])
        ax.imshow(x[i].numpy(),cmap='gray', interpolation='nearest')

No description has been provided for this image

Defining the CNN Model

In [ ]:

class CNN(nn.Module):
    def __init__(self,BN,Dropout):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.bn2 = nn.BatchNorm2d(64)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(64*24*24, 128)
        self.fc2 = nn.Linear(128, 10)
        self.relu = nn.ReLU()
        self.Dropout=Dropout
        self.BN=BN

    def forward(self, x):
        x = self.conv1(x)
        if self.BN:
          x=self.bn1(x)

        x = self.relu(x)
        x = self.conv2(x)
        
        if self.BN:
          x=self.bn2(x)

        x = self.relu(x)
        if self.Dropout:
          x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = self.relu(x)
        if self.Dropout:
          x = self.dropout2(x)
        x = self.fc2(x)
        m=torch.nn.Softmax(dim=1)
        return m(x)

Training the CNN Model on MNIST

In [53]:

#Training on val and train data

def train(net):
  
  from tqdm.notebook import tqdm
  costFunc = torch.nn.CrossEntropyLoss()
  optimizer=torch.optim.SGD(net.parameters(),lr=1e-4,momentum=0.9)
  val_loss,train_loss,train_acc,val_acc=[],[],[],[]
  for epoch in tqdm(range(41)):
          bchloss = 0
          net.train()
          correct=0
          total=0
          for i,batch in enumerate(train_loader,0):
              data,output=batch
              data,output = data.to(device),output.to(device)
              prediction = net(data)
              loss = costFunc(prediction,output)
              bchloss += int(loss.item()*1000)

              optimizer.zero_grad()
              loss.backward()
              optimizer.step()
        
              pred=torch.argmax(prediction,dim=1)
              total += output.size(0)#batch_size
              correct += (pred==output).sum().item()
          train_loss.append(bchloss/(len(train_loader)*32))
          train_acc.append((correct/total)*100)
          if epoch%5==0:
            print('***************** epoch',epoch,'*****************')
            print('train loss = ',bchloss/(len(train_loader)*32))
            print('train accuracy ','= ',str((correct/total)*100),'%')
          
          
          correct=0
          total=0
          bchloss=0
          for data,output in val_loader:
              data,output = data.to(device),output.to(device)
              prediction = net(data)
              loss = costFunc(prediction,output)
              bchloss += int(loss.item()*1000)
  
              pred=torch.argmax(prediction,dim=1)
              total += output.size(0)
              correct += (pred==output).sum().item()
          val_acc.append((correct/total)*100)
          val_loss.append(bchloss/(len(val_loader)*32))
          
          if epoch%5==0:
            print('validation loss = ',bchloss/(len(val_loader)*32))
            print('vaidation accuracy ','= ',str((correct/total)*100),'%')
          
  return val_loss,train_loss,train_acc,val_acc

The next experiments train three variants to compare batch normalization and dropout:

model1: no batch normalization and no dropout: (Batch_Normalization=False, Dropout=False)
model2: with batch normalization and no dropout: (Batch_Normalization=True, Dropout=False)
model3: no batch normalization and with dropout: (Batch_Normalization=False, Dropout=True)

In [56]:

model1 = CNN(BN=False,Dropout=False).to(device)
val1_loss,train1_loss,train1_acc,val1_acc=train(model1)

HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

***************** epoch 0 *****************
train loss =  71.87635225251927
train accuracy  =  19.229771784232366 %
validation loss =  71.80314171122994
vaidation accuracy  =  33.1216577540107 %
***************** epoch 5 *****************
train loss =  52.3941908713693
train accuracy  =  81.63529934795494 %
validation loss =  51.66794786096256
vaidation accuracy  =  83.23863636363636 %
***************** epoch 10 *****************
train loss =  50.93153526970954
train accuracy  =  83.83224659158269 %
validation loss =  50.70755347593583
vaidation accuracy  =  84.6590909090909 %
***************** epoch 15 *****************
train loss =  50.625
train accuracy  =  84.54727326615293 %
validation loss =  50.405080213903744
vaidation accuracy  =  85.17713903743316 %
***************** epoch 20 *****************
train loss =  50.44565056312982
train accuracy  =  84.98258743331357 %
validation loss =  50.248495989304814
vaidation accuracy  =  85.46122994652407 %
***************** epoch 25 *****************
train loss =  50.300255631298164
train accuracy  =  85.4253112033195 %
validation loss =  50.11380347593583
vaidation accuracy  =  85.86229946524064 %
***************** epoch 30 *****************
train loss =  50.17634854771784
train accuracy  =  85.82913455838766 %
validation loss =  50.00835561497326
vaidation accuracy  =  86.38034759358288 %
***************** epoch 35 *****************
train loss =  50.06216656787196
train accuracy  =  86.11810906935389 %
validation loss =  49.90524732620321
vaidation accuracy  =  86.6644385026738 %
***************** epoch 40 *****************
train loss =  49.959228660343804
train accuracy  =  86.49599881446355 %
validation loss =  49.81751336898396
vaidation accuracy  =  86.99866310160428 %

In [41]:

plt.figure()
plt.subplot(2,1,1)
plt.title('accuracy')
plt.plot(val1_acc,label='validation')
plt.plot(train1_acc,label='train')
plt.legend()
plt.show()

In [42]:

plt.figure()
plt.subplot(2,1,1)
plt.title('loss validation')
plt.plot(val1_loss,label='validation')

plt.title('loss train ')
plt.plot(train1_loss,label='train')
plt.legend()
plt.show()

Evaluating on Test Data

In [40]:

correct=0
total=0
model1.eval()
for data,output in test_loader:
            total += output.size(0)
            data,output = data.to(device),output.to(device)
            prediction = model1(data)
            pred=torch.argmax(prediction,dim=1)
            correct += (pred==output).sum().item()
print('test Accuracy on epoch ',30,'= ',str((correct/total)*100),'%')

test Accuracy on epoch  30 =  93.64983974358975 %

Batch Normalization

How does batch normalization change the training process of a CNN?

Batch normalization helps the network converge faster. In the plots below, the loss of the network with batch normalization falls much faster than the network without batch normalization.

In [43]:

model2 = CNN(BN=True,Dropout=False).to(device)
val2_loss,train2_loss,train2_acc,val2_acc=train(model2)

HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))

***************** epoch 0 *****************
train loss =  57.30516449318316
train accuracy  =  73.44027860106699 %
validation loss =  50.4884692513369
vaidation accuracy  =  90.9090909090909 %
***************** epoch 5 *****************
train loss =  47.38783713692946
train accuracy  =  95.91730883224659 %
validation loss =  47.25300802139037
vaidation accuracy  =  96.2566844919786 %
***************** epoch 10 *****************
train loss =  46.75913233550682
train accuracy  =  97.38811499703615 %
validation loss =  46.77991310160428
vaidation accuracy  =  97.14237967914438 %
***************** epoch 15 *****************
train loss =  46.50611292234736
train accuracy  =  97.97903082394784 %
validation loss =  46.587901069518715
vaidation accuracy  =  97.7105614973262 %
***************** epoch 20 *****************
train loss =  46.35755038529935
train accuracy  =  98.34210136336692 %
validation loss =  46.474598930481285
vaidation accuracy  =  97.96122994652407 %
***************** epoch 25 *****************
train loss =  46.25653897451097
train accuracy  =  98.61625666864256 %
validation loss =  46.3793449197861
vaidation accuracy  =  98.21189839572193 %
***************** epoch 30 *****************
train loss =  46.18835210432721
train accuracy  =  98.74407231772378 %
validation loss =  46.35360962566845
vaidation accuracy  =  98.34558823529412 %

In [44]:

plt.title('loss')
plt.plot(val1_loss, 'r', label='without BN')
plt.plot(val2_loss, 'g', label='with BN')
plt.legend()
plt.show()
plt.title('accuarcy')
plt.plot(val1_acc, 'm', label='without BN')
plt.plot(val2_acc, 'b', label='with BN')
plt.legend()
plt.show()

Dropout

Dropout Dropout is a technique used in neural networks to prevent overfitting the training data by dropping out neurons with probability p>0 It forces the model to avoid relying too much on particular sets of features..

Dropout helps reduce overfitting and generalization error by randomly deactivating a subset of neurons during each training step.

In [54]:

model3 = CNN(BN=False,Dropout=True).to(device)
val3_loss,train3_loss,train3_acc,val3_acc=train(model3)

HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))

***************** epoch 0 *****************
train loss =  71.8227252519265
train accuracy  =  16.458580320094843 %
validation loss =  71.64388368983957
vaidation accuracy  =  26.15307486631016 %
***************** epoch 5 *****************
train loss =  54.058054238292826
train accuracy  =  75.42234736218138 %
validation loss =  53.05347593582888
vaidation accuracy  =  78.72660427807486 %
***************** epoch 10 *****************
train loss =  49.96578615886188
train accuracy  =  87.73895969176053 %
validation loss =  49.65524732620321
vaidation accuracy  =  88.58622994652407 %
***************** epoch 15 *****************
train loss =  49.13205764671014
train accuracy  =  89.92849733254297 %
validation loss =  48.90992647058823
vaidation accuracy  =  90.57486631016043 %
***************** epoch 20 *****************
train loss =  48.728919679905154
train accuracy  =  91.03623295791346 %
validation loss =  48.5076871657754
vaidation accuracy  =  91.5942513368984 %
***************** epoch 25 *****************
train loss =  48.427941612329576
train accuracy  =  91.87722288085358 %
validation loss =  48.17613636363637
vaidation accuracy  =  92.59692513368985 %
***************** epoch 30 *****************
train loss =  48.2403304682869
train accuracy  =  92.3606994665086 %
validation loss =  48.02139037433155
vaidation accuracy  =  93.09826203208557 %
***************** epoch 35 *****************
train loss =  48.052663752222884
train accuracy  =  92.86640486069948 %
validation loss =  47.98495989304813
vaidation accuracy  =  93.09826203208557 %
***************** epoch 40 *****************
train loss =  47.920365293420275
train accuracy  =  93.32024303497333 %
validation loss =  47.84876336898396
vaidation accuracy  =  93.54946524064172 %

In [57]:

plt.title('loss')
plt.plot(val1_loss, 'r', label='without dropout')
plt.plot(val3_loss, 'g', label='with dropout')
plt.legend()
plt.show()
plt.title('accuracy')
plt.plot(val1_acc, 'm', label='without dropout')
plt.plot(val3_acc, 'b', label='with dropout')
plt.legend()
plt.show()

Visualizing Filters

In convolutional neural networks, filters are the learned weights of convolutional layers. These filters determine which local patterns the network responds to as it processes the input image.

In [47]:

model1.parameters

Out[47]:

<bound method Module.parameters of CNN(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout1): Dropout(p=0.25, inplace=False)
  (dropout2): Dropout(p=0.5, inplace=False)
  (fc1): Linear(in_features=36864, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=10, bias=True)
  (relu): ReLU()
)>

In [48]:

# load the model
model_weights = [] 
conv_layers = [] 
model_children = list(model1.children())

In [49]:

counter = 0 
 
for i in range(len(model_children)):
    if type(model_children[i]) == nn.Conv2d:
        counter += 1
        model_weights.append(model_children[i].weight)
        conv_layers.append(model_children[i])
    elif type(model_children[i]) == nn.Sequential:
        for j in range(len(model_children[i])):
            for child in model_children[i][j].children():
                if type(child) == nn.Conv2d:
                    counter += 1
                    model_weights.append(child.weight)
                    conv_layers.append(child)
print(f"Total convolutional layers: {counter}")

Total convolutional layers: 2

In [50]:

# the first conv layer filters visualization
plt.figure(figsize=(20, 17))
for i, filter in enumerate(model_weights[0]):
    plt.subplot(8, 8, i+1) 
    plt.imshow(filter[0, :, :].detach().cpu()
    , cmap='gray')
    plt.axis('off')
plt.show()

Visualizing Feature Maps

Feature maps are the activations produced after filters pass over an input image. They show what the convolutional layer sees at different stages of the network.

In [51]:

images,labels=next(iter(train_loader))
results = [conv_layers[0](images.cuda())]
for i in range(1, len(conv_layers)):
    results.append(conv_layers[i](results[-1]))

outputs = results

In [52]:

# visualizing features
for num_layer in range(len(outputs)):
    plt.figure(figsize=(30, 30))
    layer_viz = outputs[num_layer][0, :, :, :]
    layer_viz = layer_viz.data
    print(layer_viz.size())
    for i, filter in enumerate(layer_viz):
        if i == 64:
            break
        plt.subplot(8, 8, i + 1)
        plt.imshow(filter.cpu(), cmap='gray')
        plt.axis("off")
    print(f" layer {num_layer} feature maps...")
    plt.show()
    plt.close()

torch.Size([32, 26, 26])
Saving layer 0 feature maps...

torch.Size([64, 24, 24])
Saving layer 1 feature maps...

References

Linear Regression Baselines

Introduction

In this jupyter file, several linear regression methods are implemented and as a result, the performance and accuracies of different algorithms was reported. The main dataset is collected data about Toyota Corolla cars information with different option and features.

Importing Packages

In [3]:

import numpy as np 
import pandas as pd

In [5]:

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.metrics import mean_squared_error
from collections import Counter

Reading Data

In [6]:

data_df = pd.read_csv("./ToyotaCorolla.csv")

Dataset Overview

In [8]:

data_df.head()

Out[8]:

	Price	Age	KM	FuelType	HP	MetColor	CC	Doors	Weight
0	13500	23	46986	Diesel	90	1	2000	3	1165
1	13750	23	72937	Diesel	90	1	2000	3	1165
2	13950	24	41711	Diesel	90	1	2000	3	1165
3	14950	26	48000	Diesel	90	0	2000	3	1165
4	13750	30	38500	Diesel	90	0	2000	3	1170

In [10]:

data_df.count()

Out[10]:

Price        1436
Age          1436
KM           1436
FuelType     1436
HP           1436
MetColor     1436
Automatic    1436
CC           1436
Doors        1436
Weight       1436
dtype: int64

In [11]:

data_df.describe()

Out[11]:

	Price	Age	KM	HP	MetColor	Automatic	CC	Doors	Weight
count	1436.000000	1436.000000	1436.000000	1436.000000	1436.000000	1436.000000	1436.000000	1436.000000	1436.00000
mean	10730.824513	55.947075	68533.259749	101.502089	0.674791	0.055710	1566.827994	4.033426	1072.45961
std	3626.964585	18.599988	37506.448872	14.981080	0.468616	0.229441	187.182436	0.952677	52.64112
min	4350.000000	1.000000	1.000000	69.000000	0.000000	0.000000	1300.000000	2.000000	1000.00000
25%	8450.000000	44.000000	43000.000000	90.000000	0.000000	0.000000	1400.000000	3.000000	1040.00000
50%	9900.000000	61.000000	63389.500000	110.000000	1.000000	0.000000	1600.000000	4.000000	1070.00000
75%	11950.000000	70.000000	87020.750000	110.000000	1.000000	0.000000	1600.000000	5.000000	1085.00000
max	32500.000000	80.000000	243000.000000	192.000000	1.000000	1.000000	2000.000000	5.000000	1615.00000

In [12]:

data_df.isnull().sum()

Out[12]:

Price        0
Age          0
KM           0
FuelType     0
HP           0
MetColor     0
Automatic    0
CC           0
Doors        0
Weight       0
dtype: int64

In [13]:

# Check Correlation amoung parameters
corr = data_df.corr()
fig, ax = plt.subplots(figsize=(8,8))
# Generate a heatmap
sns.heatmap(corr, cmap = 'magma', annot = True, fmt = ".2f")
plt.xticks(range(len(corr.columns)), corr.columns)

plt.yticks(range(len(corr.columns)), corr.columns)

plt.show()

In [14]:

# plot regplots  for Age, KM, CC & HP against Price
f, axes = plt.subplots(2,2, figsize=(12,8))
# Age Vs Price
sns.regplot(x = 'Price', y = 'Age', data = data_df, ax = axes[0,0], scatter_kws={'alpha':0.6})
axes[0,0].set_xlabel('Price', fontsize = 14)
axes[0,0].set_ylabel('Age', fontsize=14)
axes[0,0].yaxis.tick_left()

# KM Vs Price
sns.regplot(x = 'Price', y = 'KM', data = data_df, ax = axes[0,1], scatter_kws={'alpha':0.6})
axes[0,1].set_xlabel('Price', fontsize = 14)
axes[0,1].set_ylabel('KM', fontsize=14)
axes[0,1].yaxis.set_label_position("right")
axes[0,1].yaxis.tick_right()

# CC Vs Price
sns.regplot(x = 'Price', y = 'CC', data = data_df, ax = axes[1,0], scatter_kws={'alpha':0.6})
axes[1,0].set_xlabel('Price', fontsize = 14)
axes[1,0].set_ylabel('CC', fontsize=14)
axes[1,0].yaxis.tick_left()

# Weight Vs Price
sns.regplot(x = 'Price', y = 'Weight', data = data_df, ax = axes[1,1], scatter_kws={'alpha':0.6})
axes[1,1].set_xlabel('Price', fontsize = 14)
axes[1,1].set_ylabel('Weight', fontsize=14)
axes[1,1].yaxis.set_label_position("right")
axes[1,1].yaxis.tick_right()

plt.show()

In [15]:

# Create the clasiification.
data_df = pd.get_dummies(data_df)

In [16]:

data_df.head()

Out[16]:

	Price	Age	KM	HP	MetColor	CC	Doors	Weight	FuelType_Diesel
0	13500	23	46986	90	1	2000	3	1165	1
1	13750	23	72937	90	1	2000	3	1165	1
2	13950	24	41711	90	1	2000	3	1165	1
3	14950	26	48000	90	0	2000	3	1165	1
4	13750	30	38500	90	0	2000	3	1170	1

Regression Models

Linear Regression

In statistics, simple linear regression is a linear regression model with a single explanatory variable. That is, it concerns two-dimensional sample points with one independent variable and one dependent variable (conventionally, the x and y coordinates in a Cartesian coordinate system) and finds a linear function (a non-vertical straight line) that, as accurately as possible, predicts the dependent variable values as a function of the independent variable. The adjective simple refers to the fact that the outcome variable is related to a single predictor.

In [17]:

from sklearn.linear_model import LinearRegression

Simple Linear Regression

Let us see how the model performs when using only one independent variable, age, to predict the price.

In [19]:

X_simple_lreg = data_df[["Age"]].values
y_simple_lreg = data_df["Price"].values

print(X_simple_lreg[0:5])
print(y_simple_lreg[0:5])

[[23]
 [23]
 [24]
 [26]
 [30]]
[13500 13750 13950 14950 13750]

In [20]:

# Create train test dataset
from sklearn.model_selection import train_test_split
X_train_slreg, X_test_slreg, y_train_slreg, y_test_slreg = train_test_split(X_simple_lreg,y_simple_lreg, test_size = 0.25, random_state = 4)
print('Train Dataset : ', X_train_slreg.shape, y_train_slreg.shape)
print('Test Dataset : ', X_test_slreg.shape, y_test_slreg.shape)

Train Dataset :  (1077, 1) (1077,)
Test Dataset :  (359, 1) (359,)

In [21]:

simple_lreg = LinearRegression()
simple_lreg.fit(X_train_slreg, y_train_slreg)
print('Intercept : ', simple_lreg.intercept_)
print('Slope : ', simple_lreg.coef_)

Intercept :  20137.410273159752
Slope :  [-169.09157285]

As we can see, the slope is -169.09, which means that price of the vehicle is highly impacted by the age of the vehicle. However, it is negatively proportional to Price.

In [22]:

# Use the model to predict the test dataset.
y_simplelreg_pred_test = simple_lreg.predict(X_test_slreg)

# Use the model to predict the train dataset.
y_simplelreg_pred_train = simple_lreg.predict(X_train_slreg)

In [23]:

# Calculate the eualuation metrics of the model.
from sklearn.metrics import r2_score
r2_score_slreg_train = r2_score(y_simplelreg_pred_train, y_train_slreg)
r2_score_slreg_test = r2_score(y_simplelreg_pred_test, y_test_slreg)
rmse_slreg = np.sqrt(mean_squared_error(y_simplelreg_pred_test, y_test_slreg)**2)
print('r2_ score for train dataset for simple linear reg : ', r2_score_slreg_train)
print('r2_ score for test dataset for simple linear reg : ', r2_score_slreg_test)
print('root mean squared error for simple linear reg : ', rmse_slreg)

r2_ score for train dataset for simple linear reg :  0.6978153650611345
r2_ score for test dataset for simple linear reg :  0.6734388905656996
root mean squared error for simple linear reg :  3438902.3311535786

Multiple Regression

Multiple linear regression (MLR), also known simply as multiple regression, is a statistical technique that uses several explanatory variables to predict the outcome of a response variable. The goal of multiple linear regression (MLR) is to model the linear relationship between the explanatory (independent) variables and response (dependent) variable.

Let us include some more independent variables to predict the price of the vehicle.

In [24]:

# Separating the independent and dependent variable.
X_multi_lreg = data_df.drop('Price', axis = 1).values
y_multi_lreg = data_df["Price"].values.reshape(-1,1)

In [25]:

# Create train test dataset
from sklearn.model_selection import train_test_split
X_train_mlreg, X_test_mlreg, y_train_mlreg, y_test_mlreg = train_test_split(X_multi_lreg,y_multi_lreg, test_size = 0.25, random_state = 4)
print('Train Dataset : ', X_train_mlreg.shape, y_train_mlreg.shape)
print('Test Dataset : ', X_test_mlreg.shape, y_test_mlreg.shape)

Train Dataset :  (1077, 11) (1077, 1)
Test Dataset :  (359, 11) (359, 1)

In [26]:

multi_lreg = LinearRegression()
multi_lreg.fit(X_train_mlreg, y_train_mlreg)
print('Intercept : ', multi_lreg.intercept_)
print('Slope : ', multi_lreg.coef_)

Intercept :  [-3502.88149804]
Slope :  [[-1.20452220e+02 -1.55550211e-02  6.03642847e+01  2.06335655e+01
   3.04043156e+02 -4.36662302e+00 -1.69942436e+01  2.12472911e+01
  -1.74877821e+03  1.98227868e+03 -2.33500476e+02]]

In [27]:

# Use the model to predict the test dataset.
y_mlreg_pred_test = multi_lreg.predict(X_test_mlreg)

# Use the model to predict the train dataset.
y_mlreg_pred_train = multi_lreg.predict(X_train_mlreg)

In [30]:

# Have a look at the predicted & actual values.
print(y_mlreg_pred_test[0:5])
# print(y_test[0:5])

print(y_mlreg_pred_train[0:5])
# print(y_train[0:5])

[[ 7903.20434738]
 [10249.8764368 ]
 [ 9573.1006555 ]
 [11689.26155808]
 [ 8921.60255708]]
[[10138.88623699]
 [ 6324.52345301]
 [11437.15043997]
 [13059.85723899]
 [ 8895.51258966]]

In [31]:

# Calculate the eualuation metrics of the model.
from sklearn.metrics import r2_score
r2_score_mlreg_train = r2_score(y_mlreg_pred_train, y_train_mlreg)
r2_score_mlreg_test = r2_score(y_mlreg_pred_test, y_test_mlreg)
rmse_mlreg = np.sqrt(mean_squared_error(y_mlreg_pred_test, y_test_mlreg)**2)
print('r2_ score for train dataset for multi linear reg : ', r2_score_mlreg_train)
print('r2_ score for test dataset for multi linear reg : ', r2_score_mlreg_test)
print('root mean squared error for multi linear reg : ', rmse_mlreg)

r2_ score for train dataset for multi linear reg :  0.8453913190051008
r2_ score for test dataset for multi linear reg :  0.854121832445731
root mean squared error for multi linear reg :  1836109.732041979

As we can see that using multiple independent variables we can improve the accuracy of the model.

Ridge Regression

Ridge regression is a way to create a parsimonious model when the number of predictor variables in a set exceeds the number of observations, or when a data set has multicollinearity (correlations between predictor variables).

Let us look at a 2nd degree polynomial regression.

In [32]:

# Separating the independent and dependent variable.
X_ridge_reg = data_df.drop('Price', axis = 1).values
y_ridge_reg = data_df["Price"].values.reshape(-1,1)

In [33]:

# Create train test dataset
from sklearn.model_selection import train_test_split
X_train_ridge_reg, X_test_ridge_reg, y_train_ridge_reg, y_test_ridge_reg = train_test_split(X_ridge_reg,y_ridge_reg, test_size = 0.25, random_state = 4)
print('Train Dataset : ', X_train_ridge_reg.shape, y_train_ridge_reg.shape)
print('Test Dataset : ', X_test_ridge_reg.shape, y_test_ridge_reg.shape)

Train Dataset :  (1077, 11) (1077, 1)
Test Dataset :  (359, 11) (359, 1)

In [34]:

from sklearn.linear_model import Ridge

## training the model

ridgeReg = Ridge(alpha=0.05, normalize=True)

ridgeReg.fit(X_train_ridge_reg,y_train_ridge_reg)

# Use the model to predict the test dataset.
y_ridgereg_pred_test = ridgeReg.predict(X_test_ridge_reg)

# Use the model to predict the train dataset.
y_ridgereg_pred_train = ridgeReg.predict(X_train_ridge_reg)

# Calculate the eualuation metrics of the model.
from sklearn.metrics import r2_score
r2_score_ridgereg_train = r2_score(y_ridgereg_pred_train, y_train_ridge_reg)
r2_score_ridgereg_test = r2_score(y_ridgereg_pred_test, y_test_ridge_reg)
rmse_ridgereg = np.sqrt(mean_squared_error(y_ridgereg_pred_test, y_test_ridge_reg)**2)
print('r2_ score for train dataset for multi linear reg : ', r2_score_ridgereg_train)
print('r2_ score for test dataset for multi linear reg : ', r2_score_ridgereg_test)
print('root mean squared error for multi linear reg : ', rmse_ridgereg)

r2_ score for train dataset for multi linear reg :  0.831886671104807
r2_ score for test dataset for multi linear reg :  0.8384733768946311
root mean squared error for multi linear reg :  1879140.3375855063

Lasso Regression

Lasso regression is a type of linear regression that uses shrinkage. Shrinkage is where data values are shrunk towards a central point, like the mean. The lasso procedure encourages simple, sparse models (i.e. models with fewer parameters). This particular type of regression is well-suited for models showing high levels of muticollinearity or when you want to automate certain parts of model selection, like variable selection/parameter elimination.

In [35]:

from sklearn.linear_model import Lasso

## training the model

lassoReg = Lasso(alpha=0.3, normalize=True)

lassoReg.fit(X_train_ridge_reg,y_train_ridge_reg)

# Use the model to predict the test dataset.
y_lassoreg_pred_test = lassoReg.predict(X_test_ridge_reg)

# Use the model to predict the train dataset.
y_lassoreg_pred_train = lassoReg.predict(X_train_ridge_reg)

# Calculate the eualuation metrics of the model.
from sklearn.metrics import r2_score
r2_score_lassoreg_train = r2_score(y_lassoreg_pred_train, y_train_ridge_reg)
r2_score_lassoreg_test = r2_score(y_lassoreg_pred_test, y_test_ridge_reg)
rmse_lassoreg = np.sqrt(mean_squared_error(y_lassoreg_pred_test, y_test_ridge_reg)**2)
print('r2_ score for train dataset for multi linear reg : ', r2_score_lassoreg_train)
print('r2_ score for test dataset for multi linear reg : ', r2_score_lassoreg_test)
print('root mean squared error for multi linear reg : ', rmse_lassoreg)

r2_ score for train dataset for multi linear reg :  0.8428580545349307
r2_ score for test dataset for multi linear reg :  0.8502844251108707
root mean squared error for multi linear reg :  1846994.3544547232

In [36]:

from sklearn.linear_model import ElasticNet

## training the model

elasticNetReg = ElasticNet(alpha=1, l1_ratio=0.5, normalize=True)

elasticNetReg.fit(X_train_ridge_reg,y_train_ridge_reg)

# Use the model to predict the test dataset.
y_elasticNetReg_pred_test = elasticNetReg.predict(X_test_ridge_reg)

# Use the model to predict the train dataset.
y_elasticNetReg_pred_train = elasticNetReg.predict(X_train_ridge_reg)

# Calculate the eualuation metrics of the model.
from sklearn.metrics import r2_score
r2_score_elasticNetReg_train = r2_score(y_elasticNetReg_pred_train, y_train_ridge_reg)
r2_score_elasticNetReg_test = r2_score(y_elasticNetReg_pred_test, y_test_ridge_reg)
rmse_elasticNetReg = np.sqrt(mean_squared_error(y_lassoreg_pred_test, y_test_ridge_reg)**2)
print('r2_ score for train dataset for multi linear reg : ', r2_score_elasticNetReg_train)
print('r2_ score for test dataset for multi linear reg : ', r2_score_elasticNetReg_test)
print('root mean squared error for multi linear reg : ', rmse_elasticNetReg)

r2_ score for train dataset for multi linear reg :  -97076.91299047269
r2_ score for test dataset for multi linear reg :  -103047.22391574454
root mean squared error for multi linear reg :  1846994.3544547232

Measuring Error

In [40]:

Models = [('Simple Linear Regression', r2_score_slreg_train, r2_score_slreg_test, rmse_slreg),
          ('Multiplt Linear Regression', r2_score_mlreg_train, r2_score_mlreg_test, rmse_mlreg),
          ('Ridge Regression', r2_score_ridgereg_train, r2_score_ridgereg_test, rmse_ridgereg),
          ('Lasso Regression', r2_score_lassoreg_train, r2_score_lassoreg_test, rmse_lassoreg),]

In [41]:

predict = pd.DataFrame(data = Models, columns = ['Models', 'r2_score Training', 'r2_score Testing', 'RMSE'])
predict

Out[41]:

	Models	r2_score Training	r2_score Testing	RMSE
0	Simple Linear Regression	0.697815	0.673439	3.438902e+06
1	Multiplt Linear Regression	0.845391	0.854122	1.836110e+06
2	Ridge Regression	0.831887	0.838473	1.879140e+06
3	Lasso Regression	0.842858	0.850284	1.846994e+06

Visualization

The performance of each algorithm is visualized as below:

In [42]:

f, axes = plt.subplots(3,1, figsize=(18,8))

sns.barplot(x='Models', y='r2_score Training', data = predict, ax = axes[0])
axes[0].set_xlabel('Models')
axes[0].set_ylabel('r2_score Training')
axes[0].set_ylim(0,1.0)

sns.barplot(x='Models', y='r2_score Testing', data = predict, ax = axes[1])
axes[0].set_xlabel('Models')
axes[0].set_ylabel('r2_score Testing')
axes[0].set_ylim(0,1.0)

sns.barplot(x='Models', y='RMSE', data = predict, ax = axes[2])
axes[0].set_xlabel('Models')
axes[0].set_ylabel('RMSE')
axes[0].set_ylim(0,1.0)

Out[42]:

(0.0, 1.0)

In [ ]:

Neural Networks and MLPs on Fashion MNIST

Introduction

This section introduces neural networks and multilayer perceptrons (MLPs), then trains a simple model on Fashion MNIST.

Neural networks: Neural networks are modeled loosely on the human brain. A neural network can contain many connected units that transform input data into increasingly useful representations.

Multilayer perceptrons: MLPs are feedforward neural networks with one or more hidden layers between the input and output layers. They are a core building block for supervised learning tasks.

Below we have the steps to learn the Fashion MNIST database with neural networks and a bunch of built-in libraries.

First we have to import needed libraries:

In [2]:

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

We import the Fashion MNIST dataset, a dataset which contains 70,000 grayscale images in 10 categories. The images show individual articles of clothing at low resolution (28 by 28 pixels).

Here, 60,000 images are used to train the network and 10,000 images to evaluate how accurately the network learned to classify images. We can access the Fashion MNIST directly from TensorFlow.

In [3]:

fashion_mnist = tf.keras.datasets.fashion_mnist

(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
32768/29515 [=================================] - 0s 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
26427392/26421880 [==============================] - 0s 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
8192/5148 [===============================================] - 0s 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz
4423680/4422102 [==============================] - 0s 0us/step

The labels are an array of integers, ranging from 0 to 9. These correspond to the class of clothing the image represents:

Screenshot from 2021-06-12 17-38-45.png

In [4]:

class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

If you inspect the first image in the training set, you will see that the pixel values fall in the range of 0 to 255, we scale these values to a range of 0 to 1 before feeding them to the neural network model. To do so, divide the values by 255.

In [6]:

train_images = train_images / 255.0
test_images = test_images / 255.0

In order to see if the scaling worked correctly, and to get a sense from the pictures obtain an output.

In [7]:

plt.figure(figsize=(10,10))
for i in range(25):
    plt.subplot(5,5,i+1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(train_images[i], cmap=plt.cm.binary)
    plt.xlabel(class_names[train_labels[i]])
plt.show()

Now we have to create our neural network. The basic building block of a neural network is the layer. Layers extract representations from the data fed into them. Hopefully, these representations are meaningful for the problem at hand.

Most of deep learning consists of chaining together simple layers. Most layers, such as tf.keras.layers.Dense, have parameters that are learned during training.

The first layer in this network, tf.keras.layers.Flatten, transforms the format of the images from a two-dimensional array (of 28 by 28 pixels) to a one-dimensional array (of 28 * 28 = 784 pixels). In other words, this layer flattens the data by unstacking the rows of pixels in the image and lining them up. This layer has no parameters to learn; it only reformats the data.

After the pixels are flattened, the network consists of a sequence of two tf.keras.layers.Dense layers. These are densely connected, or fully connected, neural layers. The first Dense layer has 128 nodes (or neurons). The second (and last) layer returns a logits array with length of 10. Each node contains a score that indicates the current image belongs to one of the 10 classes.

In [10]:

model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(10)
])
#compiling the model
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

Training the Model

Training the neural network model requires the following steps:

Feed the training data to the model. In this example, the training data is in the train_images and train_labels arrays.
Let the model learn to associate images and labels.
Ask the model to make predictions about the test set.
Verify that the predictions match the labels from the test_labels array.

In [11]:

model.fit(train_images, train_labels, epochs=10)

Epoch 1/10
1875/1875 [==============================] - 5s 2ms/step - loss: 0.4943 - accuracy: 0.8267
Epoch 2/10
1875/1875 [==============================] - 4s 2ms/step - loss: 0.3721 - accuracy: 0.8671
Epoch 3/10
1875/1875 [==============================] - 4s 2ms/step - loss: 0.3369 - accuracy: 0.8756
Epoch 4/10
1875/1875 [==============================] - 4s 2ms/step - loss: 0.3126 - accuracy: 0.8849
Epoch 5/10
1875/1875 [==============================] - 4s 2ms/step - loss: 0.2944 - accuracy: 0.8913
Epoch 6/10
1875/1875 [==============================] - 4s 2ms/step - loss: 0.2788 - accuracy: 0.8969
Epoch 7/10
1875/1875 [==============================] - 4s 2ms/step - loss: 0.2670 - accuracy: 0.9014
Epoch 8/10
1875/1875 [==============================] - 4s 2ms/step - loss: 0.2563 - accuracy: 0.9044
Epoch 9/10
1875/1875 [==============================] - 3s 2ms/step - loss: 0.2457 - accuracy: 0.9075
Epoch 10/10
1875/1875 [==============================] - 3s 2ms/step - loss: 0.2366 - accuracy: 0.9110

Out[11]:

<tensorflow.python.keras.callbacks.History at 0x7f4a7bc54f90>

As the model trains, the loss and accuracy metrics are displayed. This model reaches an accuracy of about 0.91 (or 91%) on the training data.

Next, we have to test the trained model against the test data.

In [12]:

test_loss, test_acc = model.evaluate(test_images,  test_labels, verbose=2)

print('\nTest accuracy:', test_acc)

313/313 - 0s - loss: 0.3595 - accuracy: 0.8737

Test accuracy: 0.8737000226974487

It turns out that the accuracy on the test dataset is a little less than the accuracy on the training dataset. This gap between training accuracy and test accuracy represents overfitting. Overfitting happens when a machine learning model performs worse on new, previously unseen inputs than it does on the training data. An overfitted model "memorizes" the noise and details in the training dataset to a point where it negatively impacts the performance of the model on the new data.

With the model trained, you can use it to make predictions about some images.

Let's plot several images with their predictions. Note that the model can be wrong even when very confident.

In [16]:

probability_model = tf.keras.Sequential([model, 
                                         tf.keras.layers.Softmax()])
predictions = probability_model.predict(test_images)

def plot_image(i, predictions_array, true_label, img):
  true_label, img = true_label[i], img[i]
  plt.grid(False)
  plt.xticks([])
  plt.yticks([])

  plt.imshow(img, cmap=plt.cm.binary)

  predicted_label = np.argmax(predictions_array)
  if predicted_label == true_label:
    color = 'blue'
  else:
    color = 'red'

  plt.xlabel("{} {:2.0f}% ({})".format(class_names[predicted_label],
                                100*np.max(predictions_array),
                                class_names[true_label]),
                                color=color)

def plot_value_array(i, predictions_array, true_label):
  true_label = true_label[i]
  plt.grid(False)
  plt.xticks(range(10))
  plt.yticks([])
  thisplot = plt.bar(range(10), predictions_array, color="#777777")
  plt.ylim([0, 1])
  predicted_label = np.argmax(predictions_array)

  thisplot[predicted_label].set_color('red')
  thisplot[true_label].set_color('blue')

In [17]:

# Plot the first X test images, their predicted labels, and the true labels.
# Color correct predictions in blue and incorrect predictions in red.
num_rows = 5
num_cols = 3
num_images = num_rows*num_cols
plt.figure(figsize=(2*2*num_cols, 2*num_rows))
for i in range(num_images):
  plt.subplot(num_rows, 2*num_cols, 2*i+1)
  plot_image(i, predictions[i], test_labels, test_images)
  plt.subplot(num_rows, 2*num_cols, 2*i+2)
  plot_value_array(i, predictions[i], test_labels)
plt.tight_layout()
plt.show()

We also can use the model to classify a single photo:

In [19]:

img = test_images[20]
img = (np.expand_dims(img,0))
predictions_single = probability_model.predict(img)
plot_value_array(1, predictions_single[0], test_labels)
_ = plt.xticks(range(10), class_names, rotation=45)

Deep Neural Networks and CNNs

Convolutional Neural Networks on MNIST

Classification with CNNs

Importing Libraries

Loading the Dataset

Visualizing Samples

Defining the CNN Model

Training the CNN Model on MNIST

Evaluating on Test Data

Batch Normalization

Dropout

Visualizing Filters

Visualizing Feature Maps

References

Linear Regression Baselines

Introduction

Importing Packages

Reading Data

Dataset Overview

Regression Models

Linear Regression

Simple Linear Regression

Multiple Regression

Ridge Regression

Lasso Regression

Measuring Error

Visualization

Neural Networks and MLPs on Fashion MNIST

Introduction

Training the Model

References

Mahdi Salmani

Parsa Hosseini

Alireza Dehghanpour Farashah

Ali J. Alaee