Deep learning (22): Practice of improving deep neural networks

Posted by LanceT on Thu, 16 Sep 2021 19:23:55 +0200

Three methods of initializing parameters

First, download the dataset and the helper files required for this exercise ( Download link ) and put them in the project folder created below.

Then open PyCharm, create a new project called improved neural network, and create a new Python file called init.py. It imports the required libraries and the dataset and implements three initialization methods: zero initialization, random initialization, and He initialization.

Zero initialization: pass initialization = "zeros" to the model.
Random initialization: pass initialization = "random".
He initialization: pass initialization = "he". (A short usage sketch follows this list.)
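
For reference, here is a minimal usage sketch of how the initialization argument selects the scheme; it assumes the model function defined in the code further below, and only one call would normally be uncommented at a time:

# parameters = model(train_X, train_Y, initialization="zeros")
# parameters = model(train_X, train_Y, initialization="random")
parameters = model(train_X, train_Y, initialization="he")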

Finally, the code for the three initialization methods is given below. You can uncomment the training code under a given method to see how it trains; only the result of the last method, He initialization, is shown here.

# Import the required libraries and datasets
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import sklearn.datasets
import init_utils   #Part I, initialization
import reg_utils    #The second part is regularization
import gc_utils     #The third part is gradient verification
from init_utils import sigmoid, relu, compute_loss, forward_propagation, backward_propagation
from init_utils import update_parameters, predict, load_dataset, plot_decision_boundary, predict_dec

# set default size of plots
plt.rcParams['figure.figsize'] = (7.0, 4.0)
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
# Read and draw data
train_X, train_Y, test_X, test_Y = load_dataset()
#Drawing display
# plt.show()

# Define neural network model
def model(X, Y, learning_rate=0.01, num_iterations=15000, print_cost=True, initialization="he", is_plot=True):
    """
    Implement a three-layer neural network: LINEAR ->RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID

    Parameters:
        X - Input data of shape (2, number of training/test examples)
        Y - Labels, 0 or 1, of shape (1, number of input examples)
        learning_rate - Learning rate
        num_iterations - Number of iterations
        print_cost - Whether to print the cost value every 1000 iterations
        initialization - String, the initialization type ["zeros" | "random" | "he"]
        is_plot - Whether to draw the cost curve of gradient descent
    return
        parameters - Parameters after learning
    """
    grads = {}
    costs = []
    m = X.shape[1]
    layers_dims = [X.shape[0], 10, 5, 1]

    # Select the type of initialization parameter
    if initialization == "zeros":
        parameters = initialize_parameters_zeros(layers_dims)
    elif initialization == "random":
        parameters = initialize_parameters_random(layers_dims)
    elif initialization == "he":
        parameters = initialize_parameters_he(layers_dims)
    else:
        print("Bad initialization parameters! Program exit")
        exit

    # Start learning
    for i in range(0, num_iterations):
        # Forward propagation
        a3, cache = init_utils.forward_propagation(X, parameters)

        # Calculate cost
        cost = init_utils.compute_loss(a3, Y)

        # Back propagation
        grads = init_utils.backward_propagation(X, Y, cache)

        # Update parameters
        parameters = init_utils.update_parameters(parameters, grads, learning_rate)

        # Record cost
        if i % 1000 == 0:
            costs.append(cost)
            # Print cost
            if print_cost:
                print("The first" + str(i) + "For the second iteration, the cost value is:" + str(cost))

    # After learning, draw the cost curve
    if is_plot:
        plt.plot(costs)
        plt.ylabel('cost')
        plt.xlabel('iterations (x1,000)')
        plt.title("Learning rate =" + str(learning_rate))
        plt.show()

    # Return parameters after learning
    return parameters


# All parameters are initialized to 0
def initialize_parameters_zeros(layers_dims):
    """
    Set all parameters of the model to 0

    Parameters:
        layers_dims - List, the number of layers of the model and the number of nodes corresponding to each layer
    return
        parameters - Contains all W and b Dictionary of
            W1 - Weight matrix, dimension( layers_dims[1], layers_dims[0])
            b1 - Offset vector, dimension( layers_dims[1],1)
            ···
            WL - Weight matrix, dimension( layers_dims[L], layers_dims[L -1])
            bL - Offset vector, dimension( layers_dims[L],1)
    """
    parameters = {}

    L = len(layers_dims)  # Number of network layers

    for l in range(1, L):
        parameters["W" + str(l)] = np.zeros((layers_dims[l], layers_dims[l - 1]))
        parameters["b" + str(l)] = np.zeros((layers_dims[l], 1))

    assert (parameters["W" + str(l)].shape == (layers_dims[l], layers_dims[l - 1]))
    assert (parameters["b" + str(l)].shape == (layers_dims[l], 1))

    return parameters

# Test to see if they are all 0
# parameters = initialize_parameters_zeros([3,2,1])
# print("W1 = " + str(parameters["W1"]))
# print("b1 = " + str(parameters["b1"]))
# print("W2 = " + str(parameters["W2"]))
# print("b2 = " + str(parameters["b2"]))

# Train the model with zero initialization
#parameters = model(train_X, train_Y, initialization="zeros", is_plot=True)


#View forecast results
#print ("training set:")
#predictions_train = init_utils.predict(train_X, train_Y, parameters)
#print ("test set:")
#predictions_test = init_utils.predict(test_X, test_Y, parameters)

# Parameter random initialization
def initialize_parameters_random(layers_dims):
    """
    Parameters:
        layers_dims - List, the number of layers of the model and the number of nodes corresponding to each layer
    return
        parameters - Contains all W and b Dictionary of
            W1 - Weight matrix, dimension( layers_dims[1], layers_dims[0])
            b1 - Offset vector, dimension( layers_dims[1],1)
            ···
            WL - Weight matrix, dimension( layers_dims[L], layers_dims[L -1])
            bL - Offset vector, dimension( layers_dims[L],1)
            Initialize the weights to large random values (scaled by *10) and the biases to 0:
            use np.random.randn(..,..) * 10 for the weights and np.zeros((.., ..)) for the biases
    """

    parameters = {}
    L = len(layers_dims)  # Number of layers

    for l in range(1, L):
        parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l - 1]) * 10  # Use 10x zoom
        parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))

        # Use assertions to ensure that my data format is correct
        assert (parameters["W" + str(l)].shape == (layers_dims[l], layers_dims[l - 1]))
        assert (parameters["b" + str(l)].shape == (layers_dims[l], 1))

    return parameters

# Test the parameter output of random initialization
# parameters = initialize_parameters_random([3, 2, 1])
# print("W1 = " + str(parameters["W1"]))
# print("b1 = " + str(parameters["b1"]))
# print("W2 = " + str(parameters["W2"]))
# print("b2 = " + str(parameters["b2"]))

#Train the model with random initialization
#parameters = model(train_X, train_Y, initialization="random", is_plot=True)

#Output the accuracy of training set and test set
# print("training set:")
# predictions_train = init_utils.predict(train_X, train_Y, parameters)
# print("test set:")
# predictions_test = init_utils.predict(test_X, test_Y, parameters)

# View the classification results of the graph
# plt.title("Model with large random initialization")
# axes = plt.gca()
# axes.set_xlim([-1.5, 1.5])
# axes.set_ylim([-1.5, 1.5])
# init_utils.plot_decision_boundary(lambda x: init_utils.predict_dec(parameters, x.T), train_X, train_Y)


# He initialization, according to the paper of he et al
def initialize_parameters_he(layers_dims):
    """
    Parameters:
        layers_dims - List, the number of layers of the model and the number of nodes corresponding to each layer
    return
        parameters - Contains all W and b Dictionary of
            W1 - Weight matrix, dimension( layers_dims[1], layers_dims[0])
            b1 - Offset vector, dimension( layers_dims[1],1)
            ···
            WL - Weight matrix, dimension( layers_dims[L], layers_dims[L -1])
            bL - Offset vector, dimension( layers_dims[L],1)
    """

    np.random.seed(3)  # Specify random seed
    parameters = {}
    L = len(layers_dims)  # Number of layers

    for l in range(1, L):
        parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l - 1]) * np.sqrt(2 / layers_dims[l - 1])
        parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))

        # Use assertions to ensure that my data format is correct
        assert (parameters["W" + str(l)].shape == (layers_dims[l], layers_dims[l - 1]))
        assert (parameters["b" + str(l)].shape == (layers_dims[l], 1))

    return parameters

# Train the model and output the accuracy of the model
parameters = model(train_X, train_Y, initialization="he", is_plot=True)
#
#
print("Training set:")
predictions_train = init_utils.predict(train_X, train_Y, parameters)
print("Test set:")
predictions_test = init_utils.predict(test_X, test_Y, parameters)

# Draw a picture of the forecast
plt.title("Model with He initialization")
axes = plt.gca()
axes.set_xlim([-1.5, 1.5])
axes.set_ylim([-1.5, 1.5])
init_utils.plot_decision_boundary(lambda x: init_utils.predict_dec(parameters, x.T), train_X, train_Y)

The initially loaded data is shown below.

With the He method, the cost curve J is as follows.

The final classification result:

Complete code for weight parameter initialization

# Import the required libraries and datasets
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import sklearn.datasets
import init_utils   #Part I, initialization
import reg_utils    #The second part is regularization
import gc_utils     #The third part is gradient verification
from init_utils import sigmoid, relu, compute_loss, forward_propagation, backward_propagation
from init_utils import update_parameters, predict, load_dataset, plot_decision_boundary, predict_dec

# set default size of plots
plt.rcParams['figure.figsize'] = (7.0, 4.0)
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
# Read and draw data
train_X, train_Y, test_X, test_Y = load_dataset()
#Drawing display
plt.show()

# Define neural network model
def model(X, Y, learning_rate=0.01, num_iterations=15000, print_cost=True, initialization="he", is_plot=True):
    """
    Implement a three-layer neural network: LINEAR ->RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID

    Parameters:
        X - Input data of shape (2, number of training/test examples)
        Y - Labels, 0 or 1, of shape (1, number of input examples)
        learning_rate - Learning rate
        num_iterations - Number of iterations
        print_cost - Whether to print the cost value every 1000 iterations
        initialization - String, the initialization type ["zeros" | "random" | "he"]
        is_plot - Whether to draw the cost curve of gradient descent
    return
        parameters - Parameters after learning
    """
    grads = {}
    costs = []
    m = X.shape[1]
    layers_dims = [X.shape[0], 10, 5, 1]

    # Select the type of initialization parameter
    if initialization == "zeros":
        parameters = initialize_parameters_zeros(layers_dims)
    elif initialization == "random":
        parameters = initialize_parameters_random(layers_dims)
    elif initialization == "he":
        parameters = initialize_parameters_he(layers_dims)
    else:
        print("Bad initialization parameters! Program exit")
        exit

    # Start learning
    for i in range(0, num_iterations):
        # Forward propagation
        a3, cache = init_utils.forward_propagation(X, parameters)

        # Calculate cost
        cost = init_utils.compute_loss(a3, Y)

        # Back propagation
        grads = init_utils.backward_propagation(X, Y, cache)

        # Update parameters
        parameters = init_utils.update_parameters(parameters, grads, learning_rate)

        # Record cost
        if i % 1000 == 0:
            costs.append(cost)
            # Print cost
            if print_cost:
                print("The first" + str(i) + "For the second iteration, the cost value is:" + str(cost))

    # After learning, draw the cost curve
    if is_plot:
        plt.plot(costs)
        plt.ylabel('cost')
        plt.xlabel('iterations (x1,000)')
        plt.title("Learning rate =" + str(learning_rate))
        plt.show()

    # Return parameters after learning
    return parameters


# All parameters are initialized to 0
def initialize_parameters_zeros(layers_dims):
    """
    Set all parameters of the model to 0

    Parameters:
        layers_dims - List, the number of layers of the model and the number of nodes corresponding to each layer
    return
        parameters - Contains all W and b Dictionary of
            W1 - Weight matrix, dimension( layers_dims[1], layers_dims[0])
            b1 - Offset vector, dimension( layers_dims[1],1)
            ···
            WL - Weight matrix, dimension( layers_dims[L], layers_dims[L -1])
            bL - Offset vector, dimension( layers_dims[L],1)
    """
    parameters = {}

    L = len(layers_dims)  # Number of network layers

    for l in range(1, L):
        parameters["W" + str(l)] = np.zeros((layers_dims[l], layers_dims[l - 1]))
        parameters["b" + str(l)] = np.zeros((layers_dims[l], 1))

    assert (parameters["W" + str(l)].shape == (layers_dims[l], layers_dims[l - 1]))
    assert (parameters["b" + str(l)].shape == (layers_dims[l], 1))

    return parameters

# Test to see if they are all 0
# parameters = initialize_parameters_zeros([3,2,1])
# print("W1 = " + str(parameters["W1"]))
# print("b1 = " + str(parameters["b1"]))
# print("W2 = " + str(parameters["W2"]))
# print("b2 = " + str(parameters["b2"]))

# Train the model with zero initialization
#parameters = model(train_X, train_Y, initialization="zeros", is_plot=True)


#View forecast results
#print ("training set:")
#predictions_train = init_utils.predict(train_X, train_Y, parameters)
#print ("test set:")
#predictions_test = init_utils.predict(test_X, test_Y, parameters)

# Parameter random initialization
def initialize_parameters_random(layers_dims):
    """
    Parameters:
        layers_dims - List, the number of layers of the model and the number of nodes corresponding to each layer
    return
        parameters - Contains all W and b Dictionary of
            W1 - Weight matrix, dimension( layers_dims[1], layers_dims[0])
            b1 - Offset vector, dimension( layers_dims[1],1)
            ···
            WL - Weight matrix, dimension( layers_dims[L], layers_dims[L -1])
            bL - Offset vector, dimension( layers_dims[L],1)
            Initialize the weights to large random values (scaled by *10) and the biases to 0:
            use np.random.randn(..,..) * 10 for the weights and np.zeros((.., ..)) for the biases
    """

    parameters = {}
    L = len(layers_dims)  # Number of layers

    for l in range(1, L):
        parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l - 1]) * 10  # Use 10x zoom
        parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))

        # Use assertions to ensure that my data format is correct
        assert (parameters["W" + str(l)].shape == (layers_dims[l], layers_dims[l - 1]))
        assert (parameters["b" + str(l)].shape == (layers_dims[l], 1))

    return parameters

# Test the parameter output of random initialization
# parameters = initialize_parameters_random([3, 2, 1])
# print("W1 = " + str(parameters["W1"]))
# print("b1 = " + str(parameters["b1"]))
# print("W2 = " + str(parameters["W2"]))
# print("b2 = " + str(parameters["b2"]))

#Train the model with random initialization
#parameters = model(train_X, train_Y, initialization="random", is_plot=True)

#Output the accuracy of training set and test set
# print("training set:")
# predictions_train = init_utils.predict(train_X, train_Y, parameters)
# print("test set:")
# predictions_test = init_utils.predict(test_X, test_Y, parameters)

# View the classification results of the graph
# plt.title("Model with large random initialization")
# axes = plt.gca()
# axes.set_xlim([-1.5, 1.5])
# axes.set_ylim([-1.5, 1.5])
# init_utils.plot_decision_boundary(lambda x: init_utils.predict_dec(parameters, x.T), train_X, train_Y)


# He initialization, according to the paper of he et al
def initialize_parameters_he(layers_dims):
    """
    Parameters:
        layers_dims - List, the number of layers of the model and the number of nodes corresponding to each layer
    return
        parameters - Contains all W and b Dictionary of
            W1 - Weight matrix, dimension( layers_dims[1], layers_dims[0])
            b1 - Offset vector, dimension( layers_dims[1],1)
            ···
            WL - Weight matrix, dimension( layers_dims[L], layers_dims[L -1])
            bL - Offset vector, dimension( layers_dims[L],1)
    """

    np.random.seed(3)  # Specify random seed
    parameters = {}
    L = len(layers_dims)  # Number of layers

    for l in range(1, L):
        parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l - 1]) * np.sqrt(2 / layers_dims[l - 1])
        parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))

        # Use assertions to ensure that my data format is correct
        assert (parameters["W" + str(l)].shape == (layers_dims[l], layers_dims[l - 1]))
        assert (parameters["b" + str(l)].shape == (layers_dims[l], 1))

    return parameters

# Train the model and output the accuracy of the model
parameters = model(train_X, train_Y, initialization="he", is_plot=True)
#
#
print("Training set:")
predictions_train = init_utils.predict(train_X, train_Y, parameters)
print("Test set:")
predictions_test = init_utils.predict(test_X, test_Y, parameters)

# Draw a picture of the forecast
plt.title("Model with He initialization")
axes = plt.gca()
axes.set_xlim([-1.5, 1.5])
axes.set_ylim([-1.5, 1.5])
init_utils.plot_decision_boundary(lambda x: init_utils.predict_dec(parameters, x.T), train_X, train_Y)


Summary

  • With zero initialization, the model predicts 0 for every example. In general, initializing all weights to zero fails to break symmetry: every neuron in a given layer learns the same thing. The weights W should therefore be initialized randomly; it is fine to initialize the biases b to 0.
  • With random initialization, initializing the weights to very large random values does not work well; smaller random values are better. The important question is: how small should these random values be?
  • Try "He initialization", named after He et al. It is similar to "Xavier initialization", except that Xavier initialization scales the weights by sqrt(1./layers_dims[l-1]) while He initialization uses sqrt(2./layers_dims[l-1]) (see the short sketch after this list).
  • Different initializations lead to different results. Random initialization breaks symmetry so that different hidden units can learn different things; just do not initialize to values that are too large. He initialization works well for networks with ReLU activations.
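
As an illustration only, here is a minimal sketch (toy layer sizes, not part of the assignment code) comparing the scale factors mentioned above:

import numpy as np

n_prev, n_curr = 10, 5                        # toy layer sizes chosen only for illustration
np.random.seed(0)
base = np.random.randn(n_curr, n_prev)

W_large  = base * 10                          # the "large random" initialization used above
W_xavier = base * np.sqrt(1. / n_prev)        # Xavier scaling: sqrt(1 / n_prev)
W_he     = base * np.sqrt(2. / n_prev)        # He scaling:     sqrt(2 / n_prev)

# The He weights have sqrt(2) times the standard deviation of the Xavier weights
print(np.std(W_large), np.std(W_xavier), np.std(W_he))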

Regularization

Regularization helps with the overfitting that occurs when training data is insufficient.
Continue by creating a new Python file, zheng_ze.py, in the project directory. The required data and libraries are the same as above. The first step is to import the libraries and load the data, as in the code below.

import numpy as np
import matplotlib.pyplot as plt
from reg_utils import sigmoid, relu, plot_decision_boundary, initialize_parameters, load_2D_dataset, predict_dec
from reg_utils import compute_cost, predict, forward_propagation, backward_propagation, update_parameters
import sklearn
import sklearn.datasets
import scipy.io
from testCases import *


plt.rcParams['figure.figsize'] = (7.0, 4.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

train_X, train_Y, test_X, test_Y = load_2D_dataset()
# Display data
plt.show()

Our dataset is about a football game. The goal is to predict the positions where the goalkeeper should send the ball so that his own team's players can win it. The dataset is plotted below: the purple points are positions our players can win, and the red points are positions the opponents can win.

Using the non-regularized model

import numpy as np
import matplotlib.pyplot as plt
import reg_utils
from reg_utils import sigmoid, relu, plot_decision_boundary, initialize_parameters, load_2D_dataset, predict_dec
from reg_utils import compute_cost, predict, forward_propagation, backward_propagation, update_parameters
import sklearn
import sklearn.datasets
import scipy.io
from testCases import *


plt.rcParams['figure.figsize'] = (7.0, 4.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

train_X, train_Y, test_X, test_Y = load_2D_dataset()
plt.show()

# Define our model
def model(X, Y, learning_rate=0.3, num_iterations=30000, print_cost=True, is_plot=True, lambd=0, keep_prob=1):
    """
    Implement a three-layer neural network: LINEAR ->RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID

    Parameters:
        X - Input data of shape (2, number of training/test examples)
        Y - Labels, [0 (blue) | 1 (red)], of shape (1, number of input examples)
        learning_rate - Learning rate
        num_iterations - Number of iterations
        print_cost - Whether to print the cost value; printed every 10000 iterations, and one cost value is recorded every 1000 iterations
        is_plot - Whether to draw the cost curve of gradient descent
        lambd - Regularization hyperparameter, a real number; set lambd to a non-zero value to enable L2 regularization, otherwise it is turned off
        We use "lambd" rather than "lambda" because "lambda" is a reserved keyword in Python
        keep_prob - Probability of keeping a node active during dropout; set it to a value less than 1 to enable dropout
    return
        parameters - Parameters after learning
    """
    grads = {}
    costs = []
    m = X.shape[1]
    layers_dims = [X.shape[0], 20, 3, 1]

    # Initialization parameters
    parameters = reg_utils.initialize_parameters(layers_dims)

    # Start learning
    for i in range(0, num_iterations):
        # Forward propagation
        ##Delete nodes randomly
        if keep_prob == 1:
            ###Do not randomly delete nodes
            a3, cache = reg_utils.forward_propagation(X, parameters)
        elif keep_prob < 1:
            ###Randomly delete nodes
            a3, cache = forward_propagation_with_dropout(X, parameters, keep_prob)
        else:
            print("keep_prob Parameter error! Program exit.")
            exit

        # Calculate cost
        ## Whether to use two norm
        if lambd == 0:
            ###L2 regularization is not used
            cost = reg_utils.compute_cost(a3, Y)
        else:
            ###Using L2 regularization
            cost = compute_cost_with_regularization(a3, Y, parameters, lambd)

        # Back propagation
        ##L2 regularization and random deletion of nodes can be used at the same time, but not in this experiment.
        assert (lambd == 0 or keep_prob == 1)

        ##Usage of two parameters
        if (lambd == 0 and keep_prob == 1):
            ### Do not use L2 regularization and do not use random deletion of nodes
            grads = reg_utils.backward_propagation(X, Y, cache)
        elif lambd != 0:
            ### L2 regularization is used instead of randomly deleting nodes
            grads = backward_propagation_with_regularization(X, Y, cache, lambd)
        elif keep_prob < 1:
            ### Randomly delete nodes without L2 regularization
            grads = backward_propagation_with_dropout(X, Y, cache, keep_prob)

        # Update parameters
        parameters = reg_utils.update_parameters(parameters, grads, learning_rate)

        # Record and print costs
        if i % 1000 == 0:
            ## Record cost
            costs.append(cost)
            if (print_cost and i % 10000 == 0):
                # Print cost
                print("The first" + str(i) + "For the second iteration, the cost value is:" + str(cost))

    # Whether to draw cost curve
    if is_plot:
        plt.plot(costs)
        plt.ylabel('cost')
        plt.xlabel('iterations (x1,000)')
        plt.title("Learning rate =" + str(learning_rate))
        plt.show()

    # Return learned parameters
    return parameters

# Training without regularization and showing accuracy
parameters = model(train_X, train_Y,is_plot=True)
print("Training set:")
predictions_train = reg_utils.predict(train_X, train_Y, parameters)
print("Test set:")
predictions_test = reg_utils.predict(test_X, test_Y, parameters)

# Prediction results of data
plt.title("Model without regularization")
axes = plt.gca()
axes.set_xlim([-0.75,0.40])
axes.set_ylim([-0.75,0.65])
reg_utils.plot_decision_boundary(lambda x: reg_utils.predict_dec(parameters, x.T), train_X, train_Y)


It can be seen that the training accuracy is 94.8%, while the test accuracy is 91.5%.

As the figure below shows, the non-regularized model clearly overfits the training set and even fits some of the noise points.

Using L2 regularization

The loss function using L2 regularization needs to be modified, as shown below

J_{regularized} = \underbrace{-\frac{1}{m}\sum_{i=1}^{m}\left(y^{(i)}\log(a^{[L](i)}) + (1-y^{(i)})\log(1-a^{[L](i)})\right)}_{\text{cross-entropy cost}} + \underbrace{\frac{1}{m}\frac{\lambda}{2}\sum\limits_l\sum\limits_k\sum\limits_j W_{k,j}^{[l]2}}_{\text{L2 regularization cost}} \tag{2}
To compute \sum\limits_k\sum\limits_j W_{k,j}^{[l]2}, use np.sum(np.square(Wl)).
Note: you must do this for W^{[1]}, W^{[2]} and W^{[3]}, then add the three terms and multiply by \frac{1}{m}\frac{\lambda}{2}.
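
As a quick sanity check of the squared-weight term, here is a toy computation (the matrices and values below are made up purely for illustration, they are not the assignment's):

import numpy as np

W1 = np.array([[1., 2.], [3., 4.]])   # toy weight matrices
W2 = np.array([[0.5, -0.5]])
m, lambd = 4, 0.7

# (lambd / (2*m)) * sum of squared entries over all weight matrices
L2_term = (lambd / (2 * m)) * (np.sum(np.square(W1)) + np.sum(np.square(W2)))
print(L2_term)   # approximately 2.66875, i.e. (0.7 / 8) * (30 + 0.5)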

# Cost function calculation of L2 regularization
def compute_cost_with_regularization(A3, Y, parameters, lambd):
    """
    Compute the cost with L2 regularization, as given in formula (2)

    Parameters:
        A3 - The dimension of the output result of forward propagation is (number of output nodes, training/Number of tests)
        Y - The label vector corresponds to the data one by one, and the dimension is(Number of output nodes, training/Number of tests)
        parameters - Dictionary containing parameters after model learning
    return:
        cost - The value of the regularization loss calculated using equation 2

    """
    m = Y.shape[1]
    W1 = parameters["W1"]
    W2 = parameters["W2"]
    W3 = parameters["W3"]

    cross_entropy_cost = reg_utils.compute_cost(A3, Y)

    L2_regularization_cost = lambd * (np.sum(np.square(W1)) + np.sum(np.square(W2)) + np.sum(np.square(W3))) / (2 * m)

    cost = cross_entropy_cost + L2_regularization_cost

    return cost

Because the cost function J is changed, the backward propagation also needs to be redefined. The code is as follows
The gradient of the regularization term is \frac{d}{dW}\left(\frac{1}{2}\frac{\lambda}{m}W^2\right) = \frac{\lambda}{m}W.

def backward_propagation_with_regularization(X, Y, cache, lambd):
    """
    Backward propagation for the model to which we added L2 regularization.

    Parameters:
        X - Input dataset, of shape (number of input nodes, number of examples)
        Y - Labels, of shape (number of output nodes, number of examples)
        cache - cache output from forward_propagation()
        lambd - Regularization hyperparameter, a real number

    return:
        gradients - A dictionary containing gradients for each parameter, activation value, and pre activation value variable
    """

    m = X.shape[1]

    (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) = cache

    dZ3 = A3 - Y

    dW3 = (1 / m) * np.dot(dZ3, A2.T) + ((lambd * W3) / m)
    db3 = (1 / m) * np.sum(dZ3, axis=1, keepdims=True)

    dA2 = np.dot(W3.T, dZ3)
    dZ2 = np.multiply(dA2, np.int64(A2 > 0))
    dW2 = (1 / m) * np.dot(dZ2, A1.T) + ((lambd * W2) / m)
    db2 = (1 / m) * np.sum(dZ2, axis=1, keepdims=True)

    dA1 = np.dot(W2.T, dZ2)
    dZ1 = np.multiply(dA1, np.int64(A1 > 0))
    dW1 = (1 / m) * np.dot(dZ1, X.T) + ((lambd * W1) / m)
    db1 = (1 / m) * np.sum(dZ1, axis=1, keepdims=True)

    gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3, "dA2": dA2,
                 "dZ2": dZ2, "dW2": dW2, "db2": db2, "dA1": dA1,
                 "dZ1": dZ1, "dW1": dW1, "db1": db1}

    return gradients

Now we train the model using L2 regularization and output the accuracy, the plot of the cost function J, and the predicted classification results.

import numpy as np
import matplotlib.pyplot as plt
import reg_utils
from reg_utils import sigmoid, relu, plot_decision_boundary, initialize_parameters, load_2D_dataset, predict_dec
from reg_utils import compute_cost, predict, forward_propagation, backward_propagation, update_parameters
import sklearn
import sklearn.datasets
import scipy.io
from testCases import *


plt.rcParams['figure.figsize'] = (7.0, 4.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

train_X, train_Y, test_X, test_Y = load_2D_dataset()
plt.show()

# Define our model
def model(X, Y, learning_rate=0.3, num_iterations=30000, print_cost=True, is_plot=True, lambd=0, keep_prob=1):
    """
    Implement a three-layer neural network: LINEAR ->RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID

    Parameters:
        X - Input data of shape (2, number of training/test examples)
        Y - Labels, [0 (blue) | 1 (red)], of shape (1, number of input examples)
        learning_rate - Learning rate
        num_iterations - Number of iterations
        print_cost - Whether to print the cost value; printed every 10000 iterations, and one cost value is recorded every 1000 iterations
        is_plot - Whether to draw the cost curve of gradient descent
        lambd - Regularization hyperparameter, a real number; set lambd to a non-zero value to enable L2 regularization, otherwise it is turned off
        We use "lambd" rather than "lambda" because "lambda" is a reserved keyword in Python
        keep_prob - Probability of keeping a node active during dropout; set it to a value less than 1 to enable dropout
    return
        parameters - Parameters after learning
    """
    grads = {}
    costs = []
    m = X.shape[1]
    layers_dims = [X.shape[0], 20, 3, 1]

    # Initialization parameters
    parameters = reg_utils.initialize_parameters(layers_dims)

    # Start learning
    for i in range(0, num_iterations):
        # Forward propagation
        ##Delete nodes randomly
        if keep_prob == 1:
            ###Do not randomly delete nodes
            a3, cache = reg_utils.forward_propagation(X, parameters)
        elif keep_prob < 1:
            ###Randomly delete nodes
            a3, cache = forward_propagation_with_dropout(X, parameters, keep_prob)
        else:
            print("keep_prob Parameter error! Program exit.")
            exit

        # Calculate cost
        ## Whether to use two norm
        if lambd == 0:
            ###L2 regularization is not used
            cost = reg_utils.compute_cost(a3, Y)
        else:
            ###Using L2 regularization
            cost = compute_cost_with_regularization(a3, Y, parameters, lambd)

        # Back propagation
        ##L2 regularization and random deletion of nodes can be used at the same time, but not in this experiment.
        assert (lambd == 0 or keep_prob == 1)

        ##Usage of two parameters
        if (lambd == 0 and keep_prob == 1):
            ### Do not use L2 regularization and do not use random deletion of nodes
            grads = reg_utils.backward_propagation(X, Y, cache)
        elif lambd != 0:
            ### L2 regularization is used instead of randomly deleting nodes
            grads = backward_propagation_with_regularization(X, Y, cache, lambd)
        elif keep_prob < 1:
            ### Randomly delete nodes without L2 regularization
            grads = backward_propagation_with_dropout(X, Y, cache, keep_prob)

        # Update parameters
        parameters = reg_utils.update_parameters(parameters, grads, learning_rate)

        # Record and print costs
        if i % 1000 == 0:
            ## Record cost
            costs.append(cost)
            if (print_cost and i % 10000 == 0):
                # Print cost
                print("The first" + str(i) + "For the second iteration, the cost value is:" + str(cost))

    # Whether to draw cost curve
    if is_plot:
        plt.plot(costs)
        plt.ylabel('cost')
        plt.xlabel('iterations (x1,000)')
        plt.title("Learning rate =" + str(learning_rate))
        plt.show()

    # Return learned parameters
    return parameters

# Training without regularization and showing accuracy
# parameters = model(train_X, train_Y,is_plot=True)
# print("training set:")
# predictions_train = reg_utils.predict(train_X, train_Y, parameters)
# print("test set:")
# predictions_test = reg_utils.predict(test_X, test_Y, parameters)

# Prediction results of data
# plt.title("Model without regularization")
# axes = plt.gca()
# axes.set_xlim([-0.75,0.40])
# axes.set_ylim([-0.75,0.65])
# reg_utils.plot_decision_boundary(lambda x: reg_utils.predict_dec(parameters, x.T), train_X, train_Y)

# Cost function calculation of L2 regularization
def compute_cost_with_regularization(A3, Y, parameters, lambd):
    """
    Compute the cost with L2 regularization, as given in formula (2)

    Parameters:
        A3 - The dimension of the output result of forward propagation is (number of output nodes, training/Number of tests)
        Y - The label vector corresponds to the data one by one, and the dimension is(Number of output nodes, training/Number of tests)
        parameters - Dictionary containing parameters after model learning
    return:
        cost - The value of the regularization loss calculated using equation 2

    """
    m = Y.shape[1]
    W1 = parameters["W1"]
    W2 = parameters["W2"]
    W3 = parameters["W3"]

    cross_entropy_cost = reg_utils.compute_cost(A3, Y)

    L2_regularization_cost = lambd * (np.sum(np.square(W1)) + np.sum(np.square(W2)) + np.sum(np.square(W3))) / (2 * m)

    cost = cross_entropy_cost + L2_regularization_cost

    return cost


# Of course, because the cost function is changed, we must also change the back-propagation function, and all gradients must be calculated according to the new cost value.
def backward_propagation_with_regularization(X, Y, cache, lambd):
    """
    Backward propagation for the model to which we added L2 regularization.

    Parameters:
        X - Input dataset, of shape (number of input nodes, number of examples)
        Y - Labels, of shape (number of output nodes, number of examples)
        cache - cache output from forward_propagation()
        lambd - Regularization hyperparameter, a real number

    return:
        gradients - A dictionary containing gradients for each parameter, activation value, and pre activation value variable
    """

    m = X.shape[1]

    (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) = cache

    dZ3 = A3 - Y

    dW3 = (1 / m) * np.dot(dZ3, A2.T) + ((lambd * W3) / m)
    db3 = (1 / m) * np.sum(dZ3, axis=1, keepdims=True)

    dA2 = np.dot(W3.T, dZ3)
    dZ2 = np.multiply(dA2, np.int64(A2 > 0))
    dW2 = (1 / m) * np.dot(dZ2, A1.T) + ((lambd * W2) / m)
    db2 = (1 / m) * np.sum(dZ2, axis=1, keepdims=True)

    dA1 = np.dot(W2.T, dZ2)
    dZ1 = np.multiply(dA1, np.int64(A1 > 0))
    dW1 = (1 / m) * np.dot(dZ1, X.T) + ((lambd * W1) / m)
    db1 = (1 / m) * np.sum(dZ1, axis=1, keepdims=True)

    gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3, "dA2": dA2,
                 "dZ2": dZ2, "dW2": dW2, "db2": db2, "dA1": dA1,
                 "dZ1": dZ1, "dW1": dW1, "db1": db1}

    return gradients

# Model training using L2 regularization
parameters = model(train_X, train_Y, lambd=0.7,is_plot=True)

# Output accuracy
print("Using regularization, the training set:")
predictions_train = reg_utils.predict(train_X, train_Y, parameters)
print("Using regularization, test set:")
predictions_test = reg_utils.predict(test_X, test_Y, parameters)

#View the classification results of the dataset
plt.title("Model with L2-regularization")
axes = plt.gca()
axes.set_xlim([-0.75,0.40])
axes.set_ylim([-0.75,0.65])
reg_utils.plot_decision_boundary(lambda x: reg_utils.predict_dec(parameters, x.T), train_X, train_Y)

As can be seen from the results in the figure below, the accuracy has improved to 93.5%.


Note: lambd in the code stands for λ, which is a hyperparameter. L2 regularization makes the decision boundary smoother; if λ is too large, it can become too smooth. The effect of L2 regularization: a regularization term is added to the cost function, an extra term appears in the gradient of each weight matrix, and the weights end up smaller ("weight decay"): they are pushed toward smaller values.
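
To see the "weight decay" effect concretely, here is a minimal sketch (toy numbers, not taken from the model above) of one gradient-descent step with and without the extra (λ/m)·W term:

import numpy as np

W = np.array([[1.0, -2.0]])        # toy weights
dW_data = np.array([[0.1, 0.1]])   # made-up gradient of the cross-entropy part
learning_rate, lambd, m = 0.3, 0.7, 10

W_plain = W - learning_rate * dW_data                      # update without regularization
W_decay = W - learning_rate * (dW_data + (lambd / m) * W)  # update with the L2 term
print(W_plain)   # entries: 0.97 and -2.03
print(W_decay)   # entries: about 0.949 and -1.988 -> both pulled toward zero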

Random deactivation using Dropout

Dropout is a regularization technique widely used in deep learning. At each iteration it randomly shuts off some neurons: each neuron in a layer is kept with probability keep_prob and shut off otherwise. Neurons that are shut off contribute neither to forward propagation nor to backward propagation in that iteration. The steps work as follows:
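
Here is a minimal sketch of the four inverted-dropout steps on toy activations (the same steps appear as comments in the full code further below):

import numpy as np

np.random.seed(1)
A = np.array([[1.0, 2.0, 3.0, 4.0]])          # toy activations of one layer
keep_prob = 0.5

D = np.random.rand(A.shape[0], A.shape[1])    # step 1: random mask values in [0, 1)
D = D < keep_prob                             # step 2: keep each unit with probability keep_prob
A = A * D                                     # step 3: shut off the dropped units
A = A / keep_prob                             # step 4: rescale so the expected activation is unchanged
print(D)
print(A)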

Create a dropout.py file in the same project folder, import the relevant libraries, load the data and define the neural network model.

import numpy as np
import matplotlib.pyplot as plt
import reg_utils
from reg_utils import sigmoid, relu, plot_decision_boundary, initialize_parameters, load_2D_dataset, predict_dec
from reg_utils import compute_cost, predict, forward_propagation, backward_propagation, update_parameters
import sklearn
import sklearn.datasets
import scipy.io
from testCases import *


plt.rcParams['figure.figsize'] = (7.0, 4.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

train_X, train_Y, test_X, test_Y = load_2D_dataset()
plt.show()

# Define our model
def model(X, Y, learning_rate=0.3, num_iterations=30000, print_cost=True, is_plot=True, lambd=0, keep_prob=1):
    """
    Implement a three-layer neural network: LINEAR ->RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID

    Parameters:
        X - Input data of shape (2, number of training/test examples)
        Y - Labels, [0 (blue) | 1 (red)], of shape (1, number of input examples)
        learning_rate - Learning rate
        num_iterations - Number of iterations
        print_cost - Whether to print the cost value; printed every 10000 iterations, and one cost value is recorded every 1000 iterations
        is_plot - Whether to draw the cost curve of gradient descent
        lambd - Regularization hyperparameter, a real number; set lambd to a non-zero value to enable L2 regularization, otherwise it is turned off
        We use "lambd" rather than "lambda" because "lambda" is a reserved keyword in Python
        keep_prob - Probability of keeping a node active during dropout; set it to a value less than 1 to enable dropout
    return
        parameters - Parameters after learning
    """
    grads = {}
    costs = []
    m = X.shape[1]
    layers_dims = [X.shape[0], 20, 3, 1]

    # Initialization parameters
    parameters = reg_utils.initialize_parameters(layers_dims)

    # Start learning
    for i in range(0, num_iterations):
        # Forward propagation
        ##Delete nodes randomly
        if keep_prob == 1:
            ###Do not randomly delete nodes
            a3, cache = reg_utils.forward_propagation(X, parameters)
        elif keep_prob < 1:
            ###Randomly delete nodes
            a3, cache = forward_propagation_with_dropout(X, parameters, keep_prob)
        else:
            print("keep_prob Parameter error! Program exit.")
            exit

        # Calculate cost
        ## Whether to use two norm
        if lambd == 0:
            ###L2 regularization is not used
            cost = reg_utils.compute_cost(a3, Y)
        else:
            ###Using L2 regularization
            cost = compute_cost_with_regularization(a3, Y, parameters, lambd)

        # Back propagation
        ##L2 regularization and random deletion of nodes can be used at the same time, but not in this experiment.
        assert (lambd == 0 or keep_prob == 1)

        ##Usage of two parameters
        if (lambd == 0 and keep_prob == 1):
            ### Do not use L2 regularization and do not use random deletion of nodes
            grads = reg_utils.backward_propagation(X, Y, cache)
        elif lambd != 0:
            ### L2 regularization is used instead of randomly deleting nodes
            grads = backward_propagation_with_regularization(X, Y, cache, lambd)
        elif keep_prob < 1:
            ### Randomly delete nodes without L2 regularization
            grads = backward_propagation_with_dropout(X, Y, cache, keep_prob)

        # Update parameters
        parameters = reg_utils.update_parameters(parameters, grads, learning_rate)

        # Record and print costs
        if i % 1000 == 0:
            ## Record cost
            costs.append(cost)
            if (print_cost and i % 10000 == 0):
                # Print cost
                print("The first" + str(i) + "For the second iteration, the cost value is:" + str(cost))

    # Whether to draw cost curve
    if is_plot:
        plt.plot(costs)
        plt.ylabel('cost')
        plt.xlabel('iterations (x1,000)')
        plt.title("Learning rate =" + str(learning_rate))
        plt.show()

    # Return learned parameters
    return parameters
# Define dropout random deactivation forward propagation
def forward_propagation_with_dropout(X, parameters, keep_prob=0.5):
    """
    Forward propagation with dropout (randomly shutting off nodes).
    LINEAR -> RELU + DROPOUT -> LINEAR -> RELU + DROPOUT -> LINEAR -> SIGMOID.

    Parameters:
        X  - Input dataset, of shape (2, number of examples)
        parameters - python dictionary containing the parameters "W1", "b1", "W2", "b2", "W3", "b3":
            W1  - Weight matrix of shape (20, 2)
            b1  - Bias vector of shape (20, 1)
            W2  - Weight matrix of shape (3, 20)
            b2  - Bias vector of shape (3, 1)
            W3  - Weight matrix of shape (1, 3)
            b3  - Bias vector of shape (1, 1)
        keep_prob  - Probability of keeping a node active during dropout, a real number
    return:
        A3  - Last activation value, of shape (1, number of examples), output of forward propagation
        cache - Tuple storing some of the values needed to compute backward propagation
    """
    np.random.seed(1)

    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W3 = parameters["W3"]
    b3 = parameters["b3"]

    # LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID
    Z1 = np.dot(W1, X) + b1
    A1 = reg_utils.relu(Z1)

    # The four steps below implement inverted dropout for layer 1.
    D1 = np.random.rand(A1.shape[0], A1.shape[1])  # Step 1: initialize the mask D1 = np.random.rand(...,...)
    D1 = D1 < keep_prob  # Step 2: convert the entries of D1 to 0 or 1 (using keep_prob as the threshold)
    A1 = A1 * D1  # Step 3: shut off some nodes of A1 (set them to 0 or False)
    A1 = A1 / keep_prob  # Step 4: scale the values of the kept (non-zero) nodes
    """
    #If you don't understand, just run the following code.
    import numpy as np
    np.random.seed(1)
    A1 = np.random.randn(1,3)

    D1 = np.random.rand(A1.shape[0],A1.shape[1])
    keep_prob=0.5
    D1 = D1 < keep_prob
    print(D1)

    A1 = 0.01
    A1 = A1 * D1
    A1 = A1 / keep_prob
    print(A1)
    """

    Z2 = np.dot(W2, A1) + b2
    A2 = reg_utils.relu(Z2)

    # The same four steps, applied to layer 2.
    D2 = np.random.rand(A2.shape[0], A2.shape[1])  # Step 1: initialize the mask D2 = np.random.rand(...,...)
    D2 = D2 < keep_prob  # Step 2: convert the entries of D2 to 0 or 1 (using keep_prob as the threshold)
    A2 = A2 * D2  # Step 3: shut off some nodes of A2 (set them to 0 or False)
    A2 = A2 / keep_prob  # Step 4: scale the values of the kept (non-zero) nodes

    Z3 = np.dot(W3, A2) + b3
    A3 = reg_utils.sigmoid(Z3)

    cache = (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3)

    return A3, cache

# The corresponding backward propagation function is changed
def backward_propagation_with_dropout(X, Y, cache, keep_prob):
    """
    Backward propagation for the model with dropout.
    Parameters:
        X  - Input dataset, of shape (2, number of examples)
        Y  - Labels, of shape (number of output nodes, number of examples)
        cache - cache output from forward_propagation_with_dropout()
        keep_prob  - Probability of keeping a node active during dropout, a real number

    return:
        gradients - A dictionary of gradient values for each parameter, activation value, and pre activation variable
    """
    m = X.shape[1]
    (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3) = cache

    dZ3 = A3 - Y
    dW3 = (1 / m) * np.dot(dZ3, A2.T)
    db3 = 1. / m * np.sum(dZ3, axis=1, keepdims=True)
    dA2 = np.dot(W3.T, dZ3)

    dA2 = dA2 * D2  # Step 1: apply the same mask D2 used during forward propagation, so the same nodes stay shut off
    dA2 = dA2 / keep_prob  # Step 2: scale the values of the kept (non-zero) nodes

    dZ2 = np.multiply(dA2, np.int64(A2 > 0))
    dW2 = 1. / m * np.dot(dZ2, A1.T)
    db2 = 1. / m * np.sum(dZ2, axis=1, keepdims=True)

    dA1 = np.dot(W2.T, dZ2)

    dA1 = dA1 * D1  # Step 1: apply the same mask D1 used during forward propagation, so the same nodes stay shut off
    dA1 = dA1 / keep_prob  # Step 2: scale the values of the kept (non-zero) nodes

    dZ1 = np.multiply(dA1, np.int64(A1 > 0))
    dW1 = 1. / m * np.dot(dZ1, X.T)
    db1 = 1. / m * np.sum(dZ1, axis=1, keepdims=True)

    gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3, "dA2": dA2,
                 "dZ2": dZ2, "dW2": dW2, "db2": db2, "dA1": dA1,
                 "dZ1": dZ1, "dW1": dW1, "db1": db1}

    return gradients

Next, we train the model with dropout regularization and show the accuracy, the cost curve J, and the final predicted classification.

# Train the model with dropout regularization; each unit is dropped with probability 0.14 (keep_prob=0.86)
parameters = model(train_X, train_Y, keep_prob=0.86, learning_rate=0.3,is_plot=True)

print("Use to randomly delete nodes, training sets:")
predictions_train = reg_utils.predict(train_X, train_Y, parameters)
print("Test set using randomly deleted nodes:")
reg_utils.predictions_test = reg_utils.predict(test_X, test_Y, parameters)

# View forecast classification
plt.title("Model with dropout")
axes = plt.gca()
axes.set_xlim([-0.75, 0.40])
axes.set_ylim([-0.75, 0.65])
reg_utils.plot_decision_boundary(lambda x: reg_utils.predict_dec(parameters, x.T), train_X, train_Y)


Note:

  • Dropout is a regularization technique.
  • Use dropout only during training, not during testing (a short check of why no extra scaling is needed at test time follows this list).
  • Dropout is applied during both forward and backward propagation.
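
A minimal check (toy values) of the second point: because the activations are divided by keep_prob during training, their expected value is unchanged, so test-time forward propagation needs no extra scaling:

import numpy as np

np.random.seed(2)
a = 3.0                                        # a single toy activation value
keep_prob = 0.86
masks = np.random.rand(100000) < keep_prob     # simulate many dropout draws for this unit
dropped = a * masks / keep_prob                # inverted dropout applied to the same unit
print(dropped.mean())                          # close to 3.0, the original activation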

Regularization complete code

import numpy as np
import matplotlib.pyplot as plt
import reg_utils
from reg_utils import sigmoid, relu, plot_decision_boundary, initialize_parameters, load_2D_dataset, predict_dec
from reg_utils import compute_cost, predict, forward_propagation, backward_propagation, update_parameters
import sklearn
import sklearn.datasets
import scipy.io
from testCases import *


plt.rcParams['figure.figsize'] = (7.0, 4.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

train_X, train_Y, test_X, test_Y = load_2D_dataset()
plt.show()

# Define our model
def model(X, Y, learning_rate=0.3, num_iterations=30000, print_cost=True, is_plot=True, lambd=0, keep_prob=1):
    """
    Implement a three-layer neural network: LINEAR ->RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID

    Parameters:
        X - Input data of shape (2, number of training/test examples)
        Y - Labels, [0 (blue) | 1 (red)], of shape (1, number of input examples)
        learning_rate - Learning rate
        num_iterations - Number of iterations
        print_cost - Whether to print the cost value; printed every 10000 iterations, and one cost value is recorded every 1000 iterations
        is_plot - Whether to draw the cost curve of gradient descent
        lambd - Regularization hyperparameter, a real number; set lambd to a non-zero value to enable L2 regularization, otherwise it is turned off
        We use "lambd" rather than "lambda" because "lambda" is a reserved keyword in Python
        keep_prob - Probability of keeping a node active during dropout; set it to a value less than 1 to enable dropout
    return
        parameters - Parameters after learning
    """
    grads = {}
    costs = []
    m = X.shape[1]
    layers_dims = [X.shape[0], 20, 3, 1]

    # Initialization parameters
    parameters = reg_utils.initialize_parameters(layers_dims)

    # Start learning
    for i in range(0, num_iterations):
        # Forward propagation
        ##Delete nodes randomly
        if keep_prob == 1:
            ###Do not randomly delete nodes
            a3, cache = reg_utils.forward_propagation(X, parameters)
        elif keep_prob < 1:
            ###Randomly delete nodes
            a3, cache = forward_propagation_with_dropout(X, parameters, keep_prob)
        else:
            print("keep_prob Parameter error! Program exit.")
            exit

        # Calculate cost
        ## Whether to use two norm
        if lambd == 0:
            ###L2 regularization is not used
            cost = reg_utils.compute_cost(a3, Y)
        else:
            ###Using L2 regularization
            cost = compute_cost_with_regularization(a3, Y, parameters, lambd)

        # Back propagation
        ##L2 regularization and random deletion of nodes can be used at the same time, but not in this experiment.
        assert (lambd == 0 or keep_prob == 1)

        ##Usage of two parameters
        if (lambd == 0 and keep_prob == 1):
            ### Do not use L2 regularization and do not use random deletion of nodes
            grads = reg_utils.backward_propagation(X, Y, cache)
        elif lambd != 0:
            ### L2 regularization is used instead of randomly deleting nodes
            grads = backward_propagation_with_regularization(X, Y, cache, lambd)
        elif keep_prob < 1:
            ### Randomly delete nodes without L2 regularization
            grads = backward_propagation_with_dropout(X, Y, cache, keep_prob)

        # Update parameters
        parameters = reg_utils.update_parameters(parameters, grads, learning_rate)

        # Record and print costs
        if i % 1000 == 0:
            ## Record cost
            costs.append(cost)
            if (print_cost and i % 10000 == 0):
                # Print cost
                print("The first" + str(i) + "For the second iteration, the cost value is:" + str(cost))

    # Whether to draw cost curve
    if is_plot:
        plt.plot(costs)
        plt.ylabel('cost')
        plt.xlabel('iterations (x1,000)')
        plt.title("Learning rate =" + str(learning_rate))
        plt.show()

    # Return learned parameters
    return parameters

# Training without regularization and showing accuracy
# parameters = model(train_X, train_Y,is_plot=True)
# print("training set:")
# predictions_train = reg_utils.predict(train_X, train_Y, parameters)
# print("test set:")
# predictions_test = reg_utils.predict(test_X, test_Y, parameters)

# Prediction results of data
# plt.title("Model without regularization")
# axes = plt.gca()
# axes.set_xlim([-0.75,0.40])
# axes.set_ylim([-0.75,0.65])
# reg_utils.plot_decision_boundary(lambda x: reg_utils.predict_dec(parameters, x.T), train_X, train_Y)

# Cost function calculation of L2 regularization
def compute_cost_with_regularization(A3, Y, parameters, lambd):
    """
    Compute the cost with L2 regularization, as given in formula (2)

    Parameters:
        A3 - The dimension of the output result of forward propagation is (number of output nodes, training/Number of tests)
        Y - The label vector corresponds to the data one by one, and the dimension is(Number of output nodes, training/Number of tests)
        parameters - Dictionary containing parameters after model learning
    return:
        cost - The value of the regularization loss calculated using equation 2

    """
    m = Y.shape[1]
    W1 = parameters["W1"]
    W2 = parameters["W2"]
    W3 = parameters["W3"]

    cross_entropy_cost = reg_utils.compute_cost(A3, Y)

    L2_regularization_cost = lambd * (np.sum(np.square(W1)) + np.sum(np.square(W2)) + np.sum(np.square(W3))) / (2 * m)

    cost = cross_entropy_cost + L2_regularization_cost

    return cost


# Of course, because the cost function is changed, we must also change the back-propagation function, and all gradients must be calculated according to the new cost value.
def backward_propagation_with_regularization(X, Y, cache, lambd):
    """
    Backward propagation for the model to which we added L2 regularization.

    Parameters:
        X - Input dataset, of shape (number of input nodes, number of examples)
        Y - Labels, of shape (number of output nodes, number of examples)
        cache - cache output from forward_propagation()
        lambd - Regularization hyperparameter, a real number

    return:
        gradients - A dictionary containing gradients for each parameter, activation value, and pre activation value variable
    """

    m = X.shape[1]

    (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) = cache

    dZ3 = A3 - Y

    dW3 = (1 / m) * np.dot(dZ3, A2.T) + ((lambd * W3) / m)
    db3 = (1 / m) * np.sum(dZ3, axis=1, keepdims=True)

    dA2 = np.dot(W3.T, dZ3)
    dZ2 = np.multiply(dA2, np.int64(A2 > 0))
    dW2 = (1 / m) * np.dot(dZ2, A1.T) + ((lambd * W2) / m)
    db2 = (1 / m) * np.sum(dZ2, axis=1, keepdims=True)

    dA1 = np.dot(W2.T, dZ2)
    dZ1 = np.multiply(dA1, np.int64(A1 > 0))
    dW1 = (1 / m) * np.dot(dZ1, X.T) + ((lambd * W1) / m)
    db1 = (1 / m) * np.sum(dZ1, axis=1, keepdims=True)

    gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3, "dA2": dA2,
                 "dZ2": dZ2, "dW2": dW2, "db2": db2, "dA1": dA1,
                 "dZ1": dZ1, "dW1": dW1, "db1": db1}

    return gradients

# Model training using L2 regularization
parameters = model(train_X, train_Y, lambd=0.7,is_plot=True)

# Output accuracy
print("Using regularization, the training set:")
predictions_train = reg_utils.predict(train_X, train_Y, parameters)
print("Using regularization, test set:")
predictions_test = reg_utils.predict(test_X, test_Y, parameters)

#View the classification results of the dataset
plt.title("Model with L2-regularization")
axes = plt.gca()
axes.set_xlim([-0.75,0.40])
axes.set_ylim([-0.75,0.65])
reg_utils.plot_decision_boundary(lambda x: reg_utils.predict_dec(parameters, x.T), train_X, train_Y)

Random deactivation complete code

import numpy as np
import matplotlib.pyplot as plt
import reg_utils
from reg_utils import sigmoid, relu, plot_decision_boundary, initialize_parameters, load_2D_dataset, predict_dec
from reg_utils import compute_cost, predict, forward_propagation, backward_propagation, update_parameters
import sklearn
import sklearn.datasets
import scipy.io
from testCases import *


plt.rcParams['figure.figsize'] = (7.0, 4.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

train_X, train_Y, test_X, test_Y = load_2D_dataset()
plt.show()

# Define our model
def model(X, Y, learning_rate=0.3, num_iterations=30000, print_cost=True, is_plot=True, lambd=0, keep_prob=1):
    """
    Implement a three-layer neural network: LINEAR ->RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID

    Parameters:
        X - The dimension of the entered data is(2, To train/Number of tests)
        Y - Label, [0](blue) | 1(gules)],Dimension is(1,This corresponds to the label of the input data)
        learning_rate - Learning rate
        num_iterations - Number of iterations
        print_cost - Whether to print the cost value. Print once every 10000 iterations, but record one cost value every 1000 iterations
        is_polt - Whether to draw the curve of gradient descent
        lambd - Regularized hyperparameters, real numbers, will lambd If the input is set to a non-zero value, regularization is enabled; otherwise, regularization is turned off
        We use“ lambd"Not“ lambda",Because“ lambda"yes Python Reserved keywords in
        keep_prob - The probability of randomly deleting nodes is set to a value less than 1 when regularization is turned on.
    return
        parameters - Parameters after learning
    """
    grads = {}
    costs = []
    m = X.shape[1]
    layers_dims = [X.shape[0], 20, 3, 1]

    # Initialization parameters
    parameters = reg_utils.initialize_parameters(layers_dims)

    # Start learning
    for i in range(0, num_iterations):
        # Forward propagation
        ## Dropout: decide whether to randomly drop nodes
        if keep_prob == 1:
            ### Keep all nodes (no dropout)
            a3, cache = reg_utils.forward_propagation(X, parameters)
        elif keep_prob < 1:
            ### Randomly drop nodes
            a3, cache = forward_propagation_with_dropout(X, parameters, keep_prob)
        else:
            print("keep_prob parameter error! Exiting.")
            exit()

        # Calculate cost
        ## Whether to use the L2 (two-norm) penalty
        if lambd == 0:
            ###L2 regularization is not used
            cost = reg_utils.compute_cost(a3, Y)
        else:
            ###Using L2 regularization
            cost = compute_cost_with_regularization(a3, Y, parameters, lambd)

        # Back propagation
        ## L2 regularization and dropout could be used together, but we do not combine them in this experiment.
        assert (lambd == 0 or keep_prob == 1)

        ## Choose the backward pass according to the two hyperparameters
        if (lambd == 0 and keep_prob == 1):
            ### Neither L2 regularization nor dropout
            grads = reg_utils.backward_propagation(X, Y, cache)
        elif lambd != 0:
            ### L2 regularization without dropout
            grads = backward_propagation_with_regularization(X, Y, cache, lambd)
        elif keep_prob < 1:
            ### Dropout without L2 regularization
            grads = backward_propagation_with_dropout(X, Y, cache, keep_prob)

        # Update parameters
        parameters = reg_utils.update_parameters(parameters, grads, learning_rate)

        # Record and print costs
        if i % 1000 == 0:
            ## Record cost
            costs.append(cost)
            if print_cost and i % 10000 == 0:
                # Print cost
                print("Iteration " + str(i) + ", cost: " + str(cost))

    # Whether to draw cost curve
    if is_plot:
        plt.plot(costs)
        plt.ylabel('cost')
        plt.xlabel('iterations (x1,000)')
        plt.title("Learning rate =" + str(learning_rate))
        plt.show()

    # Return learned parameters
    return parameters

# Define forward propagation with dropout (random deactivation)
def forward_propagation_with_dropout(X, parameters, keep_prob=0.5):
    """
    Implements forward propagation with randomly dropped nodes.
    LINEAR -> RELU + DROPOUT -> LINEAR -> RELU + DROPOUT -> LINEAR -> SIGMOID.

    Parameters:
        X - input dataset, of shape (2, number of examples)
        parameters - python dictionary containing the parameters "W1", "b1", "W2", "b2", "W3", "b3":
            W1 - weight matrix of shape (20, 2)
            b1 - bias vector of shape (20, 1)
            W2 - weight matrix of shape (3, 20)
            b2 - bias vector of shape (3, 1)
            W3 - weight matrix of shape (1, 3)
            b3 - bias vector of shape (1, 1)
        keep_prob - probability of keeping a node, a real number
    return:
        A3 - last activation value, of shape (1, number of examples), output of forward propagation
        cache - tuple of values needed to compute backward propagation
    """
    np.random.seed(1)

    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W3 = parameters["W3"]
    b3 = parameters["b3"]

    # LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID
    Z1 = np.dot(W1, X) + b1
    A1 = reg_utils.relu(Z1)

    # Dropout applied to A1, in four steps:
    D1 = np.random.rand(A1.shape[0], A1.shape[1])  # Step 1: initialize matrix D1 = np.random.rand(..., ...)
    D1 = D1 < keep_prob  # Step 2: convert the entries of D1 to 0 or 1 (using keep_prob as the threshold)
    A1 = A1 * D1  # Step 3: shut down some nodes of A1 (set their values to 0)
    A1 = A1 / keep_prob  # Step 4: scale the values of the kept nodes (inverted dropout)
    """
    #If you don't understand, just run the following code.
    import numpy as np
    np.random.seed(1)
    A1 = np.random.randn(1,3)

    D1 = np.random.rand(A1.shape[0],A1.shape[1])
    keep_prob=0.5
    D1 = D1 < keep_prob
    print(D1)

    A1 = 0.01
    A1 = A1 * D1
    A1 = A1 / keep_prob
    print(A1)
    """

    Z2 = np.dot(W2, A1) + b2
    A2 = reg_utils.relu(Z2)

    # Dropout applied to A2, in the same four steps:
    D2 = np.random.rand(A2.shape[0], A2.shape[1])  # Step 1: initialize matrix D2 = np.random.rand(..., ...)
    D2 = D2 < keep_prob  # Step 2: convert the entries of D2 to 0 or 1 (using keep_prob as the threshold)
    A2 = A2 * D2  # Step 3: shut down some nodes of A2 (set their values to 0)
    A2 = A2 / keep_prob  # Step 4: scale the values of the kept nodes (inverted dropout)

    Z3 = np.dot(W3, A2) + b3
    A3 = reg_utils.sigmoid(Z3)

    cache = (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3)

    return A3, cache
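A note on step 4 (the division by keep_prob, often called inverted dropout): each mask entry d is 1 with probability keep_prob, so the expected value of a rescaled activation equals the original activation,

$$\mathbb{E}\Big[\frac{a \cdot d}{keep\_prob}\Big] = \frac{a \cdot keep\_prob}{keep\_prob} = a,$$

which keeps the scale of the activations at training time consistent with test time, when no dropout is applied.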

# The corresponding backward propagation function is changed
def backward_propagation_with_dropout(X, Y, cache, keep_prob):
    """
    Implements the backward propagation of the model with dropout.
    Parameters:
        X - input dataset, of shape (2, number of examples)
        Y - labels, of shape (number of output nodes, number of examples)
        cache - cache output from forward_propagation_with_dropout()
        keep_prob - probability of keeping a node, a real number

    return:
        gradients - a dictionary with the gradients for each parameter, activation and pre-activation variable
    """
    m = X.shape[1]
    (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3) = cache

    dZ3 = A3 - Y
    dW3 = (1 / m) * np.dot(dZ3, A2.T)
    db3 = 1. / m * np.sum(dZ3, axis=1, keepdims=True)
    dA2 = np.dot(W3.T, dZ3)

    dA2 = dA2 * D2  # Step 1: apply the same mask D2 that was used in forward propagation, shutting down the same nodes
    dA2 = dA2 / keep_prob  # Step 2: scale the values of the kept nodes (inverted dropout)

    dZ2 = np.multiply(dA2, np.int64(A2 > 0))
    dW2 = 1. / m * np.dot(dZ2, A1.T)
    db2 = 1. / m * np.sum(dZ2, axis=1, keepdims=True)

    dA1 = np.dot(W2.T, dZ2)

    dA1 = dA1 * D1  # Step 1: apply the same mask D1 that was used in forward propagation, shutting down the same nodes
    dA1 = dA1 / keep_prob  # Step 2: scale the values of the kept nodes (inverted dropout)

    dZ1 = np.multiply(dA1, np.int64(A1 > 0))
    dW1 = 1. / m * np.dot(dZ1, X.T)
    db1 = 1. / m * np.sum(dZ1, axis=1, keepdims=True)

    gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3, "dA2": dA2,
                 "dZ2": dZ2, "dW2": dW2, "db2": db2, "dA1": dA1,
                 "dZ1": dZ1, "dW1": dW1, "db1": db1}

    return gradients

# Train the model with dropout regularization: each unit is dropped with probability 0.14 (keep_prob = 0.86)
parameters = model(train_X, train_Y, keep_prob=0.86, learning_rate=0.3, is_plot=True)

print("Use to randomly delete nodes, training sets:")
predictions_train = reg_utils.predict(train_X, train_Y, parameters)
print("Test set using randomly deleted nodes:")
reg_utils.predictions_test = reg_utils.predict(test_X, test_Y, parameters)

# View forecast classification
plt.title("Model with dropout")
axes = plt.gca()
axes.set_xlim([-0.75, 0.40])
axes.set_ylim([-0.75, 0.65])
reg_utils.plot_decision_boundary(lambda x: reg_utils.predict_dec(parameters, x.T), train_X, train_Y)

Summary

  • Regularization helps reduce overfitting.
  • Regularization drives the weights toward smaller values.
  • L2 regularization and dropout are two very effective regularization techniques.
  • Regularization hurts training-set performance, because it limits the network's ability to overfit the training set. It still helps, because it ultimately gives better test accuracy.

Gradient checking

Back-propagation is the most error-prone part of the computation, so to verify that the back-propagation function we wrote is correct, we add some code to check it. We start from the definition of the derivative: $\theta$ denotes the parameters of the model, and $\frac{\partial J}{\partial \theta}$ is the value we want to check.
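Concretely, the definition used here is the two-sided (centred) difference:

$$\frac{\partial J}{\partial \theta} = \lim_{\varepsilon \to 0}\frac{J(\theta + \varepsilon) - J(\theta - \varepsilon)}{2\varepsilon}$$

Gradient checking replaces the limit by a small ε (1e-7 in the code below) and compares this numerical estimate with the analytic gradient returned by back-propagation.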

First we test a one-dimensional linear model. The function is $J(\theta) = \theta x$, where $\theta$ is a real-valued parameter and $x$ is the input; the derivative is $\frac{\partial J}{\partial \theta} = x$.
The forward propagation of this function is defined as follows:

def forward_propagation(x,theta):
    """
    
    Realize the linear forward propagation (calculation) presented in the figure J)(J(theta)= theta * x)
    
    Parameters:
    x  - A real value input
    theta  - Parameter is also a real number
    
    return:
    J  - function J Value of, using formula J(theta)= theta * x calculation
    """
    J = np.dot(theta,x)
    
    return J

The backward propagation of this function is defined as follows:

def backward_propagation(x,theta):
    """
    calculation J be relative toθDerivative of.
    
    Parameters:
        x  - A real value input
        theta  - Parameter is also a real number
    
    return:
        dtheta  - be relative toθCost gradient
    """
    dtheta = x
    
    return dtheta

Next comes the gradient-checking step; the subscript 2 in formula (4) denotes the L2 (Euclidean) norm.

Once the gradient-checking formula is defined, we usually consider the computed gradient correct when the difference is smaller than $10^{-7}$.
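Written out, the two formulas that the code implements are the approximate gradient (formula 3) and the relative difference (formula 4):

$$gradapprox = \frac{J(\theta + \varepsilon) - J(\theta - \varepsilon)}{2\varepsilon} \tag{3}$$

$$difference = \frac{\|grad - gradapprox\|_2}{\|grad\|_2 + \|gradapprox\|_2} \tag{4}$$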

def gradient_check(x,theta,epsilon=1e-7):
    """
    
    Implement the back propagation in the diagram.
    
    Parameters:
        x  - A real value input
        theta  - Parameter is also a real number
        epsilon  - Use equation (3) to calculate the small offset of the input to calculate the approximate gradient
    
    return:
        Difference between approximate gradient and backward propagation gradient
    """
    
    #Calculate gradapprox using formula (3).
    thetaplus = theta + epsilon                               # Step 1
    thetaminus = theta - epsilon                              # Step 2
    J_plus = forward_propagation(x, thetaplus)                # Step 3
    J_minus = forward_propagation(x, thetaminus)              # Step 4
    gradapprox = (J_plus - J_minus) / (2 * epsilon)           # Step 5
    
    
    #Check whether gradapprox is close enough to the output of backward_propagation()
    grad = backward_propagation(x, theta)
    
    numerator = np.linalg.norm(grad - gradapprox)                      # Step 1'
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)    # Step 2'
    difference = numerator / denominator                               # Step 3'
    
    if difference < 1e-7:
        print("Gradient check: the gradient is OK!")
    else:
        print("Gradient check: the gradient exceeds the threshold!")
    
    return difference
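As a quick sanity check (the same values used in the commented-out test in the complete code further below), we can call the function on a single point; for this linear J the centred difference is exact up to floating-point error, so the printed difference is far below the 1e-7 threshold:

x, theta = 2, 4
difference = gradient_check(x, theta)
print("difference = " + str(difference))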

Forward propagation in the high-dimensional case

def forward_propagation_n(X,Y,parameters):
    """
    Realize the forward propagation in the diagram (and calculate the cost).
    
    Parameters:
        X - Training set is m Examples
        Y -  m Label for example
        parameters - Include parameters“ W1","b1","W2","b2","W3","b3"of python Dictionaries:
            W1  - Weight matrix with dimension (5),4)
            b1  - Bias, dimension (5),1)
            W2  - Weight matrix with dimension (3),5)
            b2  - Bias, dimension (3),1)
            W3  - Weight matrix with dimension (1),3)
            b3  - Bias, dimension (1),1)
   
    return:
        cost - Cost function( logistic)
    """
    m = X.shape[1]
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W3 = parameters["W3"]
    b3 = parameters["b3"]
    
    # LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID
    Z1 = np.dot(W1,X) + b1
    A1 = gc_utils.relu(Z1)
    
    Z2 = np.dot(W2,A1) + b2
    A2 = gc_utils.relu(Z2)
    
    Z3 = np.dot(W3,A2) + b3
    A3 = gc_utils.sigmoid(Z3)
    
    #Calculate cost
    logprobs = np.multiply(-np.log(A3), Y) + np.multiply(-np.log(1 - A3), 1 - Y)
    cost = (1 / m) * np.sum(logprobs)
    
    cache = (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3)

    return cost, cache

Backward propagation in the high-dimensional case

def backward_propagation_n(X,Y,cache):
    """
    Implements the backward propagation shown in the figure.

    Parameters:
        X - input data point, of shape (number of input nodes, 1)
        Y - label
        cache - cache output from forward_propagation_n()

    return:
        gradients - a dictionary with the cost gradients for each parameter, activation and pre-activation variable.
    """
    m = X.shape[1]
    (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) = cache
    
    dZ3 = A3 - Y
    dW3 = 1. / m * np.dot(dZ3, A2.T)
    db3 = 1. / m * np.sum(dZ3, axis=1, keepdims=True)
    
    dA2 = np.dot(W3.T, dZ3)
    dZ2 = np.multiply(dA2, np.int64(A2 > 0))
    #dW2 = 1. / m * np.dot(dZ2, A1.T) * 2  # Should not multiply by 2
    dW2 = 1. / m * np.dot(dZ2, A1.T)
    db2 = 1. / m * np.sum(dZ2, axis=1, keepdims=True)
    
    dA1 = np.dot(W2.T, dZ2)
    dZ1 = np.multiply(dA1, np.int64(A1 > 0))
    dW1 = 1. / m * np.dot(dZ1, X.T)
    #db1 = 4. / m * np.sum(dZ1, axis=1, keepdims=True) # Should not multiply by 4
    db1 = 1. / m * np.sum(dZ1, axis=1, keepdims=True)
    
    gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3,
                 "dA2": dA2, "dZ2": dZ2, "dW2": dW2, "db2": db2,
                 "dA1": dA1, "dZ1": dZ1, "dW1": dW1, "db1": db1}
 
    return gradients

The gradient-checking formula in the high-dimensional case is the same as the one-dimensional one, but θ is no longer a scalar: it is the dictionary called "parameters". The provided files implement a function dictionary_to_vector() for us. It converts the "parameters" dictionary into a vector called "values", obtained by reshaping all parameters (W1, b1, W2, b2, W3, b3) into column vectors and concatenating them. The inverse function, vector_to_dictionary(), converts such a vector back into the "parameters" dictionary.
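As a rough mental model only (a minimal sketch; the real implementations live in gc_utils and may differ in detail), the two helpers behave roughly as follows, assuming the fixed parameter order W1, b1, W2, b2, W3, b3 and the shapes used in this part:

import numpy as np

def dictionary_to_vector_sketch(parameters):
    # Reshape each parameter into a column and stack the columns into one long vector.
    keys = ["W1", "b1", "W2", "b2", "W3", "b3"]
    vectors = [parameters[k].reshape(-1, 1) for k in keys]
    return np.concatenate(vectors, axis=0), keys

def vector_to_dictionary_sketch(theta):
    # Cut the long vector back into pieces and restore the original shapes.
    shapes = {"W1": (5, 4), "b1": (5, 1), "W2": (3, 5), "b2": (3, 1), "W3": (1, 3), "b3": (1, 1)}
    parameters, start = {}, 0
    for key, shape in shapes.items():
        size = shape[0] * shape[1]
        parameters[key] = theta[start:start + size].reshape(shape)
        start += size
    return parameters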
Define the n-dimensional gradient-checking function

def gradient_check_n(parameters,gradients,X,Y,epsilon=1e-7):
    """
    Checks whether backward_propagation_n correctly computes the gradient of the cost output by forward_propagation_n.

    Parameters:
        parameters - python dictionary containing the parameters "W1", "b1", "W2", "b2", "W3", "b3"
        gradients - output of backward_propagation_n, contains the cost gradients associated with the parameters
        X  - input data point, of shape (number of input nodes, 1)
        Y  - label
        epsilon  - tiny shift of the input used to compute the approximate gradient

    return:
        difference - difference between the approximate gradient and the backward-propagation gradient
    """
    #Initialization parameters
    parameters_values, keys = gc_utils.dictionary_to_vector(parameters)  # keys is not used
    grad = gc_utils.gradients_to_vector(gradients)
    num_parameters = parameters_values.shape[0]
    J_plus = np.zeros((num_parameters,1))
    J_minus = np.zeros((num_parameters,1))
    gradapprox = np.zeros((num_parameters,1))
    
    #Calculate gradapprox
    for i in range(num_parameters):
        #Calculate J_plus [i]. Input: "parameters_values, epsilon". Output = "J_plus [i]"
        thetaplus = np.copy(parameters_values)                                                  # Step 1
        thetaplus[i][0] = thetaplus[i][0] + epsilon                                             # Step 2
        J_plus[i], cache = forward_propagation_n(X,Y,gc_utils.vector_to_dictionary(thetaplus))  # Step 3, the cache is not used
        
        #Calculate J_minus [i]. Input: "parameters_values, epsilon". Output = "J_minus [i]".
        thetaminus = np.copy(parameters_values)                                                 # Step 1
        thetaminus[i][0] = thetaminus[i][0] - epsilon                                           # Step 2        
        J_minus[i], cache = forward_propagation_n(X,Y,gc_utils.vector_to_dictionary(thetaminus))# Step 3, the cache is not used
        
        #Calculate gradapprox[i]
        gradapprox[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)
        
    #Compare gradapprox with the backward-propagation gradient by computing the difference.
    numerator = np.linalg.norm(grad - gradapprox)                                     # Step 1'
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)                   # Step 2'
    difference = numerator / denominator                                              # Step 3'
    
    if difference < 1e-7:
        print("Gradient check: the gradient is OK!")
    else:
        print(difference, "Gradient check: the gradient exceeds the threshold!")
    
    return difference

Verify the gradient-check results in the multi-dimensional case

#gradient_check_n_test_case comes from testCases in the data files
X, Y, parameters = gradient_check_n_test_case()

cost, cache = forward_propagation_n(X, Y, parameters)
gradients = backward_propagation_n(X, Y, cache)
difference = gradient_check_n(parameters, gradients, X, Y)

Finally, the multi-dimensional gradient check is run and its result printed.

Gradient checking complete code

import numpy as np
import matplotlib.pyplot as plt
import reg_utils
import gc_utils
from reg_utils import sigmoid, relu, plot_decision_boundary, initialize_parameters, load_2D_dataset, predict_dec
from reg_utils import compute_cost, predict, forward_propagation, backward_propagation, update_parameters
import sklearn
import sklearn.datasets
import scipy.io
from testCases import *

# Forward propagation of one-dimensional linear model
def forward_propagation(x, theta):
    """

    Realize the linear forward propagation (calculation) presented in the figure J)(J(theta)= theta * x)

    Parameters:
    x  - A real value input
    theta  - Parameter is also a real number

    return:
    J  - function J Value of, using formula J(theta)= theta * x calculation
    """
    J = np.dot(theta, x)

    return J

#Back propagation
def backward_propagation(x, theta):
    """
    calculation J be relative toθDerivative of.

    Parameters:
        x  - A real value input
        theta  - Parameter is also a real number

    return:
        dtheta  - be relative toθCost gradient
    """
    dtheta = x

    return dtheta
# One-dimensional linear gradient check function
def gradient_check(x, theta, epsilon=1e-7):
    """

    Implement the back propagation in the diagram.

    Parameters:
        x  - A real value input
        theta  - Parameter is also a real number
        epsilon  - Use equation (3) to calculate the small offset of the input to calculate the approximate gradient

    return:
        Difference between approximate gradient and backward propagation gradient
    """

    # Calculate gradapprox using formula (3).
    thetaplus = theta + epsilon  # Step 1
    thetaminus = theta - epsilon  # Step 2
    J_plus = forward_propagation(x, thetaplus)  # Step 3
    J_minus = forward_propagation(x, thetaminus)  # Step 4
    gradapprox = (J_plus - J_minus) / (2 * epsilon)  # Step 5

    # Check whether gradapprox is close enough to the output of backward_propagation()
    grad = backward_propagation(x, theta)

    numerator = np.linalg.norm(grad - gradapprox)  # Step 1'
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)  # Step 2'
    difference = numerator / denominator  # Step 3'

    if difference < 1e-7:
        print("Gradient check: the gradient is OK!")
    else:
        print("Gradient check: the gradient exceeds the threshold!")

    return difference

# Test run results
# print("------------------ test gradient_check ---------------")
# x, theta = 2, 4
# difference = gradient_check(x, theta)
# print("difference = " + str(difference))

# Define multidimensional forward propagation
def forward_propagation_n(X, Y, parameters):
    """
    Implements the forward propagation shown in the figure (and computes the cost).

    Parameters:
        X - training set of m examples
        Y - labels of the m examples
        parameters - python dictionary containing the parameters "W1", "b1", "W2", "b2", "W3", "b3":
            W1  - weight matrix of shape (5, 4)
            b1  - bias vector of shape (5, 1)
            W2  - weight matrix of shape (3, 5)
            b2  - bias vector of shape (3, 1)
            W3  - weight matrix of shape (1, 3)
            b3  - bias vector of shape (1, 1)

    return:
        cost - value of the cost function (logistic)
        cache - tuple of values needed for backward propagation
    """
    m = X.shape[1]
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W3 = parameters["W3"]
    b3 = parameters["b3"]

    # LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID
    Z1 = np.dot(W1, X) + b1
    A1 = gc_utils.relu(Z1)

    Z2 = np.dot(W2, A1) + b2
    A2 = gc_utils.relu(Z2)

    Z3 = np.dot(W3, A2) + b3
    A3 = gc_utils.sigmoid(Z3)

    # Calculate cost
    logprobs = np.multiply(-np.log(A3), Y) + np.multiply(-np.log(1 - A3), 1 - Y)
    cost = (1 / m) * np.sum(logprobs)

    cache = (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3)

    return cost, cache

# Multidimensional back propagation
def backward_propagation_n(X, Y, cache):
    """
    Implements the backward propagation shown in the figure.

    Parameters:
        X - input data point, of shape (number of input nodes, 1)
        Y - label
        cache - cache output from forward_propagation_n()

    return:
        gradients - a dictionary with the cost gradients for each parameter, activation and pre-activation variable.
    """
    m = X.shape[1]
    (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) = cache

    dZ3 = A3 - Y
    dW3 = 1. / m * np.dot(dZ3, A2.T)
    db3 = 1. / m * np.sum(dZ3, axis=1, keepdims=True)

    dA2 = np.dot(W3.T, dZ3)
    dZ2 = np.multiply(dA2, np.int64(A2 > 0))
    # dW2 = 1. / m * np.dot(dZ2, A1.T) * 2  # Should not multiply by 2
    dW2 = 1. / m * np.dot(dZ2, A1.T)
    db2 = 1. / m * np.sum(dZ2, axis=1, keepdims=True)

    dA1 = np.dot(W2.T, dZ2)
    dZ1 = np.multiply(dA1, np.int64(A1 > 0))
    dW1 = 1. / m * np.dot(dZ1, X.T)
    # db1 = 4. / m * np.sum(dZ1, axis=1, keepdims=True) # Should not multiply by 4
    db1 = 1. / m * np.sum(dZ1, axis=1, keepdims=True)

    gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3,
                 "dA2": dA2, "dZ2": dZ2, "dW2": dW2, "db2": db2,
                 "dA1": dA1, "dZ1": dZ1, "dW1": dW1, "db1": db1}

    return gradients

# Multidimensional gradient check function
def gradient_check_n(parameters, gradients, X, Y, epsilon=1e-7):
    """
    Checks whether backward_propagation_n correctly computes the gradient of the cost output by forward_propagation_n.

    Parameters:
        parameters - python dictionary containing the parameters "W1", "b1", "W2", "b2", "W3", "b3"
        gradients - output of backward_propagation_n, contains the cost gradients associated with the parameters
        X  - input data point, of shape (number of input nodes, 1)
        Y  - label
        epsilon  - tiny shift of the input used to compute the approximate gradient

    return:
        difference - difference between the approximate gradient and the backward-propagation gradient
    """
    # Initialization parameters
    parameters_values, keys = gc_utils.dictionary_to_vector(parameters)  # keys is not used
    grad = gc_utils.gradients_to_vector(gradients)
    num_parameters = parameters_values.shape[0]
    J_plus = np.zeros((num_parameters, 1))
    J_minus = np.zeros((num_parameters, 1))
    gradapprox = np.zeros((num_parameters, 1))

    # Calculate gradapprox
    for i in range(num_parameters):
        # Calculate J_plus [i]. Input: "parameters_values, epsilon". Output = "J_plus [i]"
        thetaplus = np.copy(parameters_values)  # Step 1
        thetaplus[i][0] = thetaplus[i][0] + epsilon  # Step 2
        J_plus[i], cache = forward_propagation_n(X, Y, gc_utils.vector_to_dictionary(thetaplus))  # Step 3, the cache is not used

        # Calculate J_minus [i]. Input: "parameters_values, epsilon". Output = "J_minus [i]".
        thetaminus = np.copy(parameters_values)  # Step 1
        thetaminus[i][0] = thetaminus[i][0] - epsilon  # Step 2
        J_minus[i], cache = forward_propagation_n(X, Y, gc_utils.vector_to_dictionary(thetaminus))  # Step 3, the cache is not used

        # Calculate gradapprox[i]
        gradapprox[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)

    # Compare gradapprox with the backward-propagation gradient by computing the difference.
    numerator = np.linalg.norm(grad - gradapprox)  # Step 1'
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)  # Step 2'
    difference = numerator / denominator  # Step 3'

    if difference < 1e-7:
        print("Gradient check: the gradient is OK!")
    else:
        print(difference, "Gradient check: the gradient exceeds the threshold!")

    return difference

# Run the gradient check

#gradient_check_n_test_case comes from testCases in the data files
X, Y, parameters = gradient_check_n_test_case()

cost, cache = forward_propagation_n(X, Y, parameters)
gradients = backward_propagation_n(X, Y, cache)
difference = gradient_check_n(parameters, gradients, X, Y)

Summary

Gradient checking is slow! Approximating the gradient is computationally expensive, so we do not run it at every training iteration; we only run it a few times to confirm that the gradient is correct. Gradient checking does not work with dropout: you would normally run it without dropout to make sure back-propagation is correct, and then add dropout.

Topics: Python neural networks Deep Learning