PyTorch: freezing some layers during training so that they do not participate in training (no gradient updates)

Posted by naturalbeauty7 on Sun, 23 Jan 2022 09:37:02 +0100

First of all, we know that the parameters of a deep learning network are updated during back propagation by computing gradients, so that good parameter values are obtained. Sometimes, however, we want to fix the parameters of certain layers so that they do not participate in back propagation. For example, when fine-tuning, we want to keep the loaded pre-trained model's parameters fixed and only update the classifier in the last layer. How do we do that?
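For instance, a typical fine-tuning setup might look like the following minimal sketch (the torchvision ResNet-18 backbone and the 10-class head are illustrative assumptions added here, not part of the experiments below): the pre-trained backbone is frozen and only the newly created classifier remains trainable.

import torch.nn as nn
import torchvision.models as models

# Minimal fine-tuning sketch (assumes torchvision is installed)
backbone = models.resnet18(weights="IMAGENET1K_V1")  # on older torchvision: pretrained=True

# Freeze all pre-trained parameters
for param in backbone.parameters():
    param.requires_grad = False

# Replace the final classifier; the new layer's parameters have requires_grad=True by default
backbone.fc = nn.Linear(backbone.fc.in_features, 10)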

Define network

import torch
import torch.nn as nn
import torch.optim as optim

# Define a simple network
class net(nn.Module):
    def __init__(self, num_class=10):
        super(net, self).__init__()
        self.fc1 = nn.Linear(8, 4)
        self.fc2 = nn.Linear(4, num_class)

    def forward(self, x):
        return self.fc2(self.fc1(x))

Case 1: no layer is frozen

code

model = net()

# Case 1: when parameters are not frozen
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=1e-2)  # All parameters are passed in

# Model parameters before training
print("model.fc1.weight", model.fc1.weight)
print("model.fc2.weight", model.fc2.weight)

for epoch in range(10):
    x = torch.randn((3, 8))
    label = torch.randint(0,10,[3]).long()
    output = model(x)
    
    loss = loss_fn(output, label)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Model parameters after training
print("model.fc1.weight", model.fc1.weight)
print("model.fc2.weight", model.fc2.weight)

result

(bbn) jyzhang@admin2-X10DAi:~/test$ python -u "/home/jyzhang/test/net.py"
model.fc1.weight Parameter containing:
tensor([[ 0.3362, -0.2676, -0.3497, -0.3009, -0.1013, -0.2316, -0.0189,  0.1430],
        [-0.2486,  0.2900, -0.1818, -0.0942,  0.1445,  0.2410, -0.1407, -0.3176],
        [-0.3198,  0.2039, -0.2249,  0.2819, -0.3136, -0.2794, -0.3011, -0.2270],
        [ 0.3376, -0.0842,  0.2747, -0.0232,  0.0768,  0.3160, -0.1185,  0.2911]],
       requires_grad=True)
model.fc2.weight Parameter containing:
tensor([[ 0.4277,  0.0945,  0.1768,  0.3773],
        [-0.4595, -0.2447,  0.4701,  0.2873],
        [ 0.3281, -0.1861, -0.2202,  0.4413],
        [-0.1053, -0.1238,  0.0275, -0.0072],
        [-0.4448, -0.2787, -0.0280,  0.4629],
        [ 0.4063, -0.2091,  0.0706,  0.3216],
        [-0.2287, -0.1352, -0.0502,  0.3434],
        [-0.2946, -0.4074,  0.4926, -0.0832],
        [-0.2608,  0.0165,  0.0501, -0.1673],
        [ 0.2507,  0.3006,  0.0481,  0.2257]], requires_grad=True)
model.fc1.weight Parameter containing:
tensor([[ 0.3316, -0.2628, -0.3391, -0.2989, -0.0981, -0.2178, -0.0056,  0.1410],
        [-0.2529,  0.2991, -0.1772, -0.0992,  0.1447,  0.2480, -0.1370, -0.3186],
        [-0.3246,  0.2055, -0.2229,  0.2745, -0.3158, -0.2750, -0.2994, -0.2295],
        [ 0.3366, -0.0877,  0.2693, -0.0182,  0.0807,  0.3117, -0.1184,  0.2946]],
       requires_grad=True)
model.fc2.weight Parameter containing:
tensor([[ 0.4189,  0.0985,  0.1723,  0.3804],
        [-0.4593, -0.2356,  0.4772,  0.2784],
        [ 0.3269, -0.1874, -0.2173,  0.4407],
        [-0.1061, -0.1248,  0.0309, -0.0062],
        [-0.4322, -0.2868, -0.0319,  0.4647],
        [ 0.4048, -0.2150,  0.0692,  0.3228],
        [-0.2252, -0.1353, -0.0433,  0.3396],
        [-0.2936, -0.4118,  0.4875, -0.0782],
        [-0.2625,  0.0192,  0.0509, -0.1670],
        [ 0.2474,  0.3056,  0.0418,  0.2265]], requires_grad=True)

conclusion

When no layer is frozen, the parameters of every learnable layer in the model change as training progresses.
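A quick way to confirm this numerically (a small sketch, not part of the original script) is to clone the weights before the training loop and compare them afterwards:

# Sketch: snapshot the weights before the training loop above, compare after it finishes
fc1_before = model.fc1.weight.detach().clone()
fc2_before = model.fc2.weight.detach().clone()
# ... run the training loop ...
print(torch.equal(fc1_before, model.fc1.weight))  # False: fc1 changed
print(torch.equal(fc2_before, model.fc2.weight))  # False: fc2 changed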

Case 2: freezing the fc1 layer with Mode 1

Mode 1

  1. Pass all of the model's parameters to the optimizer

    optimizer = optim.SGD(model.parameters(), lr=1e-2)  # Pass all parameters to the optimizer

  2. Set requires_grad of the parameters of the layer to be frozen to False

    for name, param in model.named_parameters():
        if "fc1" in name:
            param.requires_grad = False


code

# Case 2: freeze the fc1 layer using Mode 1
model = net()
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=1e-2)  # Pass all parameters to the optimizer

# Model parameters before training
print("model.fc1.weight", model.fc1.weight)
print("model.fc2.weight", model.fc2.weight)

# Freeze fc1 layer parameters
for name, param in model.named_parameters():
    if "fc1" in name:
        param.requires_grad = False

for epoch in range(10):
    x = torch.randn((3, 8))
    label = torch.randint(0,10,[3]).long()
    output = model(x)
 
    loss = loss_fn(output, label)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print("model.fc1.weight", model.fc1.weight)
print("model.fc2.weight", model.fc2.weight)

result

(bbn) jyzhang@admin2-X10DAi:~/test$ python -u "/home/jyzhang/test/net.py"
model.fc1.weight Parameter containing:
tensor([[ 0.3163, -0.1592, -0.2360,  0.1436,  0.1158,  0.0406, -0.0627,  0.0566],
        [-0.1688,  0.3519,  0.2464, -0.2693,  0.1284,  0.0544, -0.0188,  0.2404],
        [ 0.0738,  0.2013,  0.0868,  0.1396, -0.2885,  0.3431, -0.1109,  0.2549],
        [ 0.1222, -0.1877,  0.3511,  0.1951,  0.2147, -0.0427, -0.3374, -0.0653]],
       requires_grad=True)
model.fc2.weight Parameter containing:
tensor([[-0.1830, -0.3147, -0.1698,  0.3235],
        [-0.1347,  0.3096,  0.4895,  0.1221],
        [ 0.2735, -0.2238,  0.4713, -0.0683],
        [-0.3150, -0.1905,  0.3645,  0.3766],
        [-0.0340,  0.3212,  0.0650,  0.1380],
        [-0.2500,  0.1128, -0.3338, -0.4151],
        [ 0.0446, -0.4776, -0.3655,  0.0822],
        [-0.1871, -0.0602, -0.4855, -0.3604],
        [-0.3296,  0.0523, -0.3424,  0.2151],
        [-0.2478,  0.1424,  0.4547, -0.1969]], requires_grad=True)
model.fc1.weight Parameter containing:
tensor([[ 0.3163, -0.1592, -0.2360,  0.1436,  0.1158,  0.0406, -0.0627,  0.0566],
        [-0.1688,  0.3519,  0.2464, -0.2693,  0.1284,  0.0544, -0.0188,  0.2404],
        [ 0.0738,  0.2013,  0.0868,  0.1396, -0.2885,  0.3431, -0.1109,  0.2549],
        [ 0.1222, -0.1877,  0.3511,  0.1951,  0.2147, -0.0427, -0.3374, -0.0653]])
model.fc2.weight Parameter containing:
tensor([[-0.1821, -0.3155, -0.1637,  0.3213],
        [-0.1353,  0.3130,  0.4807,  0.1245],
        [ 0.2731, -0.2206,  0.4687, -0.0718],
        [-0.3138, -0.1925,  0.3561,  0.3809],
        [-0.0344,  0.3152,  0.0606,  0.1332],
        [-0.2501,  0.1154, -0.3267, -0.4137],
        [ 0.0400, -0.4723, -0.3586,  0.0808],
        [-0.1823, -0.0667, -0.4854, -0.3543],
        [-0.3285,  0.0547, -0.3388,  0.2166],
        [-0.2497,  0.1410,  0.4551, -0.2008]], requires_grad=True)

conclusion

The experimental results show that even though all of the model's parameters are passed to the optimizer, once requires_grad is set to False for a layer, only the parameters of layers with requires_grad = True are updated.
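A related detail worth checking (a sketch, assuming fc1 was frozen before the first backward() call, as in the code above): parameters with requires_grad = False never receive a gradient at all, so their .grad attribute stays None.

# Run after the training loop above: frozen parameters have no gradient buffer
for name, param in model.named_parameters():
    print(name, param.requires_grad, param.grad is None)
# Expected: fc1.* -> requires_grad=False, grad is None
#           fc2.* -> requires_grad=True,  grad is a tensor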

Case 3: freezing the fc1 layer with Mode 2

Mode 2

  1. Pass only the parameters of the unfrozen fc2 layer to the optimizer

    optimizer = optim.SGD(model.fc2.parameters(), lr=1e-2)  # Pass only fc2's parameters to the optimizer

  2. Note: requires_grad of the parameters of the layer to be frozen is not set to False

code

# Case 3: freeze the fc1 layer using Mode 2
model = net()
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.fc2.parameters(), lr=1e-2)  # Pass only fc2's parameters to the optimizer
print("model.fc1.weight", model.fc1.weight)
print("model.fc2.weight", model.fc2.weight)

for epoch in range(10):
    x = torch.randn((3, 8))
    label = torch.randint(0,3,[3]).long()
    output = model(x)
 
    loss = loss_fn(output, label)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
 
print("model.fc1.weight", model.fc1.weight)
print("model.fc2.weight", model.fc2.weight)

result

model.fc1.weight Parameter containing:
tensor([[ 0.2519, -0.1772, -0.2229,  0.0711, -0.1681,  0.1233, -0.3217, -0.0412],
        [ 0.2032, -0.2045,  0.2723,  0.3272,  0.1034,  0.1519, -0.0587, -0.3436],
        [ 0.0470,  0.2379,  0.0590,  0.2400,  0.2280,  0.2045, -0.0229, -0.3484],
        [-0.3023, -0.1195,  0.1792, -0.2173, -0.0492,  0.2640, -0.3511, -0.2845]],
       requires_grad=True)
model.fc2.weight Parameter containing:
tensor([[-0.3263, -0.2938, -0.3516, -0.4578],
        [-0.4549, -0.0060,  0.4696, -0.0174],
        [-0.4841,  0.2861,  0.2658,  0.4483],
        [-0.3093,  0.0977, -0.2735,  0.1033],
        [-0.2421,  0.4489, -0.4649,  0.0110],
        [-0.3671,  0.0182, -0.1027, -0.4441],
        [ 0.0205, -0.0659,  0.4183, -0.2068],
        [-0.1846,  0.1741, -0.2302, -0.1745],
        [-0.3423, -0.2642,  0.2796,  0.4976],
        [-0.0770, -0.3766, -0.0512, -0.2105]], requires_grad=True)
model.fc1.weight Parameter containing:
tensor([[ 0.2519, -0.1772, -0.2229,  0.0711, -0.1681,  0.1233, -0.3217, -0.0412],
        [ 0.2032, -0.2045,  0.2723,  0.3272,  0.1034,  0.1519, -0.0587, -0.3436],
        [ 0.0470,  0.2379,  0.0590,  0.2400,  0.2280,  0.2045, -0.0229, -0.3484],
        [-0.3023, -0.1195,  0.1792, -0.2173, -0.0492,  0.2640, -0.3511, -0.2845]],
       requires_grad=True)
model.fc2.weight Parameter containing:
tensor([[-0.3253, -0.2973, -0.3707, -0.4560],
        [-0.4566,  0.0015,  0.4655, -0.0166],
        [-0.4796,  0.2931,  0.2592,  0.4661],
        [-0.3097,  0.0966, -0.2695,  0.1002],
        [-0.2433,  0.4455, -0.4587,  0.0063],
        [-0.3669,  0.0171, -0.0988, -0.4452],
        [ 0.0198, -0.0679,  0.4203, -0.2088],
        [-0.1854,  0.1717, -0.2241, -0.1781],
        [-0.3429, -0.2653,  0.2822,  0.4938],
        [-0.0773, -0.3765, -0.0464, -0.2127]], requires_grad=True)

conclusion

When only the parameters of the layers to be updated are passed to the optimizer, only those parameters are updated. For the parameters that are not passed in, gradients are still computed, but the parameters are never updated.
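One consequence worth noting (a sketch, not from the original post): because fc1 still has requires_grad = True, its gradients are computed on every backward() and keep accumulating, since optimizer.zero_grad() only clears the parameters that were handed to the optimizer (here, fc2).

# After the Mode 2 training loop above:
print(model.fc1.weight.grad is None)  # False: a gradient was computed and accumulated anyway
print(model.fc2.weight.grad is None)  # False
# model.zero_grad() would clear the gradients of all parameters, including fc1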

Comparison and summary of Mode 1 and Mode 2

During training, it is sometimes necessary to fix some of the model's parameters and update only the rest.

There are two ways to achieve this: set requires_grad of the layers whose parameters should not be updated to False, or pass only the parameters to be updated when defining the optimizer.

As a best practice, pass only the parameters with requires_grad = True to the optimizer; this occupies less memory and is more efficient.

Recommended approach

Set requires_grad of the parameters that will not be updated to False, and do not pass those parameters to the optimizer:

  1. Set requires_grad of the parameters that will not be updated to False

    # Freeze fc1 layer parameters
    for name, param in model.named_parameters():
        if "fc1" in name:
            param.requires_grad = False

  2. Do not pass the parameters that will not be updated to the optimizer

    # Use a filter to pass in only the parameters with requires_grad = True
    optimizer = optim.SGD(filter(lambda p : p.requires_grad, model.parameters()), lr=1e-2)


code

# Recommended approach
model = net()
loss_fn = nn.CrossEntropyLoss()

# Model parameters before training
print("model.fc1.weight", model.fc1.weight)
print("model.fc2.weight", model.fc2.weight)
print("model.fc1.weight.requires_grad:", model.fc1.weight.requires_grad)
print("model.fc2.weight.requires_grad:", model.fc2.weight.requires_grad)

# Freeze fc1 layer parameters
for name, param in model.named_parameters():
    if "fc1" in name:
        param.requires_grad = False

optimizer = optim.SGD(filter(lambda p : p.requires_grad, model.parameters()), lr=1e-2)  # Use a filter to pass in only the parameters with requires_grad = True

for epoch in range(10):
    x = torch.randn((3, 8))
    label = torch.randint(0,3,[3]).long()
    output = model(x)
 
    loss = loss_fn(output, label)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print("model.fc1.weight", model.fc1.weight)
print("model.fc2.weight", model.fc2.weight)
print("model.fc1.weight.requires_grad:", model.fc1.weight.requires_grad)
print("model.fc2.weight.requires_grad:", model.fc2.weight.requires_grad)

result

(bbn) jyzhang@admin2-X10DAi:~/test$ python -u "/home/jyzhang/test/net.py"
model.fc1.weight Parameter containing:
tensor([[-0.1193,  0.2354,  0.2520,  0.1187,  0.2699, -0.2301,  0.1622, -0.0478],
        [-0.2862, -0.1716,  0.2865,  0.2615, -0.2205, -0.2046, -0.0983, -0.1564],
        [-0.3143, -0.2248,  0.2198,  0.2338,  0.1184, -0.2033, -0.3418,  0.1434],
        [ 0.3107, -0.0411, -0.3016,  0.1924, -0.1756, -0.2881,  0.0528, -0.0444]],
       requires_grad=True)
model.fc2.weight Parameter containing:
tensor([[-0.2548,  0.2107, -0.1293, -0.2562],
        [-0.1989, -0.2624,  0.2226,  0.4861],
        [-0.1501,  0.2516,  0.4311, -0.1650],
        [ 0.0334, -0.0963, -0.1731,  0.1706],
        [ 0.2451, -0.2102,  0.0499,  0.0497],
        [-0.1464, -0.2973,  0.3692,  0.0523],
        [ 0.1192,  0.3575, -0.1911,  0.1457],
        [-0.0990,  0.2059,  0.2072, -0.2013],
        [-0.4397,  0.4036, -0.3402, -0.0417],
        [ 0.0379,  0.0128, -0.3212, -0.0867]], requires_grad=True)
model.fc1.weight.requires_grad: True
model.fc2.weight.requires_grad: True
model.fc1.weight Parameter containing:
tensor([[-0.1193,  0.2354,  0.2520,  0.1187,  0.2699, -0.2301,  0.1622, -0.0478],
        [-0.2862, -0.1716,  0.2865,  0.2615, -0.2205, -0.2046, -0.0983, -0.1564],
        [-0.3143, -0.2248,  0.2198,  0.2338,  0.1184, -0.2033, -0.3418,  0.1434],
        [ 0.3107, -0.0411, -0.3016,  0.1924, -0.1756, -0.2881,  0.0528, -0.0444]])
model.fc2.weight Parameter containing:
tensor([[-0.2637,  0.2073, -0.1293, -0.2422],
        [-0.2027, -0.2641,  0.2152,  0.4897],
        [-0.1543,  0.2504,  0.4188, -0.1576],
        [ 0.0356, -0.0947, -0.1698,  0.1669],
        [ 0.2474, -0.2081,  0.0536,  0.0456],
        [-0.1445, -0.2962,  0.3708,  0.0500],
        [ 0.1219,  0.3574, -0.1876,  0.1404],
        [-0.0961,  0.2058,  0.2091, -0.2046],
        [-0.4368,  0.4039, -0.3376, -0.0450],
        [ 0.0398,  0.0143, -0.3181, -0.0897]], requires_grad=True)
model.fc1.weight.requires_grad: False
model.fc2.weight.requires_grad: True

conclusion

The recommended approach saves GPU memory and improves speed:

  1. Saves GPU memory: parameters that will not be updated are not passed to the optimizer
  2. Improves speed: requires_grad of the parameters that will not be updated is set to False, which skips computing gradients for those parameters
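
As a final sanity check (a small sketch added here, not part of the original post), you can count how many parameters are actually trainable after freezing:

# Count trainable vs. frozen parameters to verify that the freezing worked
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
frozen = sum(p.numel() for p in model.parameters() if not p.requires_grad)
print(f"trainable: {trainable}, frozen: {frozen}")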

Topics: Pytorch