Computer vision --- image classification practice + theoretical explanation

Posted by werkkrew on Mon, 13 Dec 2021 10:39:36 +0100

Deep residual network (ResNet)

What are the innovations of the deep residual network?

  1. A residual structure is proposed, which makes it possible to build ultra-deep networks.
  2. Batch Normalization is used so that the feature maps of each batch
    follow a distribution with a mean of 0 and a variance of 1.
  3. 1x1 convolutions that reduce and then restore the channel dimension (a bottleneck, similar to the 1x1 convolutions of the Inception structure) are added to the residual block, which greatly reduces the number of parameters of ultra-deep networks.

Degradation problem: as the network gets deeper, vanishing and exploding gradients can occur, and beyond a certain depth the accuracy saturates and then degrades instead of improving.

The structure of residual network is shown in the figure

Unlike earlier network structures, a shortcut connection is added that sums the block's input with its output. If the mapping the layers should learn is H(x), the residual network instead fits another mapping, F(x) := H(x) - x, so the original mapping becomes F(x) + x. Optimizing the residual mapping F(x) is easier than optimizing the original mapping H(x). It is equivalent to removing the redundant (already known) features and keeping only the distinctive ones, which makes classification easier.

Example: overall structure diagram of the 34-layer network:


Building blocks of the residual network




The structure is relatively simple: in code, the network is built by stacking these blocks.

import torch.nn as nn
import torch
#This structure is a residual structure without dotted lines,
class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions (used by the 18- and 34-layer networks).

    This block involves no 1x1 channel reduction/expansion; the main branch is
    conv3x3 -> BN -> ReLU -> conv3x3 -> BN, and the shortcut is added before
    the final ReLU.

    Args:
        in_channel: number of channels of the input feature map.
        out_channel: number of channels produced by both convolutions.
        stride: stride of the first convolution.
        downsample: optional module applied to the input on the shortcut
            branch so that it can be added to the main branch; ``None`` means
            the input is added unchanged.
        **kwargs: accepted and ignored, so that the block can be built with
            the same keywords (``groups``, ``width_per_group``) that
            ``ResNet._make_layer`` passes to every block type.
    """

    # Output channels = out_channel * expansion.
    expansion = 1

    def __init__(self, in_channel, out_channel, stride=1, downsample=None, **kwargs):
        super(BasicBlock, self).__init__()
        # kernel 3 / padding 1 keeps the spatial size when stride is 1.
        self.conv1 = nn.Conv2d(in_channels=in_channel, out_channels=out_channel,
                               kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channel)
        self.relu = nn.ReLU()
        # Second convolution always uses stride 1, so the size is unchanged.
        self.conv2 = nn.Conv2d(in_channels=out_channel, out_channels=out_channel,
                               kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channel)
        self.downsample = downsample

    def forward(self, x):
        """Return ``relu(main_branch(x) + shortcut(x))``."""
        identity = x
        if self.downsample is not None:
            identity = self.downsample(x)

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        out += identity
        out = self.relu(out)
        return out

class Bottleneck(nn.Module):
    """Bottleneck residual block: 1x1 conv -> 3x3 conv -> 1x1 conv, plus shortcut.

    The block outputs ``out_channel * expansion`` channels. The width of the
    two inner convolutions is
    ``int(out_channel * (width_per_group / 64.)) * groups``.

    Args:
        in_channel: number of channels of the input feature map.
        out_channel: base channel count; the block outputs
            ``out_channel * expansion`` channels.
        stride: stride of the 3x3 convolution.
        downsample: optional module applied to the input on the shortcut
            branch so that it can be added to the main branch; ``None`` means
            the input is added unchanged.
        groups: ``groups`` argument of the 3x3 convolution.
        width_per_group: channels per group, relative to a base of 64.
    """

    expansion = 4

    def __init__(self, in_channel, out_channel, stride=1, downsample=None,
                 groups=1, width_per_group=64):
        super(Bottleneck, self).__init__()
        width = int(out_channel * (width_per_group / 64.)) * groups

        self.conv1 = nn.Conv2d(in_channels=in_channel, out_channels=width,
                               kernel_size=1, stride=1, bias=False)
        self.bn1 = nn.BatchNorm2d(width)
        # The stride (if any) is applied by the 3x3 convolution.
        self.conv2 = nn.Conv2d(in_channels=width, out_channels=width, groups=groups,
                               kernel_size=3, stride=stride, bias=False, padding=1)
        self.bn2 = nn.BatchNorm2d(width)
        self.conv3 = nn.Conv2d(in_channels=width, out_channels=out_channel * self.expansion,
                               kernel_size=1, stride=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_channel * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample

    def forward(self, x):
        """Return ``relu(main_branch(x) + shortcut(x))``."""
        identity = x
        if self.downsample is not None:
            # Shortcut branch: the supplied module transforms the input so it
            # can be summed with the output of the main branch.
            identity = self.downsample(x)

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        out += identity
        out = self.relu(out)
        return out

class ResNet(nn.Module):
    """ResNet / ResNeXt network: a convolutional stem followed by four stages of residual blocks.

    Args:
        block: residual block class. It must expose an ``expansion`` class
            attribute and be constructible as ``block(in_channel, channel,
            downsample=..., stride=..., groups=..., width_per_group=...)``.
        block_num: sequence of four ints, the number of blocks in each stage.
        num_classes: output size of the final fully connected layer.
        include_top: when False, the average pool and the fully connected
            layer are not created and ``forward`` returns the output of the
            last stage.
        groups: forwarded to every block.
        width_per_group: forwarded to every block.
    """

    def __init__(self, block, block_num, num_classes=1000, include_top=True,
                 groups=1, width_per_group=64):
        super(ResNet, self).__init__()
        self.include_top = include_top
        # Channel count fed to the next block; updated by _make_layer.
        self.in_channel = 64
        self.groups = groups
        self.width_per_group = width_per_group

        self.conv1 = nn.Conv2d(3, self.in_channel, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(self.in_channel)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layer1 = self._make_layer(block, 64, block_num[0])
        self.layer2 = self._make_layer(block, 128, block_num[1], stride=2)
        self.layer3 = self._make_layer(block, 256, block_num[2], stride=2)
        self.layer4 = self._make_layer(block, 512, block_num[3], stride=2)

        if self.include_top:
            self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
            self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')

    def _make_layer(self, block, channel, block_num, stride=1):
        """Build one stage of ``block_num`` blocks and return it as ``nn.Sequential``.

        Only the first block receives ``stride`` and the ``downsample`` module;
        the remaining blocks use the block's defaults.
        """
        downsample = None
        # The shortcut needs a 1x1 conv + BN whenever the first block changes
        # the spatial size (stride != 1) or the channel count.
        if stride != 1 or self.in_channel != channel * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.in_channel, channel * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(channel * block.expansion),
            )

        layers = [block(self.in_channel, channel,
                        downsample=downsample,
                        stride=stride,
                        groups=self.groups,
                        width_per_group=self.width_per_group)]
        # Every following block receives the previous block's output, which has
        # channel * expansion channels; `channel` stays the block's base width.
        self.in_channel = channel * block.expansion

        for _ in range(1, block_num):
            layers.append(block(self.in_channel, channel,
                                groups=self.groups,
                                width_per_group=self.width_per_group))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Run the stem and the four stages; apply pooling and the classifier if ``include_top``."""
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        if self.include_top:
            x = self.avgpool(x)
            x = torch.flatten(x, 1)
            x = self.fc(x)
        return x


def resnet34(num_classes=1000, include_top=True):
    """Build a ResNet from ``BasicBlock`` with stage sizes [3, 4, 6, 3]."""
    return ResNet(BasicBlock, [3, 4, 6, 3], num_classes=num_classes, include_top=include_top)


def resnet50(num_classes=1000, include_top=True):
    """Build a ResNet from ``Bottleneck`` with stage sizes [3, 4, 6, 3]."""
    return ResNet(Bottleneck, [3, 4, 6, 3], num_classes=num_classes, include_top=include_top)


def resnet101(num_classes=1000, include_top=True):
    """Build a ResNet from ``Bottleneck`` with stage sizes [3, 4, 23, 3]."""
    return ResNet(Bottleneck, [3, 4, 23, 3], num_classes=num_classes, include_top=include_top)


def resnext50_32x4d(num_classes=1000, include_top=True):
    """Build a ``Bottleneck`` network with stage sizes [3, 4, 6, 3], groups=32 and width_per_group=4."""
    groups = 32
    width_per_group = 4
    return ResNet(Bottleneck, [3, 4, 6, 3],
                  num_classes=num_classes,
                  include_top=include_top,
                  groups=groups,
                  width_per_group=width_per_group)


def resnext101_32x8d(num_classes=1000, include_top=True):
    """Build a ``Bottleneck`` network with stage sizes [3, 4, 23, 3], groups=32 and width_per_group=8."""
    groups = 32
    width_per_group = 8
    return ResNet(Bottleneck, [3, 4, 23, 3],
                  num_classes=num_classes,
                  include_top=include_top,
                  groups=groups,
                  width_per_group=width_per_group)

Training model
Basic process of code training:

  1. Import packages
    1.1 Basics: torch, torch.nn and torch.optim (layers and optimizers).
    1.2 Training-data preprocessing: transforms from torchvision.
    1.3 Dataset handling: datasets from torchvision; ImageFolder reads and processes the images.
    1.4 The model: from model import resnet34
    1.5 Paths and files: import os, import json

  2. Get the training device
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

  3. Preprocess the training and validation data with transforms.Compose
    train
    3.1 normalization: transforms.Normalize([0.5,0.5,0.5],[0.5,0.5,0.5])
    3.2 random crop: transforms.RandomResizedCrop(224)
    3.3 horizontal flip: transforms.RandomHorizontalFlip()
    3.4 conversion to tensor: transforms.ToTensor()
    test
    3.1 resize: transforms.Resize(224)
    3.2 center crop: transforms.CenterCrop(224)
    3.3 normalization: transforms.Normalize([0.5,0.5,0.5],[0.5,0.5,0.5])
    3.4 conversion to tensor: transforms.ToTensor()
    Note: inside Compose, ToTensor() has to come before Normalize(), because Normalize() operates on tensors (see the code below).
    data_transform={
    "train":transforms.Compose([transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize([0.5,0.5,0.5],[0.5,0.5,0.5])
    ]),
    "val":transforms.Compose([transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.5,0.5,0.5],[0.5,0.5,0.5])
    ])
    }

  4. Read the images with datasets.ImageFolder; its parameters are root and transform
    train_dataset=datasets.ImageFolder(root=os.path.join(image_path,"train"),transform=data_transform["train"])

  5. Wrap the train_dataset that was read in a DataLoader, which turns it into batches of data
    train_loader=torch.utils.data.DataLoader(train_dataset,batch_size=batch_size,shuffle=True,num_workers=nw)
    Once the training data has been read and preprocessed, it is fed to the network for training

  6. net=resnet34()

  7. Load pretrained weights (transfer learning)
    model_weight_path="./resnet34.pth"
    net.load_state_dict(torch.load(model_weight_path,map_location=device))

  8. Define loss function
    loss_function=nn.CrossEntropyLoss()

  9. Create the optimizer and pass in the parameters to be optimized
    params=[p for p in net.parameters() if p.requires_grad]
    optimizer=optim.Adam(params,lr=0.001)

  10. Start training: set the number of training rounds and the path where the best weights are saved, and validate after each training round
    tqdm is a fast, extensible Python progress bar. It adds a progress display to long-running Python loops; you only need to wrap any iterator as tqdm(iterator).
    net.train()
    Zero the gradients held by the optimizer
    optimizer.zero_grad()

  11. Forward propagation
    logits=net(image.to(device))

  12. Calculation loss error
    loss=loss_function(logits,labels.to(device))

  13. Back propagation
    loss.backward()

  14. Parameter update
    optimizer.step()

  15. data validation
    net.eval()

  16. According to the obtained results, the accuracy is calculated
    predict_y=torch.max(outputs,dim=1)[1]
    acc+=torch.eq(predict_y,val_labels.to(device)).sum().item()

  17. Save the best weights
    if val_accurate>best_acc:
        best_acc=val_accurate
        torch.save(net.state_dict(),save_path)
    Testing
    Batch prediction is performed with the weights saved in the previous step; batch prediction is simply a batched forward pass.

import os 
import json
import torch
from PIL import Image
from torchvision import transforms
from model import resnet34

def main(imgs_root="/.data/imgs", json_path='./class_indices.json',
         weights_path="./resnet34.pth", batch_size=8):
    """Classify every ``.jpg`` image in a folder in batches and print the predictions.

    Args:
        imgs_root: folder containing the images to classify.
            NOTE(review): the default is kept exactly as published; it looks
            like a typo for a path such as "./data/imgs" - confirm before use.
        json_path: JSON file mapping the class index (as a string) to the
            class name.
        weights_path: file holding the state dict loaded into the model.
        batch_size: number of images sent through the network at once.

    Raises:
        AssertionError: if the image folder, the JSON file, the weights file
            or an individual image does not exist.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    data_transform = transforms.Compose([
        transforms.Resize(224),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
    ])

    # Folder with the images to predict in batches.
    assert os.path.exists(imgs_root), f"file:'{imgs_root}'dose not exist."
    img_path_list = [os.path.join(imgs_root, i) for i in os.listdir(imgs_root) if i.endswith(".jpg")]

    assert os.path.exists(json_path), f"file:'{json_path}' dose not exist."
    # Context manager so the file handle is closed after loading.
    with open(json_path, "r") as json_file:
        class_indict = json.load(json_file)

    model = resnet34(num_classes=5).to(device)
    assert os.path.exists(weights_path), f"file:'{weights_path}'dose not exist."
    model.load_state_dict(torch.load(weights_path, map_location=device))
    model.eval()

    with torch.no_grad():
        # Step through the list batch_size images at a time; the last batch may
        # be smaller, so no trailing images are skipped.
        for start in range(0, len(img_path_list), batch_size):
            batch_paths = img_path_list[start:start + batch_size]
            img_list = []
            for img_path in batch_paths:
                assert os.path.exists(img_path), f"file:'{img_path}'dose not exist."
                img = Image.open(img_path)
                img = data_transform(img)
                img_list.append(img)

            # Pack the individual image tensors into one batch tensor.
            batch_img = torch.stack(img_list, dim=0)
            output = model(batch_img.to(device)).cpu()
            predict = torch.softmax(output, dim=1)
            # Highest class probability and its index for every image.
            probs, classes = torch.max(predict, dim=1)

            for idx, (pro, cla) in enumerate(zip(probs, classes)):
                print('image:{} class:{} prob:{:.3}'.format(batch_paths[idx],
                                                            class_indict[str(cla.numpy())],
                                                            pro.numpy()))


if __name__ == '__main__':
    main()
    

Topics: Algorithm Computer Vision Deep Learning