[PyTorch] 13 Image Captioning: letting a neural network read pictures and tell stories

Posted by Syranide on Tue, 25 Jan 2022 10:54:37 +0100

1. Data set acquisition

Data from: AI challenger 2017 image description dataset
Baidu online disk: https://pan.baidu.com/s/1g1XaPKzNvOurH9M44p1qrw Extraction code: bag3

Since the original training set is too large, only the validation set ai_challenger_caption_validation_20170910.zip is used here; unzip it.

2. Text data processing

The data for the Chinese image-captioning competition comes in two parts: 30,000 images and the corresponding descriptions in caption_validation_annotations_20170910.json. Each sample has the following format:

[{"url": "http://img5.cache.netease.com/photo/0005/2013-09-25/99LA1FC60B6P0005.jpg", "image_id": "3cd32bef87ed98572bac868418521852ac3f6a70.jpg", "caption": ["\u4e00\u4e2a\u53cc\u81c2\u62ac\u8d77\u7684\u8fd0\u52a8\u5458\u8dea\u5728\u7eff\u8335\u8335\u7684\u7403\u573a\u4e0a", "\u4e00\u4e2a\u62ac\u7740\u53cc\u81c2\u7684\u8fd0\u52a8\u5458\u8dea\u5728\u8db3\u7403\u573a\u4e0a", "\u4e00\u4e2a\u53cc\u624b\u63e1\u62f3\u7684\u7537\u4eba\u8dea\u5728\u7eff\u8335\u8335\u7684\u8db3\u7403\u573a\u4e0a", "\u4e00\u4e2a\u62ac\u8d77\u53cc\u624b\u7684\u7537\u4eba\u8dea\u5728\u78a7\u7eff\u7684\u7403\u573a\u4e0a", "\u4e00\u4e2a\u53cc\u624b\u63e1\u62f3\u7684\u8fd0\u52a8\u5458\u8dea\u5728\u5e73\u5766\u7684\u8fd0\u52a8\u573a\u4e0a"]}, ...

"An athlete with his arms raised knelt on the green pitch", "A player with arms raised knelt on the football field", "A man with clenched hands knelt on the green football field", "A man with his hands up knelt on the Green Court", "A boxer knelt on the flat playground"

Characteristics of these descriptions:

  • The sentences vary in length
  • The descriptions stay objective and do not rely on much outside knowledge
  • They try to point out the relationships between the people in the picture as much as possible

A preprocessed version of the captions can be downloaded directly here; the preprocessing includes:

  • Chinese word segmentation with jieba (a small example follows this list)
  • Building word2ix and filtering out low-frequency words
  • Padding all descriptions to the same length (pad_sequence)
  • Using pack_padded_sequence to speed up the computation
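As a small example of the segmentation step, here is a sketch (assuming jieba is installed; the vocabulary below is a toy one for illustration, the real word2ix is built by the script further down):

import jieba

caption = '一个双臂抬起的运动员跪在绿茵茵的球场上'
tokens = list(jieba.cut(caption, cut_all=False))
print(tokens)   # something like ['一个', '双臂', '抬起', '的', ...]; the exact split depends on jieba's dictionary

# Toy vocabulary lookup; out-of-vocabulary words fall back to the '</UNKNOWN>' token
word2ix = {'</UNKNOWN>': 0, '一个': 4, '的': 3}
indices = [word2ix.get(w, word2ix['</UNKNOWN>']) for w in tokens]
print(indices)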

However, that caption.pth cannot be used here, because only the original validation set is used; instead, process the captions with the code provided in the book:

# coding:utf8
import torch as t
import numpy as np
import json
import jieba
import tqdm


class Config:
    annotation_file = r'... your path\ai_challenger_caption_validation_20170910\caption_validation_annotations_20170910.json'
    unknown = '</UNKNOWN>'
    end = '</EOS>'
    padding = '</PAD>'
    max_words = 5000
    min_appear = 2
    save_path = r'... your path\ai_challenger_caption_validation_20170910\caption_2.pth'


# START='</START>'
# MAX_LENS = 25,

def process(**kwargs):
    opt = Config()
    for k, v in kwargs.items():
        setattr(opt, k, v)

    with open(opt.annotation_file) as f:
        data = json.load(f)

    # 8f00f3d0f1008e085ab660e70dffced16a8259f6.jpg -> 0
    id2ix = {item['image_id']: ix for ix, item in enumerate(data)}
    # 0-> 8f00f3d0f1008e085ab660e70dffced16a8259f6.jpg
    ix2id = {ix: id for id, ix in (id2ix.items())}
    assert id2ix[ix2id[10]] == 10

    captions = [item['caption'] for item in data]
    # Word segmentation result
    cut_captions = [[list(jieba.cut(ii, cut_all=False)) for ii in item] for item in tqdm.tqdm(captions)]

    word_nums = {}  # word -> count, e.g. 'happy' -> 10000 (occurrences)

    def update(word_nums):
        def fun(word):
            word_nums[word] = word_nums.get(word, 0) + 1
            return None

        return fun

    lambda_ = update(word_nums)
    _ = {lambda_(word) for sentences in cut_captions for sentence in sentences for word in sentence}

    # [(10000, u'happy'), (9999, u'happy'), ...] sorted by frequency
    word_nums_list = sorted([(num, word) for word, num in word_nums.items()], reverse=True)

    #### The operations above are lossless and reversible ###############################
    # ********** Some information will be discarded below **********

    # 1. Discard words whose frequency is too low
    # 2. ~~Discard words that are too long~~ (not done here)

    words = [word[1] for word in word_nums_list[:opt.max_words] if word[0] >= opt.min_appear]
    words = [opt.unknown, opt.padding, opt.end] + words
    word2ix = {word: ix for ix, word in enumerate(words)}
    ix2word = {ix: word for word, ix in word2ix.items()}
    assert word2ix[ix2word[123]] == 123

    ix_captions = [[[word2ix.get(word, word2ix.get(opt.unknown)) for word in sentence]
                    for sentence in item]
                   for item in cut_captions]
    readme = u"""
    word: the words themselves
    ix: index
    id: image file name
    caption: segmented descriptions stored as indices; the original Chinese words can be recovered via ix2word
    """
    results = {
        'caption': ix_captions,
        'word2ix': word2ix,
        'ix2word': ix2word,
        'ix2id': ix2id,
        'id2ix': id2ix,
        'padding': '</PAD>',
        'end': '</EOS>',
        'readme': readme
    }
    t.save(results, opt.save_path)
    print('save file in %s' % opt.save_path)

    def test(ix, ix2=4):
        results = t.load(opt.save_path)
        ix2word = results['ix2word']
        examples = results['caption'][ix][ix2]
        sentences_p = (''.join([ix2word[ii] for ii in examples]))
        sentences_r = data[ix]['caption'][ix2]
        assert sentences_p == sentences_r, 'test failed'

    test(1000)
    print('test success')


if __name__ == '__main__':
    process()

This produces a caption_2.pth file. An example of using it:

import torch

data = torch.load(r'... your path\ai_challenger_caption_validation_20170910\caption_2.pth')
ix2word = data['ix2word']
ix2id = data['ix2id']
caption = data['caption']

img_ix = 0
img_caption = caption[img_ix]

print(ix2id[img_ix])
print(img_caption)

sen = img_caption[0]
words = [ix2word[_] for _ in sen]
sentence = ''.join(words)
print(sentence)

Output:

3cd32bef87ed98572bac868418521852ac3f6a70.jpg
[[4, 178, 79, 3, 47, 159, 5, 112, 3, 20], [4, 176, 178, 3, 47, 159, 5, 64, 6], [4, 19, 361, 3, 7, 159, 5, 112, 3, 64, 6], [4, 79, 19, 3, 7, 159, 5, 124, 3, 20], [4, 19, 361, 3, 47, 159, 5, 65, 3, 26, 6]]
An athlete with both arms raised kneels on the lush green pitch

3. Image data processing

We use the output of ResNet's average-pooling layer (the input to the final fully connected layer) as the image feature: the forward pass is modified so that it returns this penultimate-layer output instead of the 1000-class logits. The modified ResNet is then used to extract features for all 30,000 images:

from torchvision.models import resnet50

def new_forward(self, x):
    x = self.conv1(x)
    x = self.bn1(x)
    x = self.relu(x)
    x = self.maxpool(x)

    x = self.layer1(x)
    x = self.layer2(x)
    x = self.layer3(x)
    x = self.layer4(x)

    x = self.avgpool(x)
    x = x.view(x.size(0), -1)
    # x = self.fc(x)
    return x

model = resnet50(pretrained=True)
model.forward = lambda x: new_forward(model, x)
model = model.cuda()
model.eval()  # eval mode so BatchNorm uses its running statistics during feature extraction

import torchvision as tv  # common image transform utilities
from PIL import Image  # Pillow (PIL) for reading images
import numpy as np
import torch
from torch.utils import data
import os

IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]
normalize = tv.transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
transforms = tv.transforms.Compose([
    tv.transforms.Resize(256),
    tv.transforms.CenterCrop(256),
    tv.transforms.ToTensor(),
    normalize
])


class Dataset(data.Dataset):
    def __init__(self, caption_data_path):
        data = torch.load(
            '/mnt/Data1/ysc/ai_challenger_caption_validation_20170910/caption_2.pth')
        self.ix2id = data['ix2id']
        self.imgs = [os.path.join(caption_data_path, self.ix2id[_]) for _ in range(len(self.ix2id))]

    def __getitem__(self, item):
        x = Image.open(self.imgs[item]).convert('RGB')
        x = transforms(x)  # ([3, 256, 256])
        return x, item

    def __len__(self):
        return len(self.imgs)


batch_size = 32
dataset = Dataset(
    '/mnt/Data1/ysc/ai_challenger_caption_validation_20170910/caption_validation_images_20170910')
dataloader = data.DataLoader(dataset, batch_size=batch_size, shuffle=False)

results = torch.Tensor(len(dataloader.dataset), 2048).fill_(0)

with torch.no_grad():  # no gradients are needed for feature extraction
    for ii, (imgs, indexs) in enumerate(dataloader):
        assert indexs[0] == batch_size * ii
        imgs = imgs.cuda()
        features = model(imgs)
        results[ii * batch_size:(ii + 1) * batch_size] = features.data.cpu()
        print(ii * batch_size)

torch.save(results, '/mnt/Data1/ysc/ai_challenger_caption_validation_20170910/results_2048.pth')

The unmodified ResNet would produce 1000-dimensional output vectors; with the modified forward we instead obtain the 2048-dimensional features that feed the last linear layer, saved as results_2048.pth (torch.Size([30000, 2048])).
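A quick sanity check (a sketch, using the same paths as above) that the saved features line up one-to-one with the images indexed by ix2id:

import torch

features = torch.load('/mnt/Data1/ysc/ai_challenger_caption_validation_20170910/results_2048.pth')
data = torch.load('/mnt/Data1/ysc/ai_challenger_caption_validation_20170910/caption_2.pth')

print(features.shape)                           # torch.Size([30000, 2048])
assert features.shape[0] == len(data['ix2id'])  # one 2048-d feature per image; row i belongs to ix2id[i]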

4. Training

A quick note on the np.random.choice function (used later to pick one of the five captions at random):

np.random.choice(5, 3)
>> array([0, 3, 4]) # random

For more about pack_padded_sequence, see this.
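A toy illustration of what pack_padded_sequence does (a sketch, not from the book): padded positions are dropped, and batch_sizes records how many sequences are still active at each time step.

import torch
from torch.nn.utils.rnn import pack_padded_sequence

# Two sequences of length 3 and 2, already sorted longest first,
# padded with 0 and laid out as seq_len * batch (batch_first=False, the default)
padded = torch.tensor([[1, 4],
                       [2, 5],
                       [3, 0]])
packed = pack_padded_sequence(padded, lengths=[3, 2])
print(packed.data)         # tensor([1, 4, 2, 5, 3]) - the padding is gone
print(packed.batch_sizes)  # tensor([2, 2, 1]) - active sequences per time step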

The beam search code (CaptionGenerator) can be found on the book's official website.
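Beam search keeps the beam_size most probable partial captions at every step instead of committing to a single word. For comparison, here is a minimal greedy-decoding sketch (my own illustration, not the book's CaptionGenerator; it assumes the Net class from the full code in section 5, where embedding_dim == hidden_size):

import torch

def greedy_generate(model, img_feat, max_len=30):
    # Greedy decoding: always take the single most probable next word (a sketch, not beam search).
    # img_feat: a 2048-d image feature on the same device as the model; model: the trained Net.
    model.eval()
    with torch.no_grad():
        inp = model.fc(img_feat.unsqueeze(0)).unsqueeze(0)   # project to hidden_size; treat it as the first "word": 1 * 1 * hidden_size
        state = None
        words = []
        for _ in range(max_len):
            output, state = model.rnn(inp, state)            # one LSTM step: 1 * 1 * hidden_size
            pred = model.classifier(output.squeeze(0))       # 1 * vocab_size
            ix = pred.argmax(dim=1)                          # index of the most probable next word
            word = model.ix2word[ix.item()]
            if word == '</EOS>':
                break
            words.append(word)
            inp = model.embedding(ix).unsqueeze(0)           # feed the chosen word back in: 1 * 1 * embedding_dim
    return ''.join(words)

Greedy decoding often produces worse captions than beam search, which is why the book uses the latter.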

With the maximum number of epochs set to 5, the generated captions are checked every 100 iterations:

'One man by one </EOS>'
'A man dressed in </EOS>'
'A man in a shirt is on the playground </EOS>'
'A man in a shirt is on the playground </EOS>'
'A man in a football shirt is playing football on the playground </EOS>'
'Two men in jerseys are playing football on the pitch </EOS>'
'Two men in sportswear are playing football on the field </EOS>'
'A man with a microphone in his right hand performed on the stage </EOS>'
'A man in a hat is performing on the stage </EOS>'
'A man in a hat is singing on the stage </EOS>'
'A man in a hat and a man in a hat stood on the stage </EOS>'
'A man in a hat is performing on the stage </EOS>'
'A woman in a hat is performing on the stage </EOS>'
'A woman in a hat is performing on the stage </EOS>'
'A man in a hat is performing on the stage </EOS>'
'A man in a hat is singing on the stage </EOS>'
'A man in a hat and a man in a hat are standing on the grass </EOS>'
'A man in a hat and a man in a hat performed on the stage </EOS>'
'A man in a hat and a man in a hat are standing on the grass </EOS>'
'A man in a hat and a man in a hat were standing on the road </EOS>'
...


The model is saved after every epoch, named model_0.pth through model_4.pth.
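For reference, a minimal sketch of running a saved checkpoint on a custom photo (it assumes the modified ResNet-50 model_feature, the transforms, and the trained Net from the full code in section 5; 'my_photo.jpg' is a placeholder path):

from PIL import Image
import torch

raw_img = Image.open('my_photo.jpg').convert('RGB')        # placeholder path
img_tensor = transforms(raw_img).unsqueeze(0).cuda()       # 1 * 3 * 256 * 256

with torch.no_grad():
    img_feature = model_feature(img_tensor)                # 1 * 2048

model.load_state_dict(torch.load('model_4.pth'))           # or model_best.pth from the code below
print(model.generate(img_feature.squeeze(0)))              # list of candidate captions from beam search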

Testing on some of my own photos:

'A man in a football shirt is playing football on the court </EOS>'
'A man in a black coat and a man in a hat were standing on the road </EOS>'
'A man in a hat is performing on the stage </EOS>'
'A man in a hat and a man in a hat are walking on the road </EOS>'
'A man with a microphone in his right hand was singing on the stage </EOS>'

The results still have problems, so the maximum number of epochs is increased to 100:



No matter how the number and size of the hidden layers are changed, the results are not very good. The suspected causes are the beam search and noise in the dataset itself.

With the maximum number of epochs set to 50:


5. Full code

import torch
from torch.utils import data
import numpy as np
import tqdm
from torch.nn.utils.rnn import pack_padded_sequence
from beam_search import CaptionGenerator
from PIL import Image
import torchvision as tv
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
from torchvision.models import resnet50
from torch.utils.data.dataset import random_split


def new_forward(self, x):
    x = self.conv1(x)
    x = self.bn1(x)
    x = self.relu(x)
    x = self.maxpool(x)

    x = self.layer1(x)
    x = self.layer2(x)
    x = self.layer3(x)
    x = self.layer4(x)

    x = self.avgpool(x)
    x = x.view(x.size(0), -1)
    # x = self.fc(x)
    return x

model_feature = resnet50(pretrained=True)
model_feature.forward = lambda x: new_forward(model_feature, x)
model_feature = model_feature.cuda()
model_feature.eval()  # eval mode so BatchNorm uses its running statistics

IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]
normalize = tv.transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
transforms = tv.transforms.Compose([
    tv.transforms.Resize(256),
    tv.transforms.CenterCrop(256),
    tv.transforms.ToTensor(),
    normalize
])

class CaptionDataset(data.Dataset):
    def __init__(self):
        data = torch.load('/mnt/Data1/ysc/ai_challenger_caption_validation_20170910/caption_2.pth')
        # ix2word = data['ix2word']
        self.ix2id = data['ix2id']
        self.caption = data['caption']
        word2ix = data['word2ix']
        self.padding = word2ix.get(data.get('padding'))
        self.end = word2ix.get(data.get('end'))

        self.feature = torch.load('/mnt/Data1/ysc/ai_challenger_caption_validation_20170910/results_2048.pth')

    def __getitem__(self, item):
        img = self.feature[item]
        caption = self.caption[item]
        rdn_index = np.random.choice(len(caption), 1)[0]        # Choose one of the five descriptions at random
        caption = caption[rdn_index]
        return img, torch.LongTensor(caption), item

    def __len__(self):
        return len(self.ix2id)

def create_collate_fn(padding, eos, max_length=50):
    def collate_fn(img_cap):
        """
        Splice multiple samples together to form one batch
        Input: list of data,Shape such as
        [(img1, cap1, index1), (img2, cap2, index2) ....]

        The splicing strategy is as follows:
        - batch The description length of each sample is changing, and no word is discarded\
          Select the sentence with the longest length and add all sentences pad As long as
        - Not long enough</PAD>At the end PAD
        - No, START identifier 
        - If the length is exactly the same as the word, then there is No</EOS>

        return:
        - imgs(Tensor): batch_sie*2048
        - cap_tensor(Tensor): batch_size*max_length (I think it is wrong!)
        - lengths(list of int): Count Reg batch_size
        - index(list of int): Count Reg batch_size
        """
        img_cap.sort(key=lambda p: len(p[1]), reverse=True)
        imgs, caps, indexs = zip(*img_cap)
        imgs = torch.cat([img.unsqueeze(0) for img in imgs], 0)     # batch * 2048
        lengths = [min(len(c) + 1, max_length) for c in caps]
        batch_length = max(lengths)
        cap_tensor = torch.LongTensor(batch_length, len(caps)).fill_(padding)
        for i, c in enumerate(caps):
            end_cap = lengths[i] - 1
            if end_cap < batch_length:
                cap_tensor[end_cap, i] = eos
            cap_tensor[:end_cap, i].copy_(c[:end_cap])
        return (imgs, (cap_tensor, lengths), indexs)        # batch * 2048, (max_len * batch, ...), ...

    return collate_fn


batch_size = 32
max_epoch = 50
embedding_dim = 64
hidden_size = 64
lr = 1e-4
num_layers = 2

def get_dataloader():
    dataset = CaptionDataset()
    n_train = int(len(dataset) * 0.9)
    split_train, split_valid = random_split(dataset=dataset, lengths=[n_train, len(dataset) - n_train])
    train_dataloader = data.DataLoader(split_train, batch_size=batch_size, shuffle=True, num_workers=4,
                                 collate_fn=create_collate_fn(dataset.padding, dataset.end))
    valid_dataloader = data.DataLoader(split_valid, batch_size=batch_size, shuffle=True, num_workers=4,
                                       collate_fn=create_collate_fn(dataset.padding, dataset.end))
    return train_dataloader, valid_dataloader


class Net(torch.nn.Module):
    def __init__(self, word2ix, ix2word):
        super(Net,self).__init__()
        self.ix2word = ix2word
        self.word2ix = word2ix
        self.embedding = torch.nn.Embedding(len(word2ix), embedding_dim)
        self.fc = torch.nn.Linear(2048, hidden_size)
        self.rnn = torch.nn.LSTM(embedding_dim, hidden_size, num_layers=num_layers)
        self.classifier = torch.nn.Linear(hidden_size, len(word2ix))

    def forward(self, img_feats, captions, lengths):
        embeddings = self.embedding(captions)       # seq_len * batch * embedding_dim
        img_feats = self.fc(img_feats).unsqueeze(0)     # the 2048-d image feature is mapped to hidden_size by the fully connected layer so it can be treated like a word vector: 1 * batch * hidden_size (this requires hidden_size == embedding_dim)
        embeddings = torch.cat([img_feats, embeddings], 0)      # the image feature acts as the first "word" of the sequence: (1 + seq_len) * batch * embedding_dim
        packed_embeddings = pack_padded_sequence(embeddings, lengths)       # PackedSequence; lengths holds the valid length of each sequence in the batch
        outputs, state = self.rnn(packed_embeddings)    # the LSTM output at each time step is used to predict the index of the next word; since the input is a PackedSequence, the output is too: its first element is the data tensor, its second element is batch_sizes
        pred = self.classifier(outputs[0])
        return pred, state

    def generate(self, img, eos_token='</EOS>', beam_size=3, max_caption_length=30, length_normalization_factor=0.0):   # Generate a description for an image, using beam search to find better captions
        cap_gen = CaptionGenerator(embedder=self.embedding,
                                   rnn=self.rnn,
                                   classifier=self.classifier,
                                   eos_id=self.word2ix[eos_token],
                                   beam_size=beam_size,
                                   max_caption_length=max_caption_length,
                                   length_normalization_factor=length_normalization_factor)
        if next(self.parameters()).is_cuda:
            img = img.cuda()
        img = img.unsqueeze(0)
        img = self.fc(img).unsqueeze(0)
        sentences, score = cap_gen.beam_search(img)
        sentences = [' '.join([self.ix2word[idx.item()] for idx in sent])
                     for sent in sentences]
        return sentences



def evaluate(dataloader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for ii, (imgs, (captions, lengths), indexes) in enumerate(dataloader):
            imgs = imgs.to(device)
            captions = captions.to(device)
            input_captions = captions[:-1]
            target_captions = pack_padded_sequence(captions, lengths)[0]
            score, _ = model(imgs, input_captions, lengths)
            loss = criterion(score, target_captions)
            total_loss += loss.item()
    model.train()
    return total_loss

if __name__ == '__main__':
    train_dataloader, valid_dataloader = get_dataloader()
    _data = torch.load('/mnt/Data1/ysc/ai_challenger_caption_validation_20170910/caption_2.pth')
    word2ix, ix2word = _data['word2ix'], _data['ix2word']

    # max_loss = float('inf')     # 221
    max_loss = 263

    device = torch.device('cuda')


    model = Net(word2ix, ix2word)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss()

    model.to(device)

    losses = []
    valid_losses = []


    img_path = '/mnt/Data1/ysc/ai_challenger_caption_validation_20170910/123.jpg'
    raw_img = Image.open(img_path).convert('RGB')
    raw_img = transforms(raw_img)  # 3*256*256
    img_feature = model_feature(raw_img.cuda().unsqueeze(0))
    print(img_feature)

    for epoch in range(max_epoch):
        for ii, (imgs, (captions, lengths), indexes) in tqdm.tqdm(enumerate(train_dataloader)):
            optimizer.zero_grad()
            imgs = imgs.to(device)
            captions = captions.to(device)
            input_captions = captions[:-1]
            target_captions = pack_padded_sequence(captions, lengths)[0]
            score, _ = model(imgs, input_captions, lengths)
            loss = criterion(score, target_captions)
            loss.backward()
            optimizer.step()
            losses.append(loss.item())

            if (ii + 1) % 20 == 0:  # visualization
                # Visual original picture + visual manual description statement
                # raw_img = _data['ix2id'][indexes[0]]
                # img_path = '/mnt/Data1/ysc/ai_challenger_caption_validation_20170910/caption_validation_images_20170910/' + raw_img
                # raw_img = Image.open(img_path).convert('RGB')
                # raw_img = tv.transforms.ToTensor()(raw_img)
                #
                # raw_caption = captions.data[:, 0]
                # raw_caption = ''.join([_data['ix2word'][ii.item()] for ii in raw_caption])
                #
                # results = model.generate(imgs.data[0])
                #
                # print(img_path, raw_caption, results)
                #
                #
                # print(model.generate(img_feature.squeeze(0)))
                tmp = evaluate(valid_dataloader)
                valid_losses.append(tmp)
                if tmp < max_loss:
                    max_loss = tmp
                    torch.save(model.state_dict(),
                               '/mnt/Data1/ysc/ai_challenger_caption_validation_20170910/model_best.pth')
                    print(max_loss)     # 190 111
    plt.figure(1)
    plt.plot(losses)
    plt.figure(2)
    plt.plot(valid_losses)
    plt.show()


    # model.load_state_dict(torch.load('/mnt/Data1/ysc/ai_challenger_caption_validation_20170910/model_best.pth'))
    # print(model.generate(img_feature.squeeze(0)))

6. Summary

The final results are not very good. It seems that whenever I work with image tasks, the results turn out less than ideal.

Topics: AI neural networks Pytorch Deep Learning NLP