Customize the dataset and teach you to make your own VOC data reader

Posted by Brad on Sat, 05 Mar 2022 09:42:50 +0100

Link from AI Studio project https://aistudio.baidu.com/aistudio/projectdetail/2432755

Project background

I recently found that there was no good example when I read the resnet50+FPN version of Faster-RCNN when I was preloading and loading data, when I read the class loaded by paddlepaddle's official data.

So I did this today, how to make a data loader to read the standard Voc2012 data, and then I will use my own data set to do a custom data reading

Paddy dataset definition and loading document

Introduction to VOC dataset

The Pascal VOC data set provided by ai studio is used this time

Pascal VOC dataset, including voc2007 and voc2012 data, is mainly used for visual tasks such as target detection and semantic separation

The following is the structure of Pascal VOC dataset directory

.
└── VOCdevkit     #root directory
    └── VOC2012   #For data sets of different years, only 2012 and other years such as 2007 are downloaded here
        ├── Annotations        #Store xml files, correspond to the pictures in JPEGImages one by one, explain the contents of the pictures, and so on
        ├── ImageSets          #All stored in this directory are txt files. Each line in the txt file contains the name of a picture, and ± 1 will be added at the end to represent positive and negative samples
        │   ├── Action
        │   ├── Layout
        │   ├── Main
        │   └── Segmentation
        ├── JPEGImages         #Store source pictures
        ├── SegmentationClass  #It stores pictures, which are related to semantic segmentation
        └── SegmentationObject #Images are stored, and instance segmentation is related

Here we will use the VOC2012 data set

Because fast RCNN is used for target detection, we will use the train in the Main of Annotations, JPEGImages, and ImageSets Txt and val.txt

Annotations is the directory where the xml files are stored

JPEGImages directory where image files are stored

train.txt to store the name of the training file

val.txt TXT TXT for storing the name of the verification file

First, we decompress the Pascal VOC data set

!unzip -oq data/data4379/pascalvoc.zip

Because we only use VOC2012 here, move the VOC2012 folder to the root directory

!mv pascalvoc/VOCdevkit/VOC2012 ./

Custom dataset parsing

paddlepaddle official provides a very simple case of custom dataset.

import paddle
from paddle.io import Dataset

BATCH_SIZE = 64
BATCH_NUM = 20

IMAGE_SIZE = (28, 28)
CLASS_NUM = 10


class MyDataset(Dataset):
    """
    Step 1: inherit paddle.io.Dataset class
    """
    def __init__(self, num_samples):
        """
        Step 2: implement the constructor and define the size of the dataset
        """
        super(MyDataset, self).__init__()
        self.num_samples = num_samples

    def __getitem__(self, index):
        """
        Step 3: Implement__getitem__Method, defining and specifying index How to obtain data and return a single piece of data (training data, corresponding label)
        """
        data = paddle.uniform(IMAGE_SIZE, dtype='float32')
        label = paddle.randint(0, CLASS_NUM-1, dtype='int64')

        return data, label

    def __len__(self):
        """
        Step 4: Implement__len__Method to return the total number of data sets
        """
        return self.num_samples

# Test defined dataset
custom_dataset = MyDataset(BATCH_SIZE * BATCH_NUM)

print('=============custom dataset=============')
for data, label in custom_dataset:
    print(data.shape, label.shape)
    break

We can implement it step by step according to his appearance

Create a class and define it

# Define the data reading class and inherit the paddle io. Dataset
class VOCDataset(paddle.io.Dataset):

Implement the constructor to define the data set reading path

In__ init__ Method, we need to define the path to read each folder of VOC2012
At the same time, you also need to read the category file of VOC2012 dataset

I put the category file of VOC2012 dataset in the root directory

Path: pascal_voc_classes.json

def __init__(self,voc_root, year='2012',transforms=None, txt_name:str = 'train.txt'):
        assert year in ['2007','2012'], "year must be in ['2007','2012']"
        self.root = os.path.join(voc_root,f"VOC{year}")
        self.img_root = os.path.join(self.root,'JPEGImages')
        self.annotations_root = os.path.join(self.root,'Annotations')

        txt_path = os.path.join(self.root,"ImageSets",'Main',txt_name)
        assert os.path.exists(txt_path),'not found {} file'.format(txt_name)

        with open(txt_path) as read:
            self.xml_list = [os.path.join(self.annotations_root,line.strip()+'.xml')
                            for line in read.readlines() if len(line.strip()) >0 ]
        

        #check file
        assert len(self.xml_list) > 0, "in '{}' file does not find any information.".format(txt_path)
        for xml_path in self.xml_list:
            assert os.path.exists(xml_path), "not found '{}' file.".format(xml_path)
        
        # read class_indict
        json_file = './pascal_voc_classes.json'
        assert os.path.exists(json_file), "{} file not exist.".format(json_file)
        json_file = open(json_file, 'r')
        self.class_dict = json.load(json_file)
        json_file.close()

        self.transforms = transforms
    

Realize__ getitem__ Method to define how to obtain data when specifying index and return a single piece of data (training data, corresponding label)

def __getitem__(self, idx):
        # read xml
        xml_path = self.xml_list[idx]
        with open(xml_path) as fid:
            xml_str = fid.read()
        xml = etree.fromstring(xml_str)
        data = self.parse_xml_to_dict(xml)["annotation"]
        img_path = os.path.join(self.img_root, data["filename"])
        image = Image.open(img_path)
        if image.format != "JPEG":
            raise ValueError("Image '{}' format not JPEG".format(img_path))

        boxes = []
        labels = []
        iscrowd = []



        assert "object" in data, "{} lack of object information.".format(xml_path)
        for obj in data["object"]:
            xmin = float(obj["bndbox"]["xmin"])
            xmax = float(obj["bndbox"]["xmax"])
            ymin = float(obj["bndbox"]["ymin"])
            ymax = float(obj["bndbox"]["ymax"])

            # Further check the data. There may be cases where w or h is 0 in some annotation information. Such data will cause the calculated regression loss to be nan
            if xmax <= xmin or ymax <= ymin:
                print("Warning: in '{}' xml, there are some bbox w/h <=0".format(xml_path))
                continue
            
            boxes.append([xmin, ymin, xmax, ymax])
            labels.append(self.class_dict[obj["name"]])
            if "difficult" in obj:
                iscrowd.append(int(obj["difficult"]))
            else:
                iscrowd.append(0)

        # convert everything into a paddle.Tensor
        boxes = paddle.to_tensor(boxes).astype('float32')
        labels = paddle.to_tensor(labels).astype('int32')
        iscrowd = paddle.to_tensor(iscrowd, dtype=paddle.int64)
        image_id = paddle.to_tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])


        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd

        if self.transforms is not None:
            image, target = self.transforms(image, target)
    
        return image, target
    
    def parse_xml_to_dict(self, xml):
        """
        take xml The document is parsed into dictionary form for reference tensorflow of recursive_parse_xml_to_dict
        Args:
            xml: xml tree obtained by parsing XML file contents using lxml.etree

        Returns:
            Python dictionary holding XML contents.
        """

        if len(xml) == 0:  # Traverse the bottom layer and directly return the information corresponding to the tag
            return {xml.tag: xml.text}

        result = {}
        for child in xml:
            child_result = self.parse_xml_to_dict(child)  # Recursive traversal of label information
            if child.tag != 'object':
                result[child.tag] = child_result[child.tag]
            else:
                if child.tag not in result:  # Because there may be multiple object s, they need to be placed in the list
                    result[child.tag] = []
                result[child.tag].append(child_result[child.tag])
        return {xml.tag: result}

parse_ xml_ to_ Data returned by dict method

{'filename': '2010_001142.jpg', 'folder': 'VOC2012', 'object': [{'name': 'bottle', 'bndbox': {'xmax': '282', 'xmin': '264', 'ymax': '244', 'ymin': '210'}, 'difficult': '0', 'occluded': '0', 'pose': 'Unspecified', 'truncated': '0'}, {'name': 'bottle', 'bndbox': {'xmax': '308', 'xmin': '295', 'ymax': '184', 'ymin': '162'}, 'difficult': '1', 'occluded': '0', 'pose': 'Unspecified', 'truncated': '0'}, {'name': 'bottle', 'bndbox': {'xmax': '270', 'xmin': '254', 'ymax': '224', 'ymin': '196'}, 'difficult': '1', 'occluded': '0', 'pose': 'Unspecified', 'truncated': '1'}, {'name': 'bottle', 'bndbox': {'xmax': '292', 'xmin': '281', 'ymax': '225', 'ymin': '204'}, 'difficult': '1', 'occluded': '0', 'pose': 'Unspecified', 'truncated': '1'}, {'name': 'bottle', 'bndbox': {'xmax': '221', 'xmin': '212', 'ymax': '227', 'ymin': '208'}, 'difficult': '1', 'occluded': '0', 'pose': 'Unspecified', 'truncated': '0'}, {'name': 'person', 'bndbox': {'xmax': '371', 'xmin': '315', 'ymax': '220', 'ymin': '103'}, 'difficult': '0', 'occluded': '1', 'pose': 'Frontal', 'truncated': '1'}, {'name': 'person', 'bndbox': {'xmax': '379', 'xmin': '283', 'ymax': '342', 'ymin': '171'}, 'difficult': '0', 'occluded': '0', 'pose': 'Left', 'truncated': '0'}, {'name': 'person', 'bndbox': {'xmax': '216', 'xmin': '156', 'ymax': '260', 'ymin': '180'}, 'difficult': '0', 'occluded': '1', 'pose': 'Right', 'truncated': '1'}, {'name': 'person', 'bndbox': {'xmax': '223', 'xmin': '205', 'ymax': '198', 'ymin': '172'}, 'difficult': '1', 'occluded': '1', 'pose': 'Frontal', 'truncated': '1'}, {'name': 'person', 'bndbox': {'xmax': '280', 'xmin': '218', 'ymax': '234', 'ymin': '155'}, 'difficult': '0', 'occluded': '1', 'pose': 'Right', 'truncated': '1'}, {'name': 'person', 'bndbox': {'xmax': '343', 'xmin': '292', 'ymax': '241', 'ymin': '185'}, 'difficult': '1', 'occluded': '1', 'pose': 'Left', 'truncated': '1'}], 'segmented': '0', 'size': {'depth': '3', 'height': '375', 'width': '500'}, 'source': {'annotation': 'PASCAL VOC2010', 'database': 'The VOC2010 Database', 'image': 'flickr'}}
!pip install lxml
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Requirement already satisfied: lxml in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (4.8.0)
[33mWARNING: You are using pip version 21.3.1; however, version 22.0.3 is available.
You should consider upgrading via the '/opt/conda/envs/python35-paddle120-env/bin/python -m pip install --upgrade pip' command.[0m
import paddle
import os
import json
from PIL import Image
from lxml import etree


# Define the data reading class and inherit the paddle io. Dataset
class VOCDataset(paddle.io.Dataset):

    def __init__(self,voc_root, year='2012',transforms=None, txt_name:str = 'train.txt'):
        assert year in ['2007','2012'], "year must be in ['2007','2012']"
        self.root = os.path.join(voc_root,f"VOC{year}")
        self.img_root = os.path.join(self.root,'JPEGImages')
        self.annotations_root = os.path.join(self.root,'Annotations')

        txt_path = os.path.join(self.root,"ImageSets",'Main',txt_name)
        assert os.path.exists(txt_path),'not found {} file'.format(txt_name)

        with open(txt_path) as read:
            self.xml_list = [os.path.join(self.annotations_root,line.strip()+'.xml')
                            for line in read.readlines() if len(line.strip()) >0 ]
        

        #check file
        assert len(self.xml_list) > 0, "in '{}' file does not find any information.".format(txt_path)
        for xml_path in self.xml_list:
            assert os.path.exists(xml_path), "not found '{}' file.".format(xml_path)
        
        # read class_indict
        json_file = './pascal_voc_classes.json'
        assert os.path.exists(json_file), "{} file not exist.".format(json_file)
        json_file = open(json_file, 'r')
        self.class_dict = json.load(json_file)
        json_file.close()

        self.transforms = transforms

    def __len__(self):
        return len(self.xml_list)

    def __getitem__(self, idx):
        # read xml
        xml_path = self.xml_list[idx]
        with open(xml_path) as fid:
            xml_str = fid.read()
        xml = etree.fromstring(xml_str)
        data = self.parse_xml_to_dict(xml)["annotation"]
        img_path = os.path.join(self.img_root, data["filename"])
        image = Image.open(img_path)
        if image.format != "JPEG":
            raise ValueError("Image '{}' format not JPEG".format(img_path))

        boxes = []
        labels = []
        iscrowd = []



        assert "object" in data, "{} lack of object information.".format(xml_path)
        for obj in data["object"]:
            xmin = float(obj["bndbox"]["xmin"])
            xmax = float(obj["bndbox"]["xmax"])
            ymin = float(obj["bndbox"]["ymin"])
            ymax = float(obj["bndbox"]["ymax"])

            # Further check the data. There may be cases where w or h is 0 in some annotation information. Such data will cause the calculated regression loss to be nan
            if xmax <= xmin or ymax <= ymin:
                print("Warning: in '{}' xml, there are some bbox w/h <=0".format(xml_path))
                continue
            
            boxes.append([xmin, ymin, xmax, ymax])
            labels.append(self.class_dict[obj["name"]])
            if "difficult" in obj:
                iscrowd.append(int(obj["difficult"]))
            else:
                iscrowd.append(0)

        # convert everything into a paddle.Tensor
        boxes = paddle.to_tensor(boxes).astype('float32')
        labels = paddle.to_tensor(labels).astype('int32')
        iscrowd = paddle.to_tensor(iscrowd, dtype=paddle.int64)
        image_id = paddle.to_tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])


        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd

        if self.transforms is not None:
            image, target = self.transforms(image, target)
    
        return image, target
    
    def parse_xml_to_dict(self, xml):
        """
        take xml The document is parsed into dictionary form for reference tensorflow of recursive_parse_xml_to_dict
        Args:
            xml: xml tree obtained by parsing XML file contents using lxml.etree

        Returns:
            Python dictionary holding XML contents.
        """

        if len(xml) == 0:  # Traverse the bottom layer and directly return the information corresponding to the tag
            return {xml.tag: xml.text}

        result = {}
        for child in xml:
            child_result = self.parse_xml_to_dict(child)  # Recursive traversal of label information
            if child.tag != 'object':
                result[child.tag] = child_result[child.tag]
            else:
                if child.tag not in result:  # Because there may be multiple object s, they need to be placed in the list
                    result[child.tag] = []
                result[child.tag].append(child_result[child.tag])
        return {xml.tag: result}
    
    def collate_fn(batch):
        return tuple(zip(*batch))

with open('VOC2012/ImageSets/Main/train.txt') as t:
    pass
train_dataset = VOCDataset('./', "2012")
print(train_dataset.class_dict)
{'aeroplane': 1, 'bicycle': 2, 'bird': 3, 'boat': 4, 'bottle': 5, 'bus': 6, 'car': 7, 'cat': 8, 'chair': 9, 'cow': 10, 'diningtable': 11, 'dog': 12, 'horse': 13, 'motorbike': 14, 'person': 15, 'pottedplant': 16, 'sheep': 17, 'sofa': 18, 'train': 19, 'tvmonitor': 20}

VOC read test

import paddle.vision.transforms as transforms
from draw_box_utils import draw_box
from PIL import Image
import json
import matplotlib.pyplot as plt
import random

# read class_indict
category_index = {}
try:
    json_file = open('./pascal_voc_classes.json', 'r')
    class_dict = json.load(json_file)
    category_index = {v: k for k, v in class_dict.items()}
except Exception as e:
    print(e)
    exit(-1)

data_transform = {
    "train": transforms.Compose([transforms.ToTensor(),
                                 transforms.RandomHorizontalFlip(0.5)]),
    "val": transforms.Compose([transforms.ToTensor()])
}

# load train data set
train_data_set = VOCDataset('./', "2012")
print(len(train_data_set))
for index in random.sample(range(0, len(train_data_set)), k=5):
    img, target = train_data_set[index]
    draw_box(img,
             target["boxes"].numpy(),
             target["labels"].numpy(),
             [1 for i in range(len(target["labels"].numpy()))],
             category_index,
             thresh=0.5,
             line_thickness=5)
    plt.imshow(img)
    plt.show()
5717

Class VOC data

!unzip -oq data/data106197/voc.zip
import paddle
import os
import json
from PIL import Image
from lxml import etree

# Define the data reading class and inherit the paddle io. Dataset
class Selfataset(paddle.io.Dataset):

    def __init__(self,voc_root,transforms=None,txt_name:str = 'train.txt'):
        self.root =voc_root
        self.img_root = os.path.join(self.root,'JPEGImages')
        self.annotations_root = os.path.join(self.root,'Annotations')

        txt_path = os.path.join(self.root,txt_name)
        print(txt_path)
        assert os.path.exists(txt_path),'not found {} file'.format(txt_name)


#self.xml_list = [os.path.join(self.annotations_root,line.strip()+'.xml')
                            #for line in read.readlines() if len(line.strip()) >0 ]
        self.image_list = []
        self.xml_list = []
        with open(txt_path) as read:
            self.path_list = [line.strip() for line in read.readlines() if len(line.strip()) >0 ] 
            for path in self.path_list:
                self.image_list.append(os.path.join(self.root,path.split(' ')[0]))
                self.xml_list.append(os.path.join(self.root,path.split(' ')[1]))
        
        assert len(self.xml_list) > 0, "in '{}' file does not find any information.".format(txt_path)
        for xml_path in self.xml_list:
            assert os.path.exists(xml_path), "not found '{}' file.".format(xml_path)
        
        #read class
        self.class_dict = {}
        self.class_path = os.path.join(self.root,'labels.txt')
        print(self.class_path)
        with open(self.class_path) as read:
            self.classes = [class_name.strip() for class_name in read.readlines() ]
            print(self.classes)
            for number,class_name in enumerate(self.classes,1):
                self.class_dict[class_name] = number



        self.transforms = transforms

    def __len__(self):
        return len(self.xml_list)

    def __getitem__(self, idx):
        # read xml
        xml_path = self.xml_list[idx]
        with open(xml_path) as fid:
            xml_str = fid.read()
        xml = etree.fromstring(xml_str)
        data = self.parse_xml_to_dict(xml)["annotation"]
        #print(data)
        img_path = os.path.join(self.img_root, data["frame"]+'.jpg')
        image = Image.open(img_path)
        #if image.format != "JPEG":
            #raise ValueError("Image '{}' format not JPEG".format(img_path))

        boxes = []
        labels = []
        iscrowd = []
        assert "object" in data, "{} lack of object information.".format(xml_path)
        for obj in data["object"]:
            xmin = float(obj["bndbox"]["xmin"])
            xmax = float(obj["bndbox"]["xmax"])
            ymin = float(obj["bndbox"]["ymin"])
            ymax = float(obj["bndbox"]["ymax"])

            # Further check the data. There may be cases where w or h is 0 in some annotation information. Such data will cause the calculated regression loss to be nan
            if xmax <= xmin or ymax <= ymin:
                print("Warning: in '{}' xml, there are some bbox w/h <=0".format(xml_path))
                continue
            
            boxes.append([xmin, ymin, xmax, ymax])
            labels.append(self.class_dict[obj["name"]])
            if "difficult" in obj:
                iscrowd.append(int(obj["difficult"]))
            else:
                iscrowd.append(0)

        # convert everything into a paddle.Tensor
        boxes = paddle.to_tensor(boxes).astype('float32')
        labels = paddle.to_tensor(labels).astype('int32')
        iscrowd = paddle.to_tensor(iscrowd, dtype=paddle.int64)
        image_id = paddle.to_tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])


        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd

        if self.transforms is not None:
            image, target = self.transforms(image, target)
    
        return image, target
    
    def parse_xml_to_dict(self, xml):
        """
        take xml The document is parsed into dictionary form for reference tensorflow of recursive_parse_xml_to_dict
        Args:
            xml: xml tree obtained by parsing XML file contents using lxml.etree

        Returns:
            Python dictionary holding XML contents.
        """

        if len(xml) == 0:  # Traverse the bottom layer and directly return the information corresponding to the tag
            return {xml.tag: xml.text}

        result = {}
        for child in xml:
            child_result = self.parse_xml_to_dict(child)  # Recursive traversal of label information
            if child.tag != 'object':
                result[child.tag] = child_result[child.tag]
            else:
                if child.tag not in result:  # Because there may be multiple object s, they need to be placed in the list
                    result[child.tag] = []
                result[child.tag].append(child_result[child.tag])
        return {xml.tag: result}

    def collate_fn(batch):
        return tuple(zip(*batch))
a = Selfataset('voc',None,'train_list.txt')
voc/train_list.txt
voc/labels.txt
['flv', 'gx', 'mbw']
a.class_dict
{'flv': 1, 'gx': 2, 'mbw': 3}
import paddle.vision.transforms as transforms
from draw_box_utils import draw_box
from PIL import Image
import json
import matplotlib.pyplot as plt
import random

# read class_indict
category_index = {}
try:
    json_file = open('./pascal_voc_classes.json', 'r')
    class_dict = json.load(json_file)
    category_index = {v: k for k, v in class_dict.items()}
except Exception as e:
    print(e)
    exit(-1)

data_transform = {
    "train": transforms.Compose([transforms.ToTensor(),
                                 transforms.RandomHorizontalFlip(0.5)]),
    "val": transforms.Compose([transforms.ToTensor()])
}

# load train data set
train_data_set = Selfataset('voc',None,'train_list.txt')
print(len(train_data_set))
for index in random.sample(range(0, len(train_data_set)), k=5):
    img, target = train_data_set[index]
    draw_box(img,
             target["boxes"].numpy(),
             target["labels"].numpy(),
             [1 for i in range(len(target["labels"].numpy()))],
             category_index,
             thresh=0.6,
             line_thickness=5)
    plt.imshow(img)
    plt.show()
# targetn = []
# for index in range(0, len(train_data_set)):
#     try:
#         img, target = train_data_set[index]
#         targetn.append(target["labels"].numpy())
#     except:
#         pass

voc/train_list.txt
voc/labels.txt
['flv', 'gx', 'mbw']
1216

Topics: Python Computer Vision Deep Learning paddlepaddle