[CUDA basic exercise] Top-view projection of a laser point cloud to a depth image

Posted by dr.wong on Fri, 31 Dec 2021 08:21:41 +0100

OK, let's continue with another CUDA exercise. Among the many approaches to object detection on laser point clouds, one family works on a depth image obtained by projecting the point cloud from a bird's-eye view (BEV). The projection step itself can be written with very simple logic in either Python or C++; however, when the number of points is large, CUDA can be used to speed it up. For comparison, the C++ implementation is given first.

[source code]

#include <stdio.h>
#include <cmath>
#include <vector>
#include "bev.h"

int scale_to_255(float z, float z_min, float z_max) {
    return int(((z-z_min)/(z_max-z_min)) * 255);
}

void bev(const float* points,
         int num_points,
         unsigned char* out,
         const std::vector<float>& cloud_range,
         const std::vector<float>& voxel_size,
         const std::vector<int>& grid_size,
         int in_cols) {
    for (int i=0; i<num_points; i++) {
        const float* point = points + i*in_cols;
        int x  = std::floor((point[0] - cloud_range[0]) / voxel_size[0]);
        int y  = std::floor((point[1] - cloud_range[1]) / voxel_size[1]);
        int z  = std::floor((point[2] - cloud_range[2]) / voxel_size[2]);
        if (x<0 || y<0 || z<0 || x>=grid_size[0] || y>=grid_size[1] || z>=grid_size[2]) {
            continue;
        }
        //TODO: add range filter here
        // flip the axes for display; the -1 keeps the index inside [0, grid_size-1]
        int row = grid_size[0] - 1 - x;
        int col = grid_size[1] - 1 - y;
        int tmp = scale_to_255(point[2],cloud_range[2],cloud_range[5]);
        int offset = row*grid_size[1] + col;
        out[offset] = tmp > out[offset] ? tmp : out[offset];
        //printf("row:%d,col:%d,offset:%d\n",row,col,offset);
    }
}

Parameter Description:

cloud_range: the point cloud range, [xmin, ymin, zmin, xmax, ymax, zmax]. It frames a 3D box, and only points inside this box are considered.

voxel_size: the voxel size, [dx, dy, dz]. Since the top-view projection goes from 3D to 2D, the grid is only one voxel tall in z; in practice dz is simply set to (zmax - zmin).

grid_size: the grid size, [grid_x, grid_y, grid_z], which can be computed from the two parameters above; with the values used in the test below it works out to [1024, 1024, 1].

void Test(const pcl::PointCloud<pcl::PointXYZ>::Ptr cloud) {
    printf("recv points:%d \n", cloud->width);

    // flatten the PCL cloud into the row-major float buffer (x, y, z per point) consumed by bev()
    int num_points = cloud->width * cloud->height;
    int in_cols = 3;
    float* data = (float*)malloc(sizeof(float) * in_cols * num_points);
    for (int i = 0; i < num_points; i++) {
        data[i * in_cols + 0] = cloud->points[i].x;
        data[i * in_cols + 1] = cloud->points[i].y;
        data[i * in_cols + 2] = cloud->points[i].z;
    }

    std::vector<int> grid_size(3);
    std::vector<float> voxel_size = {0.09765, 0.09765, 20};
    std::vector<float> cloud_range = {0.0, -50., -5., 100., 50., 15.};
    for (int i = 0; i < 3; i++) {
        grid_size[i] = round((cloud_range[3 + i] - cloud_range[i]) / voxel_size[i]);
    }
    //printf("grid_size[0]:%d,grid_size[1]:%d\n",grid_size[0],grid_size[1]);
 
    cv::Mat gray(grid_size[0], grid_size[1], CV_8UC1, cv::Scalar(0));
    if (0) {
        //write cuda bev here
    } else {
        std::chrono::steady_clock::time_point t1 = std::chrono::steady_clock::now();
        bev(data,num_points,gray.data,cloud_range,voxel_size,grid_size,in_cols);
        std::chrono::steady_clock::time_point t2 = std::chrono::steady_clock::now();
        std::chrono::duration<double, std::ratio<1, 1000>> time_span =
                std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1000>>>(t2 - t1);
        std::cout << "time_cost: " << time_span.count() << " ms" << std::endl;
    }   
 
    {   
        cv::imshow("preview", gray);
        auto key = cv::waitKey(5);
        if (key == 'q' || key == 'Q') {
            exit(0);
        }   
    }   
 
    free(data);
}

With the code above you can quickly test and display the projected depth image. I set a 100 x 100 x 20 (m) region, and the grid_size computed from voxel_size is [1024, 1024, 1].

[test]

recv points:58900 
time_cost: 1.00689 ms
recv points:57131 
time_cost: 2.32245 ms
recv points:56297 
time_cost: 0.959922 ms
recv points:56186 
time_cost: 2.5897 ms
recv points:56627 
time_cost: 2.41959 ms

Obviously, the loop in this C++ implementation can be replaced by CUDA's parallel execution. Note that in the top-view projection the height of each point is encoded into the gray value of the image, and when several points project onto the same pixel, only the highest point is kept. This is easy to handle in a sequential loop: a simple comparison guarantees that the highest point is the one finally encoded. In CUDA, however, every point computes its projected coordinates and gray value independently, so to make sure each pixel still encodes its highest point, I split the projection into two steps:
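Both kernels hang the points that fall into the same pixel onto a small per-point node, HashEntry. Its definition is not listed in this post; a minimal sketch, assuming it only needs the encoded gray value and a link to the next point of the same pixel, would be:

struct HashEntry {
    int gray_var;   // gray value encoded from the point height via scale_to_255()
    int next_id;    // index of the next point in this pixel's list, -1 if it is the last one
};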

Step 1: for each point, compute only its projected pixel position and gray value, and atomically link it into that pixel's list;

__device__ int scale_to_255(float z, float z_min, float z_max) {
    return int(((z-z_min)/(z_max-z_min)) * 255);
}

__global__ void InitializeMapEntries(const float* __restrict__  points,
                                     const int num_points,
                                     float range_min_x, float range_min_y, float range_min_z, float range_max_z,
                                     float voxel_size_x, float voxel_size_y, float voxel_size_z,
                                     int grid_x, int grid_y, int grid_z,
                                     int* grid_first_entry, HashEntry* entry_list,
                                     int cols)
{
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    int stride = gridDim.x * blockDim.x;

    for(int i = idx ; i < num_points; i += stride)
    {   
        const float* cur_point = points + i * cols;
        int x  = __float2int_rd((cur_point[0] - range_min_x) / voxel_size_x);
        int y  = __float2int_rd((cur_point[1] - range_min_y) / voxel_size_y);
        int z  = __float2int_rd((cur_point[2] - range_min_z) / voxel_size_z);
        if(x<0 || y<0 || z<0 || x>=grid_x || y>=grid_y || z>=grid_z) continue;

        // flip the axes for display; the -1 keeps the index inside the grid
        int row = grid_x - 1 - x;
        int col = grid_y - 1 - y;

        HashEntry* entry = entry_list + i;
        entry->gray_var = scale_to_255(cur_point[2], range_min_z, range_max_z);
        int grid_idx = row * grid_y + col;

        // push this point onto the head of the pixel's singly linked list:
        // atomically swap the stored head index with i, then remember the old head as next_id
        int* address = grid_first_entry + grid_idx;
        int new_var = i;
        int cur_var = *address;
        int assumed;

        do {
            assumed = cur_var;
            cur_var = atomicCAS(address, assumed, new_var);
        } while (assumed != cur_var);
        entry->next_id = cur_var;
    }
}
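By the way, since nothing traverses the lists while they are being built, the compare-and-swap loop at the end of this kernel could arguably be collapsed into a single atomic exchange; a possible simplification, using the same variables as above:

        // same effect as the CAS loop: publish i as the new head and keep the old head as successor
        entry->next_id = atomicExch(grid_first_entry + grid_idx, i);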

Step 2: at each valid pixel, walk the list of points that landed there and keep the gray value of the highest point as the pixel's gray value.

[core code]

__global__
void ExtractValidOutPixelKernel(const float* points, unsigned char* d_out, HashEntry* entry_list, int* grid_first_entry, int grid_bev_size)
{
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    int stride = gridDim.x * blockDim.x;

    for(int i = idx ; i < grid_bev_size; i += stride)
    {
        int entry_idx = grid_first_entry[i];
        if (entry_idx >= 0) {
            // walk this pixel's point list and keep the largest gray value (the highest point)
            HashEntry* entry = entry_list + entry_idx;
            int max_gray = entry->gray_var;
            int next_idx = entry->next_id;
            while(next_idx >= 0)
            {
                HashEntry* entry_next = entry_list + next_idx;
                next_idx = entry_next -> next_id;
                max_gray = entry_next->gray_var > max_gray ? entry_next->gray_var : max_gray;
            }
            d_out[i] = max_gray;
        }
    }
}
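Since the only per-pixel reduction that is really needed here is a maximum, the linked-list bookkeeping could in principle be skipped. Purely as a sketch of an alternative design, not what this post implements: a single kernel using atomicMax on a per-pixel int buffer, pre-initialized to 0 and reusing the __device__ scale_to_255 above, produces the same image; the int buffer then only needs to be narrowed to unsigned char before copying it back.

__global__ void BevAtomicMaxKernel(const float* __restrict__ points, int num_points, int cols,
                                   float range_min_x, float range_min_y, float range_min_z, float range_max_z,
                                   float voxel_size_x, float voxel_size_y, float voxel_size_z,
                                   int grid_x, int grid_y, int grid_z,
                                   int* gray_int)  // grid_x*grid_y ints, pre-filled with 0
{
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    int stride = gridDim.x * blockDim.x;

    for (int i = idx; i < num_points; i += stride) {
        const float* p = points + i * cols;
        int x = __float2int_rd((p[0] - range_min_x) / voxel_size_x);
        int y = __float2int_rd((p[1] - range_min_y) / voxel_size_y);
        int z = __float2int_rd((p[2] - range_min_z) / voxel_size_z);
        if (x < 0 || y < 0 || z < 0 || x >= grid_x || y >= grid_y || z >= grid_z) continue;

        int row = grid_x - 1 - x;
        int col = grid_y - 1 - y;
        // the highest point wins; atomicMax resolves collisions without any per-pixel list
        atomicMax(gray_int + row * grid_y + col, scale_to_255(p[2], range_min_z, range_max_z));
    }
}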

void cuda_bev(const float* points,
         int num_points,
         unsigned char* out,
         const std::vector<float>& cloud_range,
         const std::vector<float>& voxel_size,
         const std::vector<int>& grid_size,
         int in_cols)
{
    float * d_points;
    unsigned char* d_out;
    HashEntry* d_hash_list;
    int* d_grid_first_entry;
    int grid_bev_size = grid_size[0]*grid_size[1];

    CHECK(cudaMalloc((void**)&d_points, sizeof(float)*in_cols*num_points));
    CHECK(cudaMalloc((void**)&d_out, sizeof(unsigned char)*grid_bev_size));
    CHECK(cudaMalloc((void**)&d_hash_list, sizeof(HashEntry)*num_points));
    CHECK(cudaMalloc((void**)&d_grid_first_entry, sizeof(int)*grid_bev_size));
    init_int_(d_grid_first_entry, -1, grid_bev_size);
    init_uchar_(d_out, 0, grid_bev_size);
    init_hash_list_(d_hash_list, num_points);

    cudaEvent_t start,stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));
    CHECK(cudaEventRecord(start));
    cudaEventQuery(start);

    CHECK(cudaMemcpy((void*)d_points, points, sizeof(float)*in_cols*num_points, cudaMemcpyHostToDevice));

    int blockSize=512;
    int minGridSize=1;
    int num_thread = num_points;
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, InitializeMapEntries, 0, num_thread);
    minGridSize = std::min(minGridSize, DivUp(num_thread, blockSize));
    printf("%s:%d gridSize:%d,blockSize:%d\n",__FUNCTION__ ,__LINE__,minGridSize,blockSize);
    InitializeMapEntries<<<minGridSize, blockSize>>>(d_points, num_points,
                                                     cloud_range[0], cloud_range[1], cloud_range[2], cloud_range[5],
                                                     voxel_size[0], voxel_size[1], voxel_size[2],
                                                     grid_size[0], grid_size[1], grid_size[2],
                                                     d_grid_first_entry, d_hash_list, in_cols);

    num_thread = grid_bev_size;
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, ExtractValidOutPixelKernel, 0, num_thread);
    minGridSize = std::min(minGridSize, DivUp(num_thread, blockSize));
    printf("%s:%d gridSize:%d,blockSize:%d\n",__FUNCTION__ ,__LINE__,minGridSize,blockSize);
    ExtractValidOutPixelKernel<<<minGridSize, blockSize>>>(d_points, d_out, d_hash_list, d_grid_first_entry, grid_bev_size);

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch
    CHECK(cudaDeviceSynchronize());
    CHECK(cudaMemcpy(out,d_out,grid_bev_size,cudaMemcpyDeviceToHost));

    CHECK(cudaEventRecord(stop));
    CHECK(cudaEventSynchronize(stop));

    float elapsed_time;
    CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
    printf("time cost: %.3f(ms)\n", elapsed_time);

    CHECK(cudaFree(d_points));
    CHECK(cudaFree(d_out));
    CHECK(cudaFree(d_hash_list));
    CHECK(cudaFree(d_grid_first_entry));
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));
}
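A small note on the init_* helpers used above, whose implementations are not listed in this post: for the two flat buffers, plain cudaMemset would do the same job, since filling every byte with 0xFF makes each int equal to -1 and filling with 0x00 zeroes the pixels.

    CHECK(cudaMemset(d_grid_first_entry, 0xFF, sizeof(int) * grid_bev_size));   // every int becomes -1
    CHECK(cudaMemset(d_out, 0x00, sizeof(unsigned char) * grid_bev_size));      // every pixel starts at 0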

[test]

recv points:58900 
time cost: 0.470(ms)
recv points:59541 
time cost: 0.502(ms)
recv points:58875 
time cost: 0.822(ms)
recv points:58856 
time cost: 0.786(ms)
recv points:59224 
time cost: 0.819(ms)
recv points:59459 
time cost: 0.952(ms)
recv points:58728 
time cost: 0.504(ms)
recv points:59197 
time cost: 0.503(ms)
recv points:59404 
time cost: 0.844(ms)

Of course, this test is not entirely fair: the CUDA time measured here covers the kernels and the host-device copies, but not the device memory allocation and buffer initialization that precede them.
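For a fairer comparison, the CUDA path could also be timed end-to-end on the host side, the same way the CPU path is timed in Test(); a minimal sketch, reusing the variables from Test():

    auto t1 = std::chrono::steady_clock::now();
    cuda_bev(data, num_points, gray.data, cloud_range, voxel_size, grid_size, in_cols);
    auto t2 = std::chrono::steady_clock::now();
    std::cout << "end_to_end: " << std::chrono::duration<double, std::milli>(t2 - t1).count() << " ms" << std::endl;

This would include the cudaMalloc, cudaMemcpy and cleanup inside cuda_bev, which is what the CPU version is effectively being compared against.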
 

Topics: Computer Vision Deep Learning Object Detection CUDA