OK, let's continue with another CUDA exercise. Among the many approaches to lidar point-cloud target detection, one family works on a depth image obtained by projecting the point cloud from a bird's-eye view (BEV). The top-view projection step itself takes only very simple logic, whether in Python or C++. When the number of points is large, however, CUDA can speed it up considerably. For ease of comparison, the C++ implementation is given first.
[source code]
#include <stdio.h>
#include <cmath>
#include <vector>
#include "bev.h"

int scale_to_255(float z, float z_min, float z_max) {
    return int(((z - z_min) / (z_max - z_min)) * 255);
}

void bev(const float* points, int num_points, unsigned char* out,
         const std::vector<float>& cloud_range,
         const std::vector<float>& voxel_size,
         const std::vector<int>& grid_size, int in_cols) {
    for (int i = 0; i < num_points; i++) {
        const float* point = points + i * in_cols;
        int x = std::floor((point[0] - cloud_range[0]) / voxel_size[0]);
        int y = std::floor((point[1] - cloud_range[1]) / voxel_size[1]);
        int z = std::floor((point[2] - cloud_range[2]) / voxel_size[2]);
        if (x < 0 || y < 0 || z < 0 || x >= grid_size[0] || y >= grid_size[1] || z >= grid_size[2]) {
            continue;
        }
        //TODO: add range filter here
        // flip the axes for the image orientation; the "-1" keeps row/col
        // inside [0, grid_size) (x == 0 would otherwise index one past the
        // last row, writing out of bounds)
        int row = grid_size[0] - 1 - x;
        int col = grid_size[1] - 1 - y;
        int tmp = scale_to_255(point[2], cloud_range[2], cloud_range[5]);
        int offset = row * grid_size[1] + col;
        out[offset] = tmp > out[offset] ? tmp : out[offset];
        //printf("row:%d,col:%d,offset:%d\n", row, col, offset);
    }
}
Parameter Description:
cloud_range: the point cloud range, [xmin, ymin, zmin, xmax, ymax, zmax]. These six values frame an axis-aligned 3D box; only points inside it are considered.
voxel_size: the voxel size, [dx, dy, dz]. Since the top-view projection goes from 3D to 2D, the z dimension is collapsed to a single voxel; in practice dz is set as large as (zmax - zmin).
grid_size: the grid size, [grid_x, grid_y, grid_z], which can be computed from the two parameters above.
void Test(const pcl::PointCloud<pcl::PointXYZ>::Ptr cloud) {
    printf("recv points:%d \n", cloud->width);
    // the original snippet elides this step: flatten the PCL cloud into a
    // contiguous float array; pcl::PointXYZ is stored as 4 floats
    // (x, y, z plus one float of padding), hence in_cols = 4
    int num_points = static_cast<int>(cloud->size());
    int in_cols = 4;
    float* data = (float*)malloc(sizeof(float) * in_cols * num_points);
    memcpy(data, cloud->points.data(), sizeof(float) * in_cols * num_points);
    std::vector<int> grid_size(3);
    std::vector<float> voxel_size = {0.09765, 0.09765, 20};
    std::vector<float> cloud_range = {0.0, -50., -5., 100., 50., 15.};
    for (int i = 0; i < 3; i++) {
        grid_size[i] = round((cloud_range[3 + i] - cloud_range[i]) / voxel_size[i]);
    }
    //printf("grid_size[0]:%d,grid_size[1]:%d\n", grid_size[0], grid_size[1]);
    cv::Mat gray(grid_size[0], grid_size[1], CV_8UC1, cv::Scalar(0));
    if (0) {
        //write cuda bev here
    } else {
        std::chrono::steady_clock::time_point t1 = std::chrono::steady_clock::now();
        bev(data, num_points, gray.data, cloud_range, voxel_size, grid_size, in_cols);
        std::chrono::steady_clock::time_point t2 = std::chrono::steady_clock::now();
        std::chrono::duration<double, std::ratio<1, 1000>> time_span =
            std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1000>>>(t2 - t1);
        std::cout << "time_cost: " << time_span.count() << " ms" << std::endl;
    }
    {
        cv::imshow("preview", gray);
        auto key = cv::waitKey(5);
        if (key == 'q' || key == 'Q') {
            exit(0);
        }
    }
    free(data);
}
With the code above you can quickly test and display the projected depth image. I set a 100 m x 100 m x 20 m region; with the voxel_size given, the computed grid_size comes out to [1024, 1024, 1] (e.g. 100 / 0.09765 ≈ 1024).
[test]
recv points:58900 time_cost: 1.00689 ms
recv points:57131 time_cost: 2.32245 ms
recv points:56297 time_cost: 0.959922 ms
recv points:56186 time_cost: 2.5897 ms
recv points:56627 time_cost: 2.41959 ms
Obviously, the loop in this C++ implementation can be replaced by CUDA's parallel execution. One point deserves attention: in the top-view projection, the height of a point is encoded in the gray value of the image, and when multiple points project onto the same pixel, only the highest point should be encoded. This is easy to handle in the sequential loop, where a simple comparison guarantees the highest point wins. In CUDA, however, every point computes its projected coordinates and gray value independently, so to make sure each pixel's gray value encodes the highest point, I split the projection into two steps (a simpler single-kernel alternative is sketched after the step-2 code):
Step 1: determine only the projection position of each point and compute its gray value; points landing on the same pixel are chained into a per-pixel linked list (each grid cell stores the index of its head entry, pushed with atomicCAS).
[core code]
__device__ int scale_to_255(float z, float z_min, float z_max) {
    return int(((z - z_min) / (z_max - z_min)) * 255);
}

__global__ void InitializeMapEntries(const float* __restrict__ points, const int num_points,
                                     float range_min_x, float range_min_y, float range_min_z,
                                     float range_max_z,
                                     float voxel_size_x, float voxel_size_y, float voxel_size_z,
                                     int grid_x, int grid_y, int grid_z,
                                     int* grid_first_entry, HashEntry* entry_list, int cols) {
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    int stride = gridDim.x * blockDim.x;
    for (int i = idx; i < num_points; i += stride) {
        const float* cur_point = points + i * cols;
        int x = __float2int_rd((cur_point[0] - range_min_x) / voxel_size_x);
        int y = __float2int_rd((cur_point[1] - range_min_y) / voxel_size_y);
        int z = __float2int_rd((cur_point[2] - range_min_z) / voxel_size_z);
        if (x < 0 || y < 0 || z < 0 || x >= grid_x || y >= grid_y || z >= grid_z) continue;
        // same axis flip as in the C++ version (the "-1" keeps the indices in range)
        int row = grid_x - 1 - x;
        int col = grid_y - 1 - y;
        HashEntry* entry = entry_list + i;
        entry->gray_var = scale_to_255(cur_point[2], range_min_z, range_max_z);
        int grid_idx = row * grid_y + col;
        // atomically push this entry onto the head of the pixel's linked list:
        // grid_first_entry[grid_idx] always holds the index of the newest entry
        int* address = grid_first_entry + grid_idx;
        int new_var = i;
        int cur_var = *address;
        int assumed;
        do {
            assumed = cur_var;
            cur_var = atomicCAS(address, assumed, new_var);
        } while (assumed != cur_var);
        entry->next_id = cur_var;
    }
}
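The HashEntry type is declared elsewhere (bev.h is not shown). A minimal definition consistent with how the kernels use it would be the following; the field names come from the code above, everything else is my reconstruction:

// reconstructed from usage in the kernels (an assumption, not the original bev.h)
struct HashEntry {
    int gray_var; // gray value of this point, from scale_to_255()
    int next_id;  // index of the next entry on the same pixel; -1 marks the end
};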
Step 2: at each valid pixel, compare the heights of all points that landed there and take the gray value of the highest point as the pixel's gray value.
[core code]
__global__ void ExtractValidOutPixelKernel(unsigned char* d_out, HashEntry* entry_list,
                                           int* grid_first_entry, int grid_bev_size) {
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    int stride = gridDim.x * blockDim.x;
    for (int i = idx; i < grid_bev_size; i += stride) {
        int entry_idx = grid_first_entry[i];
        if (entry_idx >= 0) {
            // walk this pixel's linked list and keep the maximum gray value
            HashEntry* entry = entry_list + entry_idx;
            int max_gray = entry->gray_var;
            int next_idx = entry->next_id;
            while (next_idx >= 0) {
                HashEntry* entry_next = entry_list + next_idx;
                next_idx = entry_next->next_id;
                max_gray = entry_next->gray_var > max_gray ? entry_next->gray_var : max_gray;
            }
            d_out[i] = max_gray;
        }
    }
}

void cuda_bev(const float* points, int num_points, unsigned char* out,
              const std::vector<float>& cloud_range,
              const std::vector<float>& voxel_size,
              const std::vector<int>& grid_size, int in_cols) {
    float* d_points;
    unsigned char* d_out;
    HashEntry* d_hash_list;
    int* d_grid_first_entry;
    int grid_bev_size = grid_size[0] * grid_size[1];
    CHECK(cudaMalloc((void**)&d_points, sizeof(float) * in_cols * num_points));
    CHECK(cudaMalloc((void**)&d_out, sizeof(unsigned char) * grid_bev_size));
    CHECK(cudaMalloc((void**)&d_hash_list, sizeof(HashEntry) * num_points));
    CHECK(cudaMalloc((void**)&d_grid_first_entry, sizeof(int) * grid_bev_size));
    init_int_(d_grid_first_entry, -1, grid_bev_size);
    init_uchar_(d_out, 0, grid_bev_size);
    init_hash_list_(d_hash_list, num_points);

    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));
    CHECK(cudaEventRecord(start));
    cudaEventQuery(start);

    CHECK(cudaMemcpy((void*)d_points, points, sizeof(float) * in_cols * num_points,
                     cudaMemcpyHostToDevice));

    int blockSize = 512;
    int minGridSize = 1;
    int num_thread = num_points;
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, InitializeMapEntries, 0, num_thread);
    minGridSize = std::min(minGridSize, DivUp(num_thread, blockSize));
    printf("%s:%d gridSize:%d,blockSize:%d\n", __FUNCTION__, __LINE__, minGridSize, blockSize);
    InitializeMapEntries<<<minGridSize, blockSize>>>(d_points, num_points,
        cloud_range[0], cloud_range[1], cloud_range[2], cloud_range[5],
        voxel_size[0], voxel_size[1], voxel_size[2],
        grid_size[0], grid_size[1], grid_size[2],
        d_grid_first_entry, d_hash_list, in_cols);

    num_thread = grid_bev_size;
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, ExtractValidOutPixelKernel, 0, num_thread);
    minGridSize = std::min(minGridSize, DivUp(num_thread, blockSize));
    printf("%s:%d gridSize:%d,blockSize:%d\n", __FUNCTION__, __LINE__, minGridSize, blockSize);
    ExtractValidOutPixelKernel<<<minGridSize, blockSize>>>(d_out, d_hash_list,
        d_grid_first_entry, grid_bev_size);

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch
    CHECK(cudaDeviceSynchronize());
    CHECK(cudaMemcpy(out, d_out, grid_bev_size, cudaMemcpyDeviceToHost));
    CHECK(cudaEventRecord(stop));
    CHECK(cudaEventSynchronize(stop));
    float elapsed_time;
    CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
    printf("time cost: %.3f(ms)\n", elapsed_time);

    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));
    CHECK(cudaFree(d_points));
    CHECK(cudaFree(d_out));
    CHECK(cudaFree(d_hash_list));
    CHECK(cudaFree(d_grid_first_entry));
}
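cuda_bev also leans on a few helpers whose definitions are not shown (CHECK, DivUp, and the init_* fills). A plausible reconstruction, assuming the init_* helpers are thin wrappers around cudaMemset and a small fill kernel:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// hypothetical reconstructions of the helpers used by cuda_bev (assumptions)

#define CHECK(call)                                                   \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error %s at %s:%d\n",               \
                    cudaGetErrorString(err_), __FILE__, __LINE__);    \
            exit(1);                                                  \
        }                                                             \
    } while (0)

inline int DivUp(int a, int b) { return (a + b - 1) / b; }

// only 0 and -1 are used above; both have uniform byte patterns,
// so the byte-wise cudaMemset is sufficient here
void init_int_(int* d_ptr, int value, int n) {
    CHECK(cudaMemset(d_ptr, value & 0xFF, sizeof(int) * n));
}

void init_uchar_(unsigned char* d_ptr, unsigned char value, int n) {
    CHECK(cudaMemset(d_ptr, value, n));
}

__global__ void InitHashListKernel(HashEntry* list, int n) {
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i < n) {
        list[i].gray_var = 0;
        list[i].next_id = -1; // end-of-list marker
    }
}

void init_hash_list_(HashEntry* d_list, int n) {
    InitHashListKernel<<<DivUp(n, 256), 256>>>(d_list, n);
}

As an aside, the per-pixel maximum could also be computed in a single kernel with atomicMax on a temporary int buffer, trading the linked list for atomic contention on crowded pixels. A minimal sketch (not this post's method), reusing the projection math from step 1:

// single-kernel alternative (sketch): atomicMax directly on an int image,
// which would then be compacted to unsigned char afterwards
__global__ void BevAtomicMaxKernel(const float* __restrict__ points, int num_points,
                                   float range_min_x, float range_min_y, float range_min_z,
                                   float range_max_z,
                                   float voxel_size_x, float voxel_size_y, float voxel_size_z,
                                   int grid_x, int grid_y, int grid_z,
                                   int* d_out_int, int cols) {
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    int stride = gridDim.x * blockDim.x;
    for (int i = idx; i < num_points; i += stride) {
        const float* p = points + i * cols;
        int x = __float2int_rd((p[0] - range_min_x) / voxel_size_x);
        int y = __float2int_rd((p[1] - range_min_y) / voxel_size_y);
        int z = __float2int_rd((p[2] - range_min_z) / voxel_size_z);
        if (x < 0 || y < 0 || z < 0 || x >= grid_x || y >= grid_y || z >= grid_z) continue;
        int row = grid_x - 1 - x;
        int col = grid_y - 1 - y;
        int gray = scale_to_255(p[2], range_min_z, range_max_z);
        atomicMax(d_out_int + row * grid_y + col, gray);
    }
}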
[test]
recv points:58900 time cost: 0.470(ms)
recv points:59541 time cost: 0.502(ms)
recv points:58875 time cost: 0.822(ms)
recv points:58856 time cost: 0.786(ms)
recv points:59224 time cost: 0.819(ms)
recv points:59459 time cost: 0.952(ms)
recv points:58728 time cost: 0.504(ms)
recv points:59197 time cost: 0.503(ms)
recv points:59404 time cost: 0.844(ms)
Of course, this test has its flaws. The CUDA timing above brackets the host-to-device copy, the two kernels, and the device-to-host copy, but not the device memory allocations, which in a real pipeline would be done once and reused across frames.
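For a fairer comparison, the copies and the kernels could also be timed separately. A small event-based stopwatch, as a sketch (the class name and usage are mine, not from the original code):

#include <cuda_runtime.h>

// minimal event-based stopwatch; wrap just the kernel launches (or just the
// copies) to report each stage separately
struct GpuTimer {
    cudaEvent_t start_, stop_;
    GpuTimer() { cudaEventCreate(&start_); cudaEventCreate(&stop_); }
    ~GpuTimer() { cudaEventDestroy(start_); cudaEventDestroy(stop_); }
    void Start() { cudaEventRecord(start_); }
    float StopMs() {
        cudaEventRecord(stop_);
        cudaEventSynchronize(stop_);
        float ms = 0.f;
        cudaEventElapsedTime(&ms, start_, stop_);
        return ms;
    }
};

Used as, e.g., `GpuTimer t; t.Start(); InitializeMapEntries<<<g1, b1>>>(/* ... */); ExtractValidOutPixelKernel<<<g2, b2>>>(/* ... */); printf("kernels: %.3f ms\n", t.StopMs());`, this would report the kernel time alone, separate from the transfer time.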