Research Article

Performance Optimization of 3D Lattice Boltzmann Flow Solver on a GPU

Algorithm 3

Two separate CUDA kernels for different phases of LBM.
(1) int i;
(2) for (i = 0; i < timeSteps; i++)
(3) {
(4)     collision_kernel<<<GRID, BLOCK>>>
            (source_grid, temp_grid, xdim, ydim,
            zdim, cell_size, grid_size);
(5)     cudaThreadSynchronize();
(6)     streaming_kernel<<<GRID, BLOCK>>>(temp_grid,
            dest_grid, xdim, ydim, zdim, cell_size,
            grid_size);
(7)     cudaThreadSynchronize();
(8)     swap_grid(source_grid, dest_grid);
(9) }