Research Article

Hybrid MPI and CUDA Parallelization for CFD Applications on Multi-GPU HPC Clusters

Algorithm 2

Asynchronous concurrent execution algorithm of CFD on the GPU.
(1) cudaStreamCreate(&stream[j]);  // for each stream j = 0, ..., 3
(2) Boundary_Processing_GPU<<<Block_size, Thread_size, 0, stream[0]>>>();
(3) Time_Step_GPU<<<Block_size, Thread_size, 0, stream[1]>>>();
(4) Grad_Initial<<<Block_size, Thread_size, 0, stream[2]>>>();
(5) RHS_Initial<<<Block_size, Thread_size, 0, stream[3]>>>();
(6) cudaDeviceSynchronize();
(7) Grad_Primitive_Variables_GPU<<<Block_size, Thread_size, 0, stream[0]>>>();
(8) Convective_Flux_GPU<<<Block_size, Thread_size, 0, stream[1]>>>();
(9) cudaDeviceSynchronize();
(10) Viscous_Flux_GPU<<<Block_size, Thread_size, 0, stream[0]>>>();
(11) RHS_GPU<<<Block_size, Thread_size, 0, stream[0]>>>();
(12) Primitive_Variables_Update_GPU<<<Block_size, Thread_size, 0, stream[0]>>>();
(13) cudaDeviceSynchronize();
(14) cudaStreamDestroy(stream[j]);  // for each stream j = 0, ..., 3