Research Article

Hybrid MPI and CUDA Parallelization for CFD Applications on Multi-GPU HPC Clusters

Algorithm 4

Nonblocking communication mode algorithm.
(1)  if device_count > 1 then
(2)    cudaMemcpyAsync(h_a, d_a, sizeof(float)*n, cudaMemcpyDeviceToHost, stream[0]);
(3)    MPI_Isend(buf, count, datatype, dest, tag, MPI_COMM_WORLD, &request[0]);
(4)    // Primitive_Variables_Exchange
(5)    Boundary_Processing_GPU<<<Block_size, Thread_size, 0, stream[1]>>>();
(6)    Time_Step_GPU<<<Block_size, Thread_size, 0, stream[1]>>>();
(7)    // Grad_Primitive_Variables_Exchange
(8)    Convective_Flux_GPU<<<Block_size, Thread_size, 0, stream[1]>>>();
(9)    MPI_Irecv(buf, count, datatype, source, tag, MPI_COMM_WORLD, &request[1]);
(10)   MPI_Waitall(2, request, MPI_STATUSES_IGNORE);
(11)   cudaMemcpyAsync(d_a, h_a, sizeof(float)*n, cudaMemcpyHostToDevice, stream[0]);
(12) end if
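
To make the overlap pattern of Algorithm 4 concrete, the following is a minimal compilable sketch in CUDA C with MPI. The kernel interior_work, the buffer length N, and the ring-exchange partners are hypothetical placeholders for the solver's actual kernels (Boundary_Processing_GPU, Time_Step_GPU, Convective_Flux_GPU) and halo layout; the sketch also adds a cudaStreamSynchronize before MPI_Isend, an assumption made here because MPI must not read the host buffer until the asynchronous device-to-host copy has completed.

#include <mpi.h>
#include <cuda_runtime.h>
#include <stdio.h>

#define N 1024  /* hypothetical halo-buffer length per exchange */

/* Placeholder standing in for the solver's interior kernels in steps (5)-(8). */
__global__ void interior_work(float *d_field, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) d_field[i] *= 0.5f;  /* stand-in for real flux/time-step math */
}

int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    int dest = (rank + 1) % size, source = (rank - 1 + size) % size;

    float *d_field, *h_send, *h_recv;
    cudaMalloc(&d_field, N * sizeof(float));
    cudaMallocHost(&h_send, N * sizeof(float));  /* pinned host memory so the */
    cudaMallocHost(&h_recv, N * sizeof(float));  /* async copies truly overlap */
    cudaMemset(d_field, 0, N * sizeof(float));

    cudaStream_t stream[2];
    cudaStreamCreate(&stream[0]);  /* communication (staging) stream */
    cudaStreamCreate(&stream[1]);  /* computation stream */

    /* (2) Stage halo data on the host; synchronize before MPI touches it. */
    cudaMemcpyAsync(h_send, d_field, N * sizeof(float),
                    cudaMemcpyDeviceToHost, stream[0]);
    cudaStreamSynchronize(stream[0]);

    /* (3) Post the nonblocking send. */
    MPI_Request request[2];
    MPI_Isend(h_send, N, MPI_FLOAT, dest, 0, MPI_COMM_WORLD, &request[0]);

    /* (5)-(8) Interior kernels on stream[1] overlap the MPI traffic. */
    interior_work<<<(N + 255) / 256, 256, 0, stream[1]>>>(d_field, N);

    /* (9) Post the nonblocking receive, (10) wait for both transfers. */
    MPI_Irecv(h_recv, N, MPI_FLOAT, source, 0, MPI_COMM_WORLD, &request[1]);
    MPI_Waitall(2, request, MPI_STATUSES_IGNORE);

    /* (11) Push the received halo back to the device. */
    cudaMemcpyAsync(d_field, h_recv, N * sizeof(float),
                    cudaMemcpyHostToDevice, stream[0]);
    cudaDeviceSynchronize();

    cudaFree(d_field);
    cudaFreeHost(h_send);
    cudaFreeHost(h_recv);
    cudaStreamDestroy(stream[0]);
    cudaStreamDestroy(stream[1]);
    MPI_Finalize();
    return 0;
}

As in Algorithm 4, the receive is posted only after the kernel launches; because launches on stream[1] return immediately to the host, the kernels still execute concurrently with both transfers. Pinned (page-locked) host buffers are what allow cudaMemcpyAsync on stream[0] to proceed without blocking the computation stream.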