Research Article

Hybrid MPI and CUDA Parallelization for CFD Applications on Multi-GPU HPC Clusters

Algorithm 1

Parallel algorithm for CFD on multi-GPU HPC clusters.
(1)MPI_Init (&argc, &argv);
(2)Device_Query ( );
(3)cudaMemcpy (d_a, h_a, sizeof(float) * n, cudaMemcpyHostToDevice);
(4)//Kernel execution start
(5)for i = 0; i < max_step; i++
(6) Boundary_Processing_GPU<<<Block_size, Thread_size>>> ( );
(7) Time_Step_GPU<<<Block_size, Thread_size>>> ();
(8) Primitive_Variables_Exchange ();
(9) Grad_Primitive_Variables_GPU<<<Block_size, Thread_size>>> ( );
(10) Grad_Primitive_Variables_Exchange ( );
(11) Flux_GPU<<<Block_size, Thread_size>>> ( );
(12) Primitive_Variables_Update_GPU<<<Block_size, Thread_size>>> ( );
(13)end for
(14)//Kernel execution end
(15)cudaMemcpy (h_a, d_a, sizeof(float) * n, cudaMemcpyDeviceToHost);
(16)Flow_post-processing ( );
(17)MPI_Finalize ( );