Research Article
Hybrid MPI and CUDA Parallelization for CFD Applications on Multi-GPU HPC Clusters
Algorithm 1
Parallel algorithm for CFD on multi-GPU HPC clusters.
(1) | MPI_Init (&argc, &argv); | (2) | Device_Query ( ); | (3) | cudaMemcpy (d_a, h_a, sizeof(float) * n, cudaMemcpyHostToDevice); | (4) | //Kernel execution start | (5) | for i = 0; i < max_step; i++ | (6) | Boundary_Processing_GPU<<<Block_size, Thread_size>>> ( ); | (7) | Time_Step_GPU<<<Block_size, Thread_size>>> ( ); | (8) | Primitive_Variables_Exchange ( ); | (9) | Grad_Primitive_Variables_GPU<<<Block_size, Thread_size>>> ( ); | (10) | Grad_Primitive_Variables_Exchange ( ); | (11) | Flux_GPU<<<Block_size, Thread_size>>> ( ); | (12) | Primitive_Variables_Update_GPU<<<Block_size, Thread_size>>> ( ); | (13) | end for | (14) | //Kernel execution end | (15) | cudaMemcpy (h_a, d_a, sizeof(float) * n, cudaMemcpyDeviceToHost); | (16) | Flow_post-processing ( ); | (17) | MPI_Finalize ( ); |
|