Research Article

Optimized Data Transfers Based on the OpenCL Event Management Mechanism

Listing 2

A Himeno benchmark code with overlapping communication and computation.
()  cl_command_queue cmd1, cmd2;
()  cl_mem p_new, p_old, p_tmp;
()
()  for(int i(0);i<N;++i){
()   //swap pointers
()   p_tmp = p_new; p_new = p_old; p_old = p_tmp;
()   if(  rank%2 == 0) {
()    //  the upper portion is calculated
()    jacobi_kernel_even_A(cmd1,p_new,…);
()  //  the bottom plane is updated
()  MPI_Irecv(…);
()  clEnqueueReadBuffer(cmd2,p_old,CL_FALSE,…);
()  clFinish(cmd2);//blocking
()  MPI_Send(…);  //blocking
()  MPI_Wait(…);  //blocking
()  clEnqueueWriteBuffer(cmd2,p_old,CL_FALSE,…);
()  //  the lower portion is calculated
()  jacobi_kernel_even_B(cmd2,p_new,…);
()  //  the top plane is updated
()  MPI_Irecv(…);
()  clEnqueueReadBuffer(cmd1,p_new,CL_FALSE,…);
()  clFinish(cmd1);//blocking
()  MPI_Send(…);  //blocking
()  MPI_Wait(…);  //blocking
()  clEnqueueWriteBuffer(cmd1,p_new,CL_FALSE,…);
() }
() else  {
()  jacobi_kernel_odd_B(cmd1,p_new,…);
()  MPI_Irecv(…);
()  clEnqueueReadBuffer(cmd2,p_old,CL_FALSE,…);
()  clFinish(cmd2);//blocking
()  MPI_Send(…);  //blocking
()  MPI_Wait(…);  //blocking
()  clEnqueueWriteBuffer(cmd2,p_old,CL_FALSE,…);
()  jacobi_kernel_odd_A(cmd2,p_new,…);
()  MPI_Irecv(…);
()  clEnqueueReadBuffer(cmd1,p_new,CL_FALSE,…);
()  clFinish(cmd1); //blocking
()  MPI_Send(…);  //blocking
()  MPI_Wait(…);  //blocking
()  clEnqueueWriteBuffer(cmd1,p_new,CL_FALSE,…);
() }
()  clFinish(cmd1);clFinish(cmd2);
()  /* error calculation */
() }