Research Article

Multi-GPU Support on Single Node Using Directive-Based Programming Model

Algorithm 9: A multi-GPU implementation of matrix multiplication (MM) using the OpenACC extension.
for(d=0; d<num_devices; d++)
{
blocks = n/num_devices;
start = d*blocks;
end = (d+1)*blocks;
#pragma acc data copyin(A[start*n:blocks*n]) \
    copyin(B[0:n*n]) \
    copyout(C[start*n:blocks*n]) \
    deviceid(d)
{
#pragma acc parallel deviceid(d) \
    num_gangs(32) vector_length(32)
{
#pragma acc loop gang
for(i=start; i<end; i++){
#pragma acc loop vector
for(j=0; j<n; j++){
float c = 0.0f;
for(k=0; k<n; k++)
c += A[i*n+k] * B[k*n+j];
C[i*n+j] = c;
}
}
}
}
}
for(d=0; d<num_devices; d++){
#pragma acc wait device(d)
}