Research Article

Multi-GPU Support on Single Node Using Directive-Based Programming Model

Algorithm 3

A multi-GPU implementation of MM using hybrid model.
omp_set_num_threads(threads);
#pragma omp parallel
{
int i, j, k;
int id, blocks, start, end;
id = omp_get_thread_num();
blocks = n/threads;
start = id*blocks;
end = (id+1)*blocks;
acc_set_device_num(id+1, acc_device_not_host);
#pragma acc data copyin(A[start*n:blocks*n]) \
copyin(B[0:n*n]) \
copyout(C[start*n:blocks*n])
{
#pragma acc parallel num_gangs(32) vector_length(32)
{
#pragma acc loop gang
for(i=start; i<end; i++){
#pragma acc loop vector
for(j=0; j<n; j++){
float c = 0.0f;
for(k=0; k<n; k++)
c += A[i*n+k] * B[k*n+j];
C[i*n+j] = c;
}
}
}