Research Article
Multi-GPU Support on Single Node Using Directive-Based Programming Model
Algorithm 9
A multi-GPU implementation of matrix multiplication (MM) using the OpenACC extension.
for(d=0; d<num_devices; d++) | { | blocks = n/num_devices; | start = id*blocks; | end = (id+1)*blocks; | #pragma acc data copyin(A[start*n:blocks*n]) | copyin(B[0:n*n]) | copyout(C[start*n:blocks*n]) | deviceid(d) | { | #pragma acc parallel deviceid(d) | num_gangs(32) vector_length(32) | { | #pragma acc loop gang | for(i=start; i<end; i++){ | #pragma acc loop vector | for(j=0; j<n; j++){ | float c = 0.0f; | for(k=0; k<n; k++) | c += A[i*n+k] * B[k*n+j]; | C[i*n+j] = c; | } | } | } | } | for(d=0; d<num_devices; d++){ | #pragma acc wait device(d) | } |
|