Research Article
Multi-GPU Support on Single Node Using Directive-Based Programming Model
Algorithm 3
A multi-GPU implementation of matrix multiplication (MM) using the hybrid OpenMP and OpenACC model.
omp_set_num_threads(threads); | #pragma omp parallel | { | int i, j, k; | int id, blocks, start, end; | id = omp_get_thread_num(); | blocks = n/threads; | start = idblocks; | end = (id+1)blocks; | acc_set_device_num(id+1, acc_device_not_host); | #pragma acc data copyin(A[startn:blocksn]) | copyin(B[0:nn]) | copyout(C[startn:blocksn]) | { | #pragma acc parallel num_gangs(32) vector_length(32) | { | #pragma acc loop gang | for(i=start; i<end; i++){ | #pragma acc loop vector | for(j=0; j<n; j++){ | float c = 0.0f; | for(k=0; k<n; k++) | c += A[in+k] B[kn+j]; | C[in+j] = c; | } | } | } | |
|