| /∗ Allocation of the five shared arrays x, y, D, A, J as in the naive implementation ∗/ |
| // … |
| /∗ Allocation of an additional private x array per thread ∗/ |
| double∗mythread_x_copy = (double∗) malloc(n ∗ sizeof(double)); |
| /∗ Prep-work: check for each block of x whether it has values needed by MYTHREAD; make a private boolean array “block_is_needed” of length nblks ∗/ |
| // … |
| /∗ Transport the needed blocks of x into mythread_x_copy ∗/ |
| for (int b = 0; b < nblks; b++) |
| if (block_is_needed[b]) |
| upc_memget(&mythread_x_copy[b ∗ BLOCKSIZE], &x[b ∗ BLOCKSIZE], min(BLOCKSIZE, n − b ∗ BLOCKSIZE) ∗ sizeof(double)); |
| /∗ SpMV: each thread only goes through its designated blocks ∗/ |
| int mythread_nblks = nblks/THREADS + (MYTHREAD < (nblks% THREADS) ?1 : 0); |
| for (int mb = 0; mb < mythread_nblks; mb++) { |
| int offset = (mb ∗ THREADS + MYTHREAD) ∗ BLOCKSIZE; |
| /∗ casting shared pointers to local pointers ∗/ |
| double∗loc_y = (double∗) (y + offset); |
| double∗loc_D = (double∗) (D + offset); |
| double∗loc_A = (double∗) (A + offset ∗ rnz); |
| int∗loc_J = (int∗) (J + offset ∗ rnz); |
| /∗ computation per block ∗/ |
| for (int k = 0; k < min(BLOCKSIZE, n-offset); k++) { |
| double tmp = 0.0; |
| for (int j = 0; j < rnz; j++) |
| tmp += loc_A[k ∗ rnz + j] ∗ mythread_x_copy[loc_J[k ∗ rnz + j]]; |
| loc_y[k] = loc_D[k] ∗ mythread_x_copy[offset + k] + tmp; |
| } |
| } |