/∗ Allocation of the five shared arrays x, y, D, A, J as in the naive implementation ∗/
// …
/∗ Allocation of an additional private x array per thread ∗/
doublemythread_x_copy = (double) malloc(n ∗ sizeof(double));
/∗ Prep-work: check for each block of x whether it has values needed by MYTHREAD; make a private boolean array “block_is_needed” of length nblks ∗/
// …
/∗ Transport the needed blocks of x into mythread_x_copy ∗/
for (int b = 0; b < nblks; b++)
if (block_is_needed[b])
  upc_memget(&mythread_x_copy[b ∗ BLOCKSIZE],  &x[b ∗ BLOCKSIZE],  min(BLOCKSIZE,  n − b ∗ BLOCKSIZE) ∗ sizeof(double));
/∗ SpMV: each thread only goes through its designated blocks ∗/
int mythread_nblks = nblks/THREADS + (MYTHREAD < (nblks% THREADS) ?1 : 0);
for (int mb = 0; mb < mythread_nblks; mb++) {
int offset = (mb ∗ THREADS + MYTHREAD) ∗ BLOCKSIZE;
 /∗ casting shared pointers to local pointers ∗/
doubleloc_y = (double) (y + offset);
doubleloc_D = (double) (D + offset);
doubleloc_A = (double) (A + offset ∗ rnz);
intloc_J = (int) (J + offset ∗ rnz);
 /∗ computation per block ∗/
for (int k = 0; k < min(BLOCKSIZE, n-offset); k++) {
  double tmp = 0.0;
  for (int j = 0; j < rnz; j++)
   tmp += loc_A[k ∗ rnz + j] ∗ mythread_x_copy[loc_J[k ∗ rnz + j]];
  loc_y[k] = loc_D[k] ∗ mythread_x_copy[offset + k] + tmp;
 }
}
Listing 4: An improved UPC implementation of SpMV by block-wise communication.