Research Article

Performance Optimization and Modeling of Fine-Grained Irregular Communication in UPC

Listing 5

An improved UPC implementation of SpMV by message condensing and consolidation.
/∗ Allocation of the five shared arrays x, y, D, A, J as in the naive implementation ∗/
// …
/∗ Allocation of an additional private x array per thread ∗/
doublemythread_x_copy = (double) malloc(n ∗ sizeof(double));
/∗ Preparation step: create and fill the thread-private arrays of int ∗mythread_num_send_values, int ∗mythread_num_recv_values, int ∗∗mythread_send_value_list, int ∗∗mythread_recv_value_list, double ∗∗mythread_send_buffers. Also, shared_recv_buffers is prepared. ∗/
// …
/∗ Communication procedure starts ∗/
int T, k, mb, offset;
doublelocal_x_ptr = (double)(x + MYTHREAD ∗ BLOCKSIZE);
for (T = 0; T < THREADS; T++)
if (mythread_num_send_values[T] > 0) /∗ pack outgoing messages ∗/
  for (k = 0; k < mythread_num_send_values[T]; k++)
   mythread_send_buffers[T][k] = local_x_ptr[mythread_send_value_list[T][k]];
for (T = 0; T < THREADS; T++)
if (mythread_num_send_values[T] > 0) /∗ send out messages ∗/
  upc_memput(shared_recv_buffers[T ∗ THREADS + MYTHREAD], mythread_send_buffers[T], mythread_num_send_values[T] ∗ sizeof(double));
upc_barrier;
int mythread_nblks = nblks/THREADS + (MYTHREAD < (nblks% THREADS) ?1 : 0);
for (mb = 0; mb < mythread_nblks; mb++) /∗ copy own x-blocks ∗/
 offset = (mb ∗ THREADS + MYTHREAD) ∗ BLOCKSIZE;
 memcpy(&mythread_x_copy[offset], (double)(x + offset), min(BLOCKSIZE, n-offset) ∗ sizeof(double));
}
for (T = 0; T < THREADS; T++)
if (mythread_num_recv_values[T] > 0) {/∗ unpack incoming messages ∗/
  double ∗local_buffer_ptr = (double) shared_recv_buffers[MYTHREAD ∗ THREADS + T];
  for (k = 0; k < mythread_num_recv_values[T]; k++)
   mythread_x_copy[mythread_recv_value_list[T][k]] = local_buffer_ptr[k];
 }
/∗ Communication procedure ends ∗/
/∗ SpMV: each thread only goes through its designated blocks ∗/
for (mb = 0; mb < mythread_nblks; mb++) {
 offset = (mb ∗ THREADS + MYTHREAD) ∗ BLOCKSIZE;
 /∗ casting shared pointers to local pointers ∗/
doubleloc_y = (double) (y + offset);
doubleloc_D = (double) (D + offset);
doubleloc_A = (double) (A + offset ∗ rnz);
intloc_J = (int) (J + offset ∗ rnz);
 /∗ computation per block ∗/
for (k = 0; k < min(BLOCKSIZE, n-offset); k++) {
  double tmp = 0.0;
  for (int j = 0; j < rnz; j++)
   tmp += loc_A[k ∗ rnz + j] ∗ mythread_x_copy[loc_J[k ∗ rnz + j]];
  loc_y[k] = loc_D[k] ∗ mythread_x_copy[offset + k] + tmp;
 }
}