| /∗ Allocation of the five shared arrays x, y, D, A, J as in the naive implementation ∗/ |
| // … |
| /∗ Allocation of an additional private x array per thread ∗/ |
| double∗mythread_x_copy = (double∗) malloc(n ∗ sizeof(double)); |
| /∗ Preparation step: create and fill the thread-private arrays of int ∗mythread_num_send_values, int ∗mythread_num_recv_values, int ∗∗mythread_send_value_list, int ∗∗mythread_recv_value_list, double ∗∗mythread_send_buffers. Also, shared_recv_buffers is prepared. ∗/ |
| // … |
| /∗ Communication procedure starts ∗/ |
| int T, k, mb, offset; |
| double∗local_x_ptr = (double∗)(x + MYTHREAD ∗ BLOCKSIZE); |
| for (T = 0; T < THREADS; T++) |
| if (mythread_num_send_values[T] > 0) /∗ pack outgoing messages ∗/ |
| for (k = 0; k < mythread_num_send_values[T]; k++) |
| mythread_send_buffers[T][k] = local_x_ptr[mythread_send_value_list[T][k]]; |
| for (T = 0; T < THREADS; T++) |
| if (mythread_num_send_values[T] > 0) /∗ send out messages ∗/ |
| upc_memput(shared_recv_buffers[T ∗ THREADS + MYTHREAD], mythread_send_buffers[T], mythread_num_send_values[T] ∗ sizeof(double)); |
| upc_barrier; |
| int mythread_nblks = nblks/THREADS + (MYTHREAD < (nblks% THREADS) ?1 : 0); |
| for (mb = 0; mb < mythread_nblks; mb++) /∗ copy own x-blocks ∗/ |
| offset = (mb ∗ THREADS + MYTHREAD) ∗ BLOCKSIZE; |
| memcpy(&mythread_x_copy[offset], (double∗)(x + offset), min(BLOCKSIZE, n-offset) ∗ sizeof(double)); |
| } |
| for (T = 0; T < THREADS; T++) |
| if (mythread_num_recv_values[T] > 0) {/∗ unpack incoming messages ∗/ |
| double ∗local_buffer_ptr = (double∗) shared_recv_buffers[MYTHREAD ∗ THREADS + T]; |
| for (k = 0; k < mythread_num_recv_values[T]; k++) |
| mythread_x_copy[mythread_recv_value_list[T][k]] = local_buffer_ptr[k]; |
| } |
| /∗ Communication procedure ends ∗/ |
| /∗ SpMV: each thread only goes through its designated blocks ∗/ |
| for (mb = 0; mb < mythread_nblks; mb++) { |
| offset = (mb ∗ THREADS + MYTHREAD) ∗ BLOCKSIZE; |
| /∗ casting shared pointers to local pointers ∗/ |
| double∗loc_y = (double∗) (y + offset); |
| double∗loc_D = (double∗) (D + offset); |
| double∗loc_A = (double∗) (A + offset ∗ rnz); |
| int∗loc_J = (int∗) (J + offset ∗ rnz); |
| /∗ computation per block ∗/ |
| for (k = 0; k < min(BLOCKSIZE, n-offset); k++) { |
| double tmp = 0.0; |
| for (int j = 0; j < rnz; j++) |
| tmp += loc_A[k ∗ rnz + j] ∗ mythread_x_copy[loc_J[k ∗ rnz + j]]; |
| loc_y[k] = loc_D[k] ∗ mythread_x_copy[offset + k] + tmp; |
| } |
| } |