/*
 * Microbenchmark: time individual reads of single elements of a shared array,
 * where each thread reads the elements named by its private index list.
 */

/* Number of BLOCKSIZE-element blocks needed to hold n elements, rounded up.
 * The conditional is parenthesized on purpose: `a + b ? 1 : 0` parses as
 * `(a + b) ? 1 : 0`, which would make nblks either 0 or 1. */
int nblks = n / BLOCKSIZE + ((n % BLOCKSIZE) ? 1 : 0);

/* This thread's block count: an even share of nblks, plus one extra block
 * for each of the first (nblks % THREADS) threads. */
int mythread_nblks = nblks / THREADS + ((MYTHREAD < (nblks % THREADS)) ? 1 : 0);

/* Shared array of nblks blocks, each holding BLOCKSIZE doubles. */
shared [BLOCKSIZE] double *A = upc_all_alloc(nblks, BLOCKSIZE * sizeof(double));

/* Destination of each timed read; its value is not used afterwards.
 * NOTE(review): confirm the compiler does not elide the reads because tmp is
 * otherwise unused. */
double tmp;

/* Private list of global indices into A, one per element this thread reads.
 * The element count is computed in size_t so the byte count cannot overflow
 * int. */
size_t mythread_nelems = (size_t)mythread_nblks * BLOCKSIZE;
int *mythread_indices = malloc(mythread_nelems * sizeof *mythread_indices);
if (mythread_indices == NULL && mythread_nelems > 0) {
  /* malloc(0) may legitimately return NULL, so only a non-empty request
   * counts as a failure. */
  upc_global_exit(1);
}

/* let array "mythread_indices" contain random global indices with affinity to "remote threads" */
randomize(mythread_indices, mythread_nblks * BLOCKSIZE);

/* start timing ... */
/* mb counts blocks and k counts elements within a block; i runs continuously
 * over mythread_indices across both loops. */
for (int mb = 0, i = 0; mb < mythread_nblks; mb++) {
  for (int k = 0; k < BLOCKSIZE; k++, i++) {
    tmp = A[mythread_indices[i]];
  }
}
/* stop timing ... */
Listing 6: A UPC microbenchmark for measuring the latency of individual remote-memory transfers.