Scientific Programming

Research Article

Cache Locality-Centric Parallel String Matching on Many-Core Accelerator Chips

Pseudocode using multiple CUDA streams to implement our approach.

() ⋯
() cudaArray **cuda_arrays = (cudaArray **) malloc (num_of_stts * sizeof(cudaArray *));
() cudaTextureObject_t *texObj = (cudaTextureObject_t *) malloc (num_of_stts * sizeof
(cudaTextureObject_t));
() for (int i = 0; i < num_of_stts; i++) {
() cuda_arrays[i] = generate_cuda_array (get_dfa_matrix(i), get_dfa_width(i),
get_dfa_height(i));
() texObj[i] = generate_texture_objects (cuda_arrays[i]);
() }
() ⋯
() cudaStream_t *streams =
() (cudaStream_t *) malloc (nstreams * sizeof(cudaStream_t));
() //create multiple CUDA streams
() for (int i = 0; i < nstreams; i++)
() cudaStreamCreate (&(streams[i]));
()
() //copy data to GPU memory, each stream copies one segment
() for (int i = 0; i < nstreams; i++) {
() long in_offset = i * input_len/nstreams;
() cudaMemcpyAsync (d_input + in_offset, h_input + in_offset, input_len * sizeof
(char)/nstreams, cudaMemcpyHostToDevice, streams[i]);
() }
() //each stream processes input data with each dfa (texObj[i])
() for (int i = 0; i < nstreams; i++)
() matching <<<blocks, threads, sm_size, streams[i]>>>(texObj[i], d_input,
input_len, pattern_max_len, d_output, output_len);
()
() //copy results back to host CPU, each stream copies one segment
() for (int i = 0; i < nstreams; i++) {
() long out_offset = i * output_len/nstreams;
() cudaMemcpyAsync (h_output + out_offset, d_output + out_offset, output_len * sizeof
(int)/nstreams, cudaMemcpyDeviceToHost, streams[i]);
() }
() for (int i = 0; i < nstreams; i++)
() cudaStreamDestroy (streams[i]);
()
() ⋯