Research Article

Cache Locality-Centric Parallel String Matching on Many-Core Accelerator Chips

Pseudocode 2

Pseudocode using multiple CUDA streams to implement our approach.
()     
()     cudaArray **cuda_arrays =  (cudaArray**) malloc  (num_of_stts  *  sizeof(cudaArray*));
()     cudaTextureObject_t* textObj = (cudaTextureObject_t*)  malloc (num_of_stts * sizeof
         (cudaTextureObject_t));
()     for  (int i = 0; i < num_of_stts;  i++)  {
()      cuda_arrays[i] = generate_cuda_array  (get_dfa_matrix(i),  get_dfa_width(i),
          get_dfa_height(i));
()      textObj[i] = generate_texture_objects  (cuda_arrays[i]);
()     }
()     
()     cudaStream_t  *streams =
()  (cudaStream_t*) malloc  (nstreams * sizeof(cudaStream_t));
()  //create multiple CUDA streams
()  for  (int i = 0; i < nstreams;  i++)  {
()   cudaStreamCreate  (&(streams[i]));
()  }
()  //copy data to GPU memory, each stream copies one segment
()  for  (int i = 0; i < nstreams; i++)  {
()   long in_offset = i * input_len/nstreams;
()   cudaMemcpyAsync  (d_input + in_offset,  h_input + in_offset,  input_len * sizeof
          (char)/nstreams,  cudaMemcpyHostToDevice,  streams[i]);
()  }
()  //each stream processes the input data with its own DFA (textObj[i])
()  for (int i = 0; i < nstreams;  i++)  {
()   matching <<<blocks,  threads,  sm_size,  streams[i]>>>(textObj[i],  d_input,
         input_len,  pattern_max_len,  d_output,  output_len);
()  }
()  //copy results back to host CPU, each stream copies one segment
()  for  (int i = 0; i < nstreams;  i++)  {
()   long out_offset = i * output_len/nstreams;
()   cudaMemcpyAsync  (h_output + out_offset,  d_output + out_offset,  output_len * sizeof
        (int)/nstreams,  cudaMemcpyDeviceToHost,  streams[i]);
()  }
()  for  (int i = 0; i < nstreams;  i++)  {
()   cudaStreamDestroy  (streams[i]);
()  }
()