() ⋯ |
// Build one CUDA array and one texture object per DFA (num_of_stts entries).
// Each CUDA array is produced from that DFA's matrix, width and height.
// cuda_arrays is an array of cudaArray pointers, hence the double pointer.
cudaArray **cuda_arrays = (cudaArray**) malloc (num_of_stts * sizeof(cudaArray*));
cudaTextureObject_t *texObj = (cudaTextureObject_t*) malloc (num_of_stts * sizeof(cudaTextureObject_t));
for (int i = 0; i < num_of_stts; i++) {
    cuda_arrays[i] = generate_cuda_array (get_dfa_matrix(i), get_dfa_width(i), get_dfa_height(i));
    // Named texObj so it agrees with the kernel-launch loop, which indexes texObj[i].
    texObj[i] = generate_texture_objects (cuda_arrays[i]);
}
() |
() ⋯ |
// Create multiple CUDA streams: allocate an array of nstreams handles on the
// host heap and initialise each one with cudaStreamCreate.
cudaStream_t *streams = (cudaStream_t*) malloc (nstreams * sizeof(cudaStream_t));
for (int i = 0; i < nstreams; i++) {
    cudaStreamCreate (&streams[i]);
}
() |
// Copy the input to GPU memory; each stream copies one segment.
// Segment i covers [i*input_len/nstreams, (i+1)*input_len/nstreams), so the
// segments tile the whole buffer even when input_len is not a multiple of
// nstreams (the leftover bytes are spread over the segments, not dropped).
// NOTE(review): the copies only overlap with other work if h_input is
// page-locked; its allocation is not visible here - confirm.
for (int i = 0; i < nstreams; i++) {
    // Widen to long before multiplying so i * input_len cannot overflow int.
    long in_begin = (long) i * input_len / nstreams;
    long in_end   = (long) (i + 1) * input_len / nstreams;
    cudaMemcpyAsync (d_input + in_begin, h_input + in_begin,
                     (in_end - in_begin) * sizeof(char),
                     cudaMemcpyHostToDevice, streams[i]);
}
() |
// Each stream processes the input data with its own DFA (texObj[i]).
// Launch i runs the matching kernel on streams[i] with sm_size bytes of
// dynamic shared memory.
// NOTE(review): the texture-object array is sized by num_of_stts while this
// loop runs to nstreams - confirm nstreams <= num_of_stts.
// NOTE(review): every launch is given the whole d_input / input_len, but each
// stream's own host-to-device copy covers only one segment - confirm the
// kernel cannot read segments still being copied on other streams.
for (int i = 0; i < nstreams; i++) {
    matching <<<blocks, threads, sm_size, streams[i]>>> (texObj[i], d_input, input_len,
                                                         pattern_max_len, d_output, output_len);
}
() |
// Copy results back to the host; each stream copies one segment.
// Segment i covers [i*output_len/nstreams, (i+1)*output_len/nstreams) in
// units of int, so the segments tile the whole output buffer even when
// output_len is not a multiple of nstreams.
for (int i = 0; i < nstreams; i++) {
    // Widen to long before multiplying so i * output_len cannot overflow int.
    long out_begin = (long) i * output_len / nstreams;
    long out_end   = (long) (i + 1) * output_len / nstreams;
    // Pass streams[i] explicitly: without the stream argument the copy is
    // queued on the default stream rather than on this segment's stream.
    cudaMemcpyAsync (h_output + out_begin, d_output + out_begin,
                     (out_end - out_begin) * sizeof(int),
                     cudaMemcpyDeviceToHost, streams[i]);
}
() |
// Tear down the streams. Each stream is drained first so that all work
// queued on it has finished before the host continues past this point;
// cudaStreamDestroy alone returns without waiting for pending work.
// NOTE(review): if the code that follows already synchronises the device,
// the explicit cudaStreamSynchronize is redundant but harmless.
for (int i = 0; i < nstreams; i++) {
    cudaStreamSynchronize (streams[i]);
    cudaStreamDestroy (streams[i]);
}
() |
() ⋯ |