Research Article
GPU Preconditioning for Block Linear Systems Using Block Incomplete Sparse Approximate Inverses
Algorithm 2
GPU kernel for solving LGSBTS.
(1) | __shared__ double sm[warps_per_block][warp_size]; | (2) | __shared__ double inv[warps_per_block][bsz][bsz]; | (3) | __shared__ double ssol[warps_per_block][bsz][bsz]; | (4) | tid = threadIdx.x + blockIdx.x blockDim.x; | (5) | wpid = threadIdx.x/warp_size; | (6) | gwpid = tid/warp_size; | (7) | rpos = threadIdx.x%warp size; | (8) | range = ; | (9) | if gwpid < nbr then | (10) | if rpos < range then | (11) | r = rpos%bsz; | (12) | c = (rpos/bsz)%bsz; | (13) | InvStIdx = NLColPtr[gwpid]; | (14) | InvEdIdx = NLColPtr[gwpid + 1]; | (15) | for idx = InvStIdx to InvEdIdx − 1 do | (16) | irow = NLRowVal[idx]; | (17) | if rpos < then | (18) | tmpidx = LRowPtr[irow + 1] − 1; | (19) | ssol[wpid][r][c] = L[tmpidx + rpos]; | (20) | inv[wpid][r][c] = 1.0/det ; | (21) | sm[wpid][rpos] = Rhs[idx + rpos]; | (22) | ssol[wpid][r][c] = ; | (23) | NL[idx + rpos] = ssol[wpid][r][c]; | (24) | end if | (25) | myidx = idx + 1 + ; | (26) | triEdIdx = triStIdx + (n − icol) ; | (27) | solIdx = solStIdx + + rpos; | (28) | while myidx < InvEdIdx do | (29) | sm[wpid][rpos] = 0.0; | (30) | myrow = NLRowVal[myidx]; | (31) | tmpidx = LRowPtr[myrow + 1] − 1; | (32) | while tmpidx LRowPtr[myrow] && LColVal[tmpidx] == irow do | (33) | tmpidx−−; | (34) | end while | (35) | if tmpidx LRowPtr[myrow] && LColVal[tmpidx] == irow then | (36) | s[wpid][lane] = L[tmpidx + rpos%]; | (37) | end if | (38) | Rhs[myidx + lane%]− = ; | (39) | myidx+ = warp_size/; | (40) | end while | (41) | end for | (42) | end if | (43) | end if |
|