Actual source code: aijcusparseband.cu

/*
  AIJCUSPARSE methods implemented with CUDA kernels. Uses cuSPARSE/Thrust maps from AIJCUSPARSE.
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ > 600 && PETSC_PKG_CUDA_VERSION_GE(11,0,0)
#define AIJBANDUSEGROUPS 1
#endif
#if defined(AIJBANDUSEGROUPS)
#include <cooperative_groups.h>
#endif

#define CHECK_LAUNCH_ERROR()                                                             \
do {                                                                                     \
  /* Check synchronous errors, i.e. pre-launch */                                        \
  cudaError_t err = cudaGetLastError();                                                  \
  if (cudaSuccess != err) {                                                              \
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err));  \
  }                                                                                      \
  /* Check asynchronous errors, i.e. kernel failed (ULF) */                              \
  err = cudaDeviceSynchronize();                                                         \
  if (cudaSuccess != err) {                                                              \
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err));  \
  }                                                                                      \
} while (0)
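
/* Usage note (a sketch, not part of the original file): invoke CHECK_LAUNCH_ERROR()
   immediately after a <<<...>>> launch; cudaGetLastError() reports synchronous
   launch-configuration errors and cudaDeviceSynchronize() surfaces asynchronous
   faults raised while the kernel ran. The synchronize makes every checked launch
   blocking, which is why the call sites below note "does a sync". */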

/*
  LU BAND factorization with optimization for block diagonal (Nf blocks) in natural order (-mat_no_inode -pc_factor_mat_ordering_type rcm with Nf>1 fields)

  requires:
     a structurally symmetric matrix; this could be relaxed with transpose/column metadata
*/
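
/* Band storage sketch (an illustration, not part of the build): row r holds the
   entries in columns [max(0,r-bw), min(n-1,r+bw)] contiguously, so a full
   interior row stores 2*bw+1 values and the first/last bw rows are clipped
   (the "ears"). For n=5, bw=1 the per-row lengths are 2,3,3,3,2 and bi_csr is
   the running sum {0,2,5,8,11,13}. */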

static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat,Mat,const MatFactorInfo*);

/*
  The GPU LU factor kernels: first, set up the band row offsets (bi_csr)
*/
__global__
void __launch_bounds__(1024,1)
mat_lu_factor_band_init_set_i(const PetscInt n, const int bw, int bi_csr[])
{
  const PetscInt  Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
  const PetscInt  field = blockIdx.x, blkIdx = blockIdx.y;
  const PetscInt  nloc_i = (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);

  // set i (the offset of row+1)
  if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) bi_csr[0] = 0; // dummy at zero
  for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
    if (rowb < end_i && threadIdx.x==0) {
      PetscInt i=rowb+1, ni = (rowb>bw) ? bw+1 : i, n1L = ni*(ni-1)/2, nug= i*bw, n2L = bw*((rowb>bw) ? (rowb-bw) : 0), mi = bw + rowb + 1 - n, clip = (mi>0) ? mi*(mi-1)/2 + mi: 0;
      bi_csr[rowb+1] = n1L + nug - clip + n2L + i;
    }
  }
}
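
/* Host-side reference for the closed-form prefix above (a sketch for
   verification, not compiled into PETSc): the same bi_csr can be built as a
   plain running sum over the clipped row widths. */
#if 0
static void bi_csr_reference(PetscInt n, int bw, int bi[])
{
  bi[0] = 0;
  for (int r = 0; r < n; r++) {
    const int c0 = (r > bw) ? r - bw : 0;                  // first column in row r
    const int c1 = (r + bw > n - 1) ? (int)n - 1 : r + bw; // last column in row r
    bi[r+1] = bi[r] + (c1 - c0 + 1);
  }
}
#endif
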
// copy AIJ to AIJ_BAND
__global__
void __launch_bounds__(1024,1)
mat_lu_factor_band_copy_aij_aij(const PetscInt n, const int bw, const PetscInt r[], const PetscInt ic[],
                                const int ai_d[], const int aj_d[], const PetscScalar aa_d[],
                                const int bi_csr[], PetscScalar ba_csr[])
{
  const PetscInt  Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
  const PetscInt  field = blockIdx.x, blkIdx = blockIdx.y;
  const PetscInt  nloc_i = (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);

  // zero B
  if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) ba_csr[bi_csr[n]] = 0; // extra slot at the end, used for the flop count
  for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
    if (rowb < end_i) {
      PetscScalar    *batmp = ba_csr + bi_csr[rowb];
      const PetscInt nzb = bi_csr[rowb+1] - bi_csr[rowb];
      for (int j=threadIdx.x ; j<nzb ; j += blockDim.x) {
        if (j<nzb) {
          batmp[j] = 0;
        }
      }
    }
  }

  // copy A into B in CSR band format -- these two loops could be fused
  for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
    if (rowb < end_i) {
      const PetscInt    rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
      const int         *ajtmp = aj_d + ai_d[rowa], bjStart = (rowb>bw) ? rowb-bw : 0;
      const PetscScalar *av    = aa_d + ai_d[rowa];
      PetscScalar       *batmp = ba_csr + bi_csr[rowb];
      /* load in the initial (unfactored) row */
      for (int j=threadIdx.x ; j<nza ; j += blockDim.x) {
        if (j<nza) {
          PetscInt    colb = ic[ajtmp[j]], idx = colb - bjStart;
          PetscScalar vala = av[j];
          batmp[idx] = vala;
        }
      }
    }
  }
}
// print AIJ_BAND
__global__
void print_mat_aij_band(const PetscInt n, const int bi_csr[], const PetscScalar ba_csr[])
{
  // debug
  if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) {
    printf("B (AIJ) n=%d:\n",(int)n);
    for (int rowb=0;rowb<n;rowb++) {
      const PetscInt    nz = bi_csr[rowb+1] - bi_csr[rowb];
      const PetscScalar *batmp = ba_csr + bi_csr[rowb];
      for (int j=0; j<nz; j++) printf("(%13.6e) ",PetscRealPart(batmp[j]));
      printf(" bi=%d\n",bi_csr[rowb+1]);
    }
  }
}
// Band LU factorization kernel --- works on ba_csr with row offsets bi_csr
__global__
void __launch_bounds__(1024,1)
  mat_lu_factor_band(const PetscInt n, const PetscInt bw, const int bi_csr[], PetscScalar ba_csr[], int *use_group_sync)
{
  const PetscInt  Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
  const PetscInt  field = blockIdx.x, blkIdx = blockIdx.y;
  const PetscInt  start = field*nloc, end = start + nloc;
#if defined(AIJBANDUSEGROUPS)
  auto g = cooperative_groups::this_grid();
#endif
  // A22 panel update for each row A(1,:) and col A(:,1)
  for (int glbDD=start, locDD = 0; glbDD<end; glbDD++, locDD++) {
    PetscInt          tnzUd = bw, maxU = end-1 - glbDD; // chop the row length at the ears
    const PetscInt    nzUd  = (tnzUd>maxU) ? maxU : tnzUd, dOffset = (glbDD > bw) ? bw : glbDD; // global to go past ears after first
    PetscScalar       *pBdd = ba_csr + bi_csr[glbDD] + dOffset;
    const PetscScalar *baUd = pBdd + 1; // vector of data  U(i,i+1:end)
    const PetscScalar Bdd = *pBdd;
    const PetscInt    offset = blkIdx*blockDim.y + threadIdx.y, inc = Nblk*blockDim.y;
    if (threadIdx.x==0) {
      for (int idx = offset, myi = glbDD + offset + 1; idx < nzUd; idx += inc, myi += inc) { /* assuming symmetric structure */
        const PetscInt bwi = myi > bw ? bw : myi, kIdx = bwi - (myi-glbDD); // cuts off just the first (global) block
        PetscScalar    *Aid = ba_csr + bi_csr[myi] + kIdx;
        *Aid = *Aid/Bdd;
      }
    }
    __syncthreads(); // sync so the scaled L column is visible to the trailing update
    for (int idx = offset, myi = glbDD + offset + 1; idx < nzUd; idx += inc, myi += inc) {
      const PetscInt    bwi = myi > bw ? bw : myi, kIdx = bwi - (myi-glbDD); // cuts off just the first (global) block
      PetscScalar       *Aid = ba_csr + bi_csr[myi] + kIdx;
      PetscScalar       *Aij =  Aid + 1;
      const PetscScalar Lid  = *Aid;
      for (int jIdx=threadIdx.x ; jIdx<nzUd; jIdx += blockDim.x) {
        Aij[jIdx] -= Lid*baUd[jIdx];
      }
    }
#if defined(AIJBANDUSEGROUPS)
    if (use_group_sync) {
      g.sync();
    } else {
      __syncthreads();
    }
#else
    __syncthreads();
#endif
  } /* end of for (glbDD=start; glbDD<end; glbDD++) */
}
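
/* Serial reference for the kernel above (an illustrative sketch for Nf=1, not
   compiled): right-looking LU without pivoting, with the trailing update
   restricted to the band, using the same row layout as bi_csr. */
#if 0
static void band_lu_reference(PetscInt n, PetscInt bw, const int bi[], PetscScalar ba[])
{
  /* entry (r,c) of the band lives at ba[bi[r] + c - max(0, r-bw)] */
  #define BA(r,c) ba[bi[(r)] + (c) - (((r) > (bw)) ? (r) - (bw) : 0)]
  for (PetscInt d = 0; d < n; d++) {
    const PetscInt last = (d + bw > n - 1) ? n - 1 : d + bw;
    for (PetscInt i = d + 1; i <= last; i++) {
      BA(i,d) = BA(i,d)/BA(d,d);                                           /* scale the L column */
      for (PetscInt j = d + 1; j <= last; j++) BA(i,j) -= BA(i,d)*BA(d,j); /* A22 update */
    }
  }
  #undef BA
}
#endif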

static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat,Vec,Vec);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ                   *b = (Mat_SeqAIJ*)B->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  Mat_SeqAIJCUSPARSE           *cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructA;
  CsrMatrix                    *matrixA;
  const PetscInt               n=A->rmap->n, *ic, *r;
  const int                    *ai_d, *aj_d;
  const PetscScalar            *aa_d;
  PetscScalar                  *ba_t = cusparseTriFactors->a_band_d;
  int                          *bi_t = cusparseTriFactors->i_band_d;
  int                          Ni = 10, team_size=9, Nf=1, nVec=56, nconcurrent = 1, nsm = -1; // Nf is the batch size - not used

  if (A->rmap->n == 0) {
    return 0;
  }
  // cusparse setup
  matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; //  matstruct->cprowIndices
  matrixA = (CsrMatrix*)matstructA->mat;

  // get data
  ic      = thrust::raw_pointer_cast(cusparseTriFactors->cpermIndices->data());
  ai_d    = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  aj_d    = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa_d    = thrust::raw_pointer_cast(matrixA->values->data().get());
  r       = thrust::raw_pointer_cast(cusparseTriFactors->rpermIndices->data());

  WaitForCUDA();
  PetscLogGpuTimeBegin();
  {
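    /* Recover the bandwidth from the nonzero count (a derivation note):
       nz = n + bw*(2n - bw - 1), so bw is the smaller root of
       bw^2 - (2n-1)*bw + (nz-n) = 0, i.e. bw = ((2n-1) - sqrt(1 + 4(n^2 - nz)))/2,
       which is the expression below. */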
    int bw = (int)(2.*(double)n-1. - (double)(PetscSqrtReal(1.+4.*((double)n*(double)n-(double)b->nz))+PETSC_MACHINE_EPSILON))/2, bm1=bw-1,nl=n/Nf;
#if !defined(AIJBANDUSEGROUPS)
    Ni = 1;
#else
    if (!cusparseTriFactors->init_dev_prop) {
      int gpuid;
      cusparseTriFactors->init_dev_prop = PETSC_TRUE;
      cudaGetDevice(&gpuid);
      cudaGetDeviceProperties(&cusparseTriFactors->dev_prop, gpuid);
    }
    nsm = cusparseTriFactors->dev_prop.multiProcessorCount;
    Ni = nsm/Nf/nconcurrent;
#endif
    team_size = bw/Ni + !!(bw%Ni);
    nVec = PetscMin(bw, 1024/team_size);
    PetscInfo(A,"Matrix Bandwidth = %d, number SMs/block = %d, num concurrency = %d, num fields = %d, numSMs/GPU = %d, thread group size = %d,%d\n",bw,Ni,nconcurrent,Nf,nsm,team_size,nVec);
    {
      dim3 dimBlockTeam(nVec,team_size);
      dim3 dimBlockLeague(Nf,Ni);
      mat_lu_factor_band_copy_aij_aij<<<dimBlockLeague,dimBlockTeam>>>(n, bw, r, ic, ai_d, aj_d, aa_d, bi_t, ba_t);
      CHECK_LAUNCH_ERROR(); // does a sync
#if defined(AIJBANDUSEGROUPS)
      if (Ni > 1) {
        void *kernelArgs[] = { (void*)&n, (void*)&bw, (void*)&bi_t, (void*)&ba_t, (void*)&nsm };
        cudaLaunchCooperativeKernel((void*)mat_lu_factor_band, dimBlockLeague, dimBlockTeam, kernelArgs, 0, NULL);
      } else {
        mat_lu_factor_band<<<dimBlockLeague,dimBlockTeam>>>(n, bw, bi_t, ba_t, NULL);
      }
#else
      mat_lu_factor_band<<<dimBlockLeague,dimBlockTeam>>>(n, bw, bi_t, ba_t, NULL);
#endif
      CHECK_LAUNCH_ERROR(); // does a sync
#if defined(PETSC_USE_LOG)
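      /* Flop-count note (a reading of the formula, not a spec): the bm1 term
         sums ~2*k^2 trailing-update flops over the shrinking bottom ear, the
         middle term is ~2*bw^2 per full-band pivot, and the last term covers
         the remaining lower-order work, all scaled by the Nf fields. */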
      PetscLogGpuFlops((PetscLogDouble)Nf*(bm1*(bm1 + 1)*(PetscLogDouble)(2*bm1 + 1)/3 + (PetscLogDouble)2*(nl-bw)*bw*bw + (PetscLogDouble)nl*(nl+1)/2));
#endif
    }
  }
  PetscLogGpuTimeEnd();
  /* determine which version of MatSolve needs to be used. from MatLUFactorNumeric_AIJ_SeqAIJCUSPARSE */
  B->ops->solve = MatSolve_SeqAIJCUSPARSEBAND;
  B->ops->solvetranspose = NULL; // need transpose
  B->ops->matsolve = NULL;
  B->ops->matsolvetranspose = NULL;
  return 0;
}

PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b;
  IS                 isicol;
  const PetscInt     *ic,*ai=a->i,*aj=a->j;
  PetscScalar        *ba_t;
  int                *bi_t;
  PetscInt           i,n=A->rmap->n,Nf=1; // Nf is the batch size - not used
  PetscInt           nzBcsr,bwL,bwU;
  PetscBool          missing;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  MatMissingDiagonal(A,&missing,&i);
  MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&missing);

  ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
  ISGetIndices(isicol,&ic);

  MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);
  PetscLogObjectParent((PetscObject)B,(PetscObject)isicol);
  b    = (Mat_SeqAIJ*)(B)->data;

  /* get band widths; MatComputeBandwidth should take a reordering ic and do this */
  bwL = bwU = 0;
  for (int rwb=0; rwb<n; rwb++) {
    const PetscInt rwa = ic[rwb], anz = ai[rwb+1] - ai[rwb], *ajtmp = aj + ai[rwb];
    for (int j=0;j<anz;j++) {
      PetscInt colb = ic[ajtmp[j]];
      if (colb<rwa) { // L
        if (rwa-colb > bwL) bwL = rwa-colb;
      } else {
        if (colb-rwa > bwU) bwU = colb-rwa;
      }
    }
  }
  ISRestoreIndices(isicol,&ic);
  /* only structurally symmetric matrices are supported, but others might work */
  MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
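  /* A full band has (2*bwU+1)*n entries; subtracting the two triangular "ears"
     of bwU*(bwU+1)/2 entries each gives
     nzBcsr = (2*bwU+1)*n - bwU*(bwU+1) = n + (2*n-1)*bwU - bwU*bwU
     (a derivation note for the expression below). */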
  nzBcsr = n + (2*n-1)*bwU - bwU*bwU;
  b->maxnz = b->nz = nzBcsr;
  cusparseTriFactors->nnz = b->nz; // the only metadata needed: n & nz
  PetscInfo(A,"Matrix Bandwidth = %" PetscInt_FMT ", nnz = %" PetscInt_FMT "\n",bwL,b->nz);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cudaMalloc(&ba_t,(b->nz+1)*sizeof(PetscScalar)); // include a place for the flop count
  cudaMalloc(&bi_t,(n+1)*sizeof(int));
  cusparseTriFactors->a_band_d = ba_t;
  cusparseTriFactors->i_band_d = bi_t;
  /* In the b structure: free imax, ilen, old a, old j; allocate solve_work, new a, new j */
  PetscLogObjectMemory((PetscObject)B,(nzBcsr+1)*(sizeof(PetscInt)+sizeof(PetscScalar)));
  {
    dim3 dimBlockTeam(1,128);
    dim3 dimBlockLeague(Nf,1);
    mat_lu_factor_band_init_set_i<<<dimBlockLeague,dimBlockTeam>>>(n, bwU, bi_t);
  }
  CHECK_LAUNCH_ERROR(); // does a sync

  // setup data
  if (!cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ISGetIndices(isrow,&r);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    ISRestoreIndices(isrow,&r);
    PetscLogCpuToGpu(n*sizeof(PetscInt));
  }
  /* upper triangular indices */
  if (!cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ISGetIndices(isicol,&c);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    ISRestoreIndices(isicol,&c);
    PetscLogCpuToGpu(n*sizeof(PetscInt));
  }

  /* put together the new matrix */
  b->free_a       = PETSC_FALSE;
  b->free_ij      = PETSC_FALSE;
  b->singlemalloc = PETSC_FALSE;
  b->ilen = NULL;
  b->imax = NULL;
  b->row  = isrow;
  b->col  = iscol;
  PetscObjectReference((PetscObject)isrow);
  PetscObjectReference((PetscObject)iscol);
  b->icol = isicol;
  PetscMalloc1(n+1,&b->solve_work);

  B->factortype            = MAT_FACTOR_LU;
  B->info.factor_mallocs   = 0;
  B->info.fill_ratio_given = 0;

  if (ai[n]) {
    B->info.fill_ratio_needed = ((PetscReal)(nzBcsr))/((PetscReal)ai[n]);
  } else {
    B->info.fill_ratio_needed = 0.0;
  }
#if defined(PETSC_USE_INFO)
  if (ai[n] != 0) {
    PetscReal af = B->info.fill_ratio_needed;
    PetscInfo(A,"Band fill ratio %g\n",(double)af);
  } else {
    PetscInfo(A,"Empty matrix\n");
  }
#endif
  if (a->inode.size) {
    PetscInfo(A,"Warning: using inodes in band solver.\n");
  }
  MatSeqAIJCheckInode_FactorLU(B);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSEBAND;
  B->offloadmask = PETSC_OFFLOAD_GPU;

  return 0;
}

/* Use -pc_factor_mat_solver_type cusparseband */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse_band(Mat A,MatSolverType *type)
{
  *type = MATSOLVERCUSPARSEBAND;
  return 0;
}

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat A,MatFactorType ftype,Mat *B)
{
  PetscInt       n = A->rmap->n;

  MatCreate(PetscObjectComm((PetscObject)A),B);
  MatSetSizes(*B,n,n,n,n);
  (*B)->factortype = ftype;
  (*B)->canuseordering = PETSC_TRUE;
  MatSetType(*B,MATSEQAIJCUSPARSE);

  if (ftype == MAT_FACTOR_LU) {
    MatSetBlockSizesFromMats(*B,A,A);
    (*B)->ops->ilufactorsymbolic = NULL; // MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSEBAND;
    PetscStrallocpy(MATORDERINGRCM,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSEBAND Matrix Types");

  MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
  PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse_band);
  return 0;
}
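
/* Example use from the command line (a sketch; it combines the option noted
   above with the ordering suggested in the header comment):
     ./app -mat_type aijcusparse -pc_type lu \
           -pc_factor_mat_solver_type cusparseband \
           -mat_no_inode -pc_factor_mat_ordering_type rcm
   where ./app is any PETSc program that solves with KSP/PC. */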

#define WARP_SIZE 32
template <typename T>
__forceinline__ __device__
T wreduce(T a)
{
  T b;
  #pragma unroll
  for (int i = WARP_SIZE/2; i >= 1; i = i >> 1) {
    b = __shfl_down_sync(0xffffffff, a, i);
    a += b;
  }
  return a;
}
// reduce in a block, returns the result in thread 0
template <typename T, int BLOCK_SIZE>
__device__
T breduce(T a)
{
  constexpr int NWARP = BLOCK_SIZE/WARP_SIZE;
  __shared__ double buf[NWARP]; // one partial sum per warp (assumes T fits in a double)
  int wid = threadIdx.x / WARP_SIZE;
  int laneid = threadIdx.x % WARP_SIZE;
  T b = wreduce<T>(a);
  if (laneid == 0)
    buf[wid] = b;
  __syncthreads();
  if (wid == 0) {
    if (threadIdx.x < NWARP)
      a = buf[threadIdx.x];
    else
      a = 0;
    for (int i = (NWARP+1)/2; i >= 1; i = i >> 1) {
      a += __shfl_down_sync(0xffffffff, a, i);
    }
  }
  return a;
}
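
/* Usage sketch (illustrative): every thread in the block must call breduce and
   only threadIdx.x == 0 receives the sum, e.g.
     PetscReal s = breduce<PetscReal,256>(partial);
     if (threadIdx.x == 0) out[blockIdx.x] = s;
   BLOCK_SIZE is assumed to be a multiple of WARP_SIZE so the shared buffer
   covers all warps. */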

// Band triangular solve kernel --- consumes the factors in ba_csr
template <int BLOCK_SIZE>
__global__
void __launch_bounds__(256,1)
mat_solve_band(const PetscInt n, const PetscInt bw, const PetscScalar ba_csr[], PetscScalar x[])
{
  const PetscInt    Nf = gridDim.x, nloc = n/Nf, field = blockIdx.x, start = field*nloc, end = start + nloc, chopnz = bw*(bw+1)/2, blocknz=(2*bw+1)*nloc, blocknz_0 = blocknz-chopnz;
  const PetscScalar *pLi;
  const int tid = threadIdx.x;

  /* Next, solve L */
  pLi = ba_csr + (field==0 ? 0 : blocknz_0 + (field-1)*blocknz + bw); // diagonal (0,0) in field
  for (int glbDD=start, locDD = 0; glbDD<end; glbDD++, locDD++) {
    const PetscInt col = locDD<bw ? start : (glbDD-bw);
    PetscScalar t = 0;
    for (int j=col+tid,idx=tid;j<glbDD;j+=blockDim.x,idx+=blockDim.x) {
      t += pLi[idx]*x[j];
    }
#if defined(PETSC_USE_COMPLEX)
    PetscReal tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
    PetscScalar tt(breduce<PetscReal,BLOCK_SIZE>(tr), breduce<PetscReal,BLOCK_SIZE>(ti));
    t = tt;
#else
    t = breduce<PetscReal,BLOCK_SIZE>(t);
#endif
    if (threadIdx.x == 0)
      x[glbDD] -= t; // L has a unit diagonal, so no divide
    __syncthreads();
    // increment
    pLi += glbDD-col; // get to the diagonal
    if (glbDD > n-1-bw) pLi += n-1-glbDD; // skip over U; only the last block has the odd offset
    else pLi += bw;
    pLi += 1; // skip to the next row
    if (field>0 && (locDD+1)<bw) pLi += bw-(locDD+1); // skip the padding at the beginning (ear)
  }
  /* Then, solve U */
  pLi = ba_csr + Nf*blocknz - 2*chopnz - 1; // end of the real data in the block (diagonal)
  if (field != Nf-1) pLi -= blocknz_0 + (Nf-2-field)*blocknz + bw; // diagonal of the last local row

  for (int glbDD=end-1, locDD = 0; glbDD >= start; glbDD--, locDD++) {
    const PetscInt col = (locDD<bw) ? end-1 : glbDD+bw; // end of the row in U
    PetscScalar t = 0;
    for (int j=col-tid,idx=tid;j>glbDD;j-=blockDim.x,idx+=blockDim.x) {
      t += pLi[-idx]*x[j];
    }
#if defined(PETSC_USE_COMPLEX)
    PetscReal tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
    PetscScalar tt(breduce<PetscReal,BLOCK_SIZE>(tr), breduce<PetscReal,BLOCK_SIZE>(ti));
    t = tt;
#else
    t = breduce<PetscReal,BLOCK_SIZE>(PetscRealPart(t));
#endif
    pLi -= col-glbDD; // diagonal
    if (threadIdx.x == 0) {
      x[glbDD] -= t;
      x[glbDD] /= pLi[0];
    }
    __syncthreads();
    // increment past L to the start of the previous U row
    pLi -= bw+1;
    if (glbDD<bw) pLi += bw-glbDD; // overshot in the top left corner
    if (((locDD+1) < bw) && field != Nf-1) pLi -= (bw - (locDD+1)); // skip past the right corner
  }
}
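
/* Serial reference for the banded triangular solves above (an illustrative
   sketch for Nf=1, not compiled): forward solve with the unit-diagonal L, then
   backward solve with U, using the (row,col) -> index map of the band layout. */
#if 0
static void band_solve_reference(PetscInt n, PetscInt bw, const int bi[], const PetscScalar ba[], PetscScalar x[])
{
  #define BA(r,c) ba[bi[(r)] + (c) - (((r) > (bw)) ? (r) - (bw) : 0)]
  for (PetscInt r = 0; r < n; r++) {      /* L has a unit diagonal */
    const PetscInt c0 = (r > bw) ? r - bw : 0;
    for (PetscInt c = c0; c < r; c++) x[r] -= BA(r,c)*x[c];
  }
  for (PetscInt r = n - 1; r >= 0; r--) { /* back substitution with U */
    const PetscInt c1 = (r + bw > n - 1) ? n - 1 : r + bw;
    for (PetscInt c = r + 1; c <= c1; c++) x[r] -= BA(r,c)*x[c];
    x[r] /= BA(r,r);
  }
  #undef BA
}
#endif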

static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscInt                              n=A->rmap->n, nz=cusparseTriFactors->nnz, Nf=1; // Nf is the batch size - not used
  PetscInt                              bw = (int)(2.*(double)n-1.-(double)(PetscSqrtReal(1.+4.*((double)n*(double)n-(double)nz))+PETSC_MACHINE_EPSILON))/2; // quadratic formula for the bandwidth

  if (A->rmap->n == 0) {
    return 0;
  }

  /* Get the GPU pointers */
  VecCUDAGetArrayWrite(xx,&xarray);
  VecCUDAGetArrayRead(bb,&barray);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscLogGpuTimeBegin();
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());
  constexpr int block = 128;
  mat_solve_band<block><<<Nf,block>>>(n,bw,cusparseTriFactors->a_band_d,tempGPU->data().get());
  CHECK_LAUNCH_ERROR(); // does a sync

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  VecCUDARestoreArrayRead(bb,&barray);
  VecCUDARestoreArrayWrite(xx,&xarray);
  PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
  PetscLogGpuTimeEnd();

  return 0;
}