Actual source code: mpiaijcusparse.cu
petsc-3.12.5 2020-03-29
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscInt           i;

  PetscLayoutSetUp(B->rmap);
  PetscLayoutSetUp(B->cmap);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create the two MATSEQAIJCUSPARSE blocks: the diagonal block b->A and the off-diagonal block b->B. */
    MatCreate(PETSC_COMM_SELF,&b->A);
    MatPinToCPU(b->A,B->pinnedtocpu);
    MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);
    MatSetType(b->A,MATSEQAIJCUSPARSE);
    PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);
    MatCreate(PETSC_COMM_SELF,&b->B);
    MatPinToCPU(b->B,B->pinnedtocpu);
    MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);
    MatSetType(b->B,MATSEQAIJCUSPARSE);
    PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);
  }
  MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);
  MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);
  MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);
  MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);
  MatCUSPARSESetHandle(b->A,cusparseStruct->handle);
  MatCUSPARSESetHandle(b->B,cusparseStruct->handle);
  MatCUSPARSESetStream(b->A,cusparseStruct->stream);
  MatCUSPARSESetStream(b->B,cusparseStruct->stream);

  B->preallocated = PETSC_TRUE;
  return(0);
}
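/* A minimal usage sketch (not part of this source file) of how an application
   reaches the preallocation routine above: MatSetType() installs the
   MPIAIJCUSPARSE implementation, and MatMPIAIJSetPreallocation() dispatches to
   MatMPIAIJSetPreallocation_MPIAIJCUSPARSE() through the composed
   "MatMPIAIJSetPreallocation_C" function. Sizes and nonzero counts are
   illustrative; error checking is omitted for brevity.

     Mat A;
     MatCreate(PETSC_COMM_WORLD,&A);
     MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,100,100);
     MatSetType(A,MATMPIAIJCUSPARSE);
     MatMPIAIJSetPreallocation(A,5,NULL,2,NULL);   5 nonzeros per row in the diagonal block, 2 in the off-diagonal block
*/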
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /*
     This multiplication sequence differs from the CPU version. In
     particular, the diagonal block multiplication kernel is launched
     in one stream. Then, in a separate stream, the DeviceToHost data
     transfers (with MPI messaging in between) and the HostToDevice
     transfers are launched. Once the data-transfer stream is
     synchronized, to ensure the messaging is complete, the MatMultAdd
     kernel for the off-diagonal block is launched in the original
     (MatMult) stream to guard against race conditions.
  */
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
  PetscInt   nt;

  VecGetLocalSize(xx,&nt);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  VecScatterInitializeForGPU(a->Mvctx,xx);
  (*a->A->ops->mult)(a->A,xx,yy);                                     /* diagonal block:     yy  = A_d * xx   */
  VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
  VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
  (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);                          /* off-diagonal block: yy += A_o * lvec */
  VecScatterFinalizeForGPU(a->Mvctx);
  return(0);
}
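/* A hedged sketch (not part of this file) of driving the routine above: with
   A of type MATMPIAIJCUSPARSE, A->defaultvectype is VECCUDA (see
   MatCreate_MPIAIJCUSPARSE() below), so vectors obtained from MatCreateVecs()
   are CUDA vectors and MatMult() takes the cuSPARSE path above. Error
   checking omitted for brevity.

     Vec x,y;
     MatCreateVecs(A,&x,&y);    x: right vector, y: left vector, both VECCUDA here
     VecSet(x,1.0);
     MatMult(A,x,y);            dispatches to MatMult_MPIAIJCUSPARSE() for this type
*/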
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  /*
     Same streaming strategy as MatMult_MPIAIJCUSPARSE() above: the
     diagonal-block kernel runs in one stream while the scatter's data
     transfers (with MPI messaging in between) run in another; the
     off-diagonal MatMultAdd kernel is launched in the original stream
     only after the transfer stream has synchronized, to guard against
     race conditions.
  */
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
  PetscInt   nt;

  VecGetLocalSize(xx,&nt);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  VecScatterInitializeForGPU(a->Mvctx,xx);
  (*a->A->ops->multadd)(a->A,xx,yy,zz);                               /* diagonal block:     zz  = A_d * xx + yy */
  VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
  VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
  (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);                          /* off-diagonal block: zz += A_o * lvec    */
  VecScatterFinalizeForGPU(a->Mvctx);
  return(0);
}
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /*
     Same streaming strategy as MatMult_MPIAIJCUSPARSE() above, with the
     roles of the blocks reversed for the transpose product and the
     scatter run in reverse. This sequence should only be called for
     GPU computation.
  */
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
  PetscInt   nt;

  VecGetLocalSize(xx,&nt);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  VecScatterInitializeForGPU(a->Mvctx,a->lvec);
  (*a->B->ops->multtranspose)(a->B,xx,a->lvec);                       /* off-diagonal block: lvec = A_o^T * xx */
  (*a->A->ops->multtranspose)(a->A,xx,yy);                            /* diagonal block:     yy   = A_d^T * xx */
  VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
  VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
  VecScatterFinalizeForGPU(a->Mvctx);
  return(0);
}
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  return(0);
}
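/* A short sketch (not part of this file) of the user-facing call that lands in
   the routine above via the composed "MatCUSPARSESetFormat_C" function. The
   storage-format values come from the MatCUSPARSEStorageFormat enum
   (MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB).

     MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,MAT_CUSPARSE_ELL);    diagonal block only
     MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_HYB);          both blocks
*/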
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ               *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                     "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
    if (flg) {
      MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);
    }
    PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                     "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);
    if (flg) {
      MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);
    }
    PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                     "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
    if (flg) {
      MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
    }
  }
  PetscOptionsTail();
  return(0);
}
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data;

  MatAssemblyEnd_MPIAIJ(A,mode);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    /* After the first final assembly, make the local scatter vector a CUDA vector. */
    VecSetType(mpiaij->lvec,VECSEQCUDA);
  }
  return(0);
}
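/* A minimal assembly sketch (not part of this file): the first
   MAT_FINAL_ASSEMBLY pass through the routine above converts the local
   scatter vector lvec to VECSEQCUDA so subsequent multiplies stay on the GPU.
   Indices and values are illustrative; error checking omitted.

     PetscInt    row = 0, col = 0;
     PetscScalar v   = 1.0;
     MatSetValues(A,1,&row,1,&col,&v,INSERT_VALUES);
     MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
     MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);    calls MatAssemblyEnd_MPIAIJCUSPARSE()
*/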
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  try {
    MatCUSPARSEClearHandle(a->A);
    MatCUSPARSEClearHandle(a->B);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  MatDestroy_MPIAIJ(A);
  return(0);
}
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;

  MatCreate_MPIAIJ(A);
  PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);
  PetscFree(A->defaultvectype);
  PetscStrallocpy(VECCUDA,&A->defaultvectype);

  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  cusparseStruct->stream              = 0;
  stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUDA(stat);

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);
  PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);
  return(0);
}
/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz (or the array d_nnz) and o_nz (or the array o_nnz). By
   setting these parameters accurately, performance during matrix assembly can be
   increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros per row in the diagonal block
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros per row in the off-diagonal block
           (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage) is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either the nz or the nnz parameters (not both).
   Set d_nz=PETSC_DEFAULT and d_nnz=NULL (and likewise for the off-diagonal block) for
   PETSc to control dynamic memory allocation. For large problems you MUST preallocate
   memory or you will get TERRIBLE performance; see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscMPIInt size;

  MatCreate(comm,A);
  MatSetSizes(*A,m,n,M,N);
  MPI_Comm_size(comm,&size);
  if (size > 1) {
    MatSetType(*A,MATMPIAIJCUSPARSE);
    MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);
  } else {
    MatSetType(*A,MATSEQAIJCUSPARSE);
    MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);
  }
  return(0);
}
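/* A hedged usage sketch (not part of this file) for the convenience routine
   above: create a distributed GPU matrix directly, preallocating 5 nonzeros
   per row in the diagonal block and 2 in the off-diagonal block. The global
   size 128 x 128 is illustrative only; error checking omitted.

     Mat A;
     MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,128,128,5,NULL,2,NULL,&A);
     assemble and use A here
     MatDestroy(&A);
*/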
/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) and hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) and hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) and hyb (hybrid).

   Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
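/* A hedged sketch (not part of this file) of selecting this matrix type and
   its storage format from the options database rather than in code. The
   executable name ./app and the option values are illustrative.

   In the application:
     MatCreate(PETSC_COMM_WORLD,&A);
     MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,128,128);
     MatSetFromOptions(A);    picks up -mat_type and the -mat_cusparse_* options above

   On the command line:
     mpiexec -n 2 ./app -mat_type mpiaijcusparse -mat_cusparse_storage_format hyb
*/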