petsc-3.11.4 2019-09-28
  1: #define PETSC_SKIP_SPINLOCK

  3: #include <petscconf.h>
  4:  #include <../src/mat/impls/aij/mpi/mpiaij.h>
  5:  #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>

  7: PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
  8: {
  9:   Mat_MPIAIJ         *b               = (Mat_MPIAIJ*)B->data;
 10:   Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
 11:   PetscErrorCode     ierr;
 12:   PetscInt           i;

 15:   PetscLayoutSetUp(B->rmap);
 16:   PetscLayoutSetUp(B->cmap);
 17:   if (d_nnz) {
 18:     for (i=0; i<B->rmap->n; i++) {
 19:       if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
 20:     }
 21:   }
 22:   if (o_nnz) {
 23:     for (i=0; i<B->rmap->n; i++) {
 24:       if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
 25:     }
 26:   }
 27:   if (!B->preallocated) {
 28:     /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
 29:     MatCreate(PETSC_COMM_SELF,&b->A);
 30:     MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);
 31:     MatSetType(b->A,MATSEQAIJCUSPARSE);
 32:     PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);
 33:     MatCreate(PETSC_COMM_SELF,&b->B);
 34:     MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);
 35:     MatSetType(b->B,MATSEQAIJCUSPARSE);
 36:     PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);
 37:   }
 38:   MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);
 39:   MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);
 40:   MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);
 41:   MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);
 42:   MatCUSPARSESetHandle(b->A,cusparseStruct->handle);
 43:   MatCUSPARSESetHandle(b->B,cusparseStruct->handle);
 44:   MatCUSPARSESetStream(b->A,cusparseStruct->stream);
 45:   MatCUSPARSESetStream(b->B,cusparseStruct->stream);

 47:   B->preallocated = PETSC_TRUE;
 48:   return(0);
 49: }

 51: PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
 52: {
 53:   /* This multiplication sequence is different sequence
 54:      than the CPU version. In particular, the diagonal block
 55:      multiplication kernel is launched in one stream. Then,
 56:      in a separate stream, the data transfers from DeviceToHost
 57:      (with MPI messaging in between), then HostToDevice are
 58:      launched. Once the data transfer stream is synchronized,
 59:      to ensure messaging is complete, the MatMultAdd kernel
 60:      is launched in the original (MatMult) stream to protect
 61:      against race conditions.

 63:      This sequence should only be called for GPU computation. */
 64:   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
 66:   PetscInt       nt;

 69:   VecGetLocalSize(xx,&nt);
 70:   if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
 71:   VecScatterInitializeForGPU(a->Mvctx,xx);
 72:   (*a->A->ops->mult)(a->A,xx,yy);
 73:   VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
 74:   VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
 75:   (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);
 76:   VecScatterFinalizeForGPU(a->Mvctx);
 77:   return(0);
 78: }

 80: PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
 81: {
 82:   /* This multiplication sequence is different sequence
 83:      than the CPU version. In particular, the diagonal block
 84:      multiplication kernel is launched in one stream. Then,
 85:      in a separate stream, the data transfers from DeviceToHost
 86:      (with MPI messaging in between), then HostToDevice are
 87:      launched. Once the data transfer stream is synchronized,
 88:      to ensure messaging is complete, the MatMultAdd kernel
 89:      is launched in the original (MatMult) stream to protect
 90:      against race conditions.

 92:      This sequence should only be called for GPU computation. */
 93:   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
 95:   PetscInt       nt;

 98:   VecGetLocalSize(xx,&nt);
 99:   if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
100:   VecScatterInitializeForGPU(a->Mvctx,xx);
101:   (*a->B->ops->multtranspose)(a->B,xx,a->lvec);
102:   (*a->A->ops->multtranspose)(a->A,xx,yy);
103:   VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
104:   VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
105:   VecScatterFinalizeForGPU(a->Mvctx);
106:   return(0);
107: }

109: PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
110: {
111:   Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
112:   Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

115:   switch (op) {
116:   case MAT_CUSPARSE_MULT_DIAG:
117:     cusparseStruct->diagGPUMatFormat = format;
118:     break;
119:   case MAT_CUSPARSE_MULT_OFFDIAG:
120:     cusparseStruct->offdiagGPUMatFormat = format;
121:     break;
122:   case MAT_CUSPARSE_ALL:
123:     cusparseStruct->diagGPUMatFormat    = format;
124:     cusparseStruct->offdiagGPUMatFormat = format;
125:     break;
126:   default:
127:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op);
128:   }
129:   return(0);
130: }

132: PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
133: {
134:   MatCUSPARSEStorageFormat format;
135:   PetscErrorCode           ierr;
136:   PetscBool                flg;
137:   Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
138:   Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

141:   PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");
142:   if (A->factortype==MAT_FACTOR_NONE) {
143:     PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
144:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
145:     if (flg) {
146:       MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);
147:     }
148:     PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
149:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);
150:     if (flg) {
151:       MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);
152:     }
153:     PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
154:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
155:     if (flg) {
156:       MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
157:     }
158:   }
159:   PetscOptionsTail();
160:   return(0);
161: }

163: PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
164: {
166:   Mat_MPIAIJ     *mpiaij;

169:   mpiaij = (Mat_MPIAIJ*)A->data;
170:   MatAssemblyEnd_MPIAIJ(A,mode);
171:   if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
172:     VecSetType(mpiaij->lvec,VECSEQCUDA);
173:   }
174:   return(0);
175: }

177: PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
178: {
179:   PetscErrorCode     ierr;
180:   Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
181:   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
182:   cudaError_t        err;
183:   cusparseStatus_t   stat;

186:   try {
187:     MatCUSPARSEClearHandle(a->A);
188:     MatCUSPARSEClearHandle(a->B);
189:     stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat);
190:     err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
191:     delete cusparseStruct;
192:   } catch(char *ex) {
193:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
194:   }
195:   cusparseStruct = 0;

197:   MatDestroy_MPIAIJ(A);
198:   return(0);
199: }

201: PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
202: {
203:   PetscErrorCode     ierr;
204:   Mat_MPIAIJ         *a;
205:   Mat_MPIAIJCUSPARSE * cusparseStruct;
206:   cudaError_t        err;
207:   cusparseStatus_t   stat;

210:   MatCreate_MPIAIJ(A);
211:   PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);
212:   PetscFree(A->defaultvectype);
213:   PetscStrallocpy(VECCUDA,&A->defaultvectype);

215:   a        = (Mat_MPIAIJ*)A->data;
216:   a->spptr = new Mat_MPIAIJCUSPARSE;

218:   cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
219:   cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
220:   cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
221:   stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUDA(stat);
222:   err = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUDA(err);

224:   A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
225:   A->ops->mult           = MatMult_MPIAIJCUSPARSE;
226:   A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
227:   A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
228:   A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

230:   PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);
231:   PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",  MatCUSPARSESetFormat_MPIAIJCUSPARSE);
232:   return(0);
233: }

235: /*@
236:    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
237:    (the default parallel PETSc format).  This matrix will ultimately pushed down
238:    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
239:    assembly performance the user should preallocate the matrix storage by setting
240:    the parameter nz (or the array nnz).  By setting these parameters accurately,
241:    performance during matrix assembly can be increased by more than a factor of 50.

243:    Collective on MPI_Comm

245:    Input Parameters:
246: +  comm - MPI communicator, set to PETSC_COMM_SELF
247: .  m - number of rows
248: .  n - number of columns
249: .  nz - number of nonzeros per row (same for all rows)
250: -  nnz - array containing the number of nonzeros in the various rows
251:          (possibly different for each row) or NULL

253:    Output Parameter:
254: .  A - the matrix

256:    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
257:    MatXXXXSetPreallocation() paradigm instead of this routine directly.
258:    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

260:    Notes:
261:    If nnz is given then nz is ignored

263:    The AIJ format (also called the Yale sparse matrix format or
264:    compressed row storage), is fully compatible with standard Fortran 77
265:    storage.  That is, the stored row and column indices can begin at
266:    either one (as in Fortran) or zero.  See the users' manual for details.

268:    Specify the preallocated storage with either nz or nnz (not both).
269:    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
270:    allocation.  For large problems you MUST preallocate memory or you
271:    will get TERRIBLE performance, see the users' manual chapter on matrices.

273:    By default, this format uses inodes (identical nodes) when possible, to
274:    improve numerical efficiency of matrix-vector products and solves. We
275:    search for consecutive rows with the same nonzero structure, thereby
276:    reusing matrix information to achieve increased efficiency.

278:    Level: intermediate

280: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
281: @*/
282: PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
283: {
285:   PetscMPIInt    size;

288:   MatCreate(comm,A);
289:   MatSetSizes(*A,m,n,M,N);
290:   MPI_Comm_size(comm,&size);
291:   if (size > 1) {
292:     MatSetType(*A,MATMPIAIJCUSPARSE);
293:     MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);
294:   } else {
295:     MatSetType(*A,MATSEQAIJCUSPARSE);
296:     MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);
297:   }
298:   return(0);
299: }

301: /*MC
302:    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

304:    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
305:    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
306:    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

308:    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
309:    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
310:    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
311:    for communicators controlling multiple processes.  It is recommended that you call both of
312:    the above preallocation routines for simplicity.

314:    Options Database Keys:
315: +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
316: .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
317: .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
318: -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

320:   Level: beginner

322:  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
323: M
324: M*/