Actual source code: mpiaijcusparse.cu

petsc-3.6.1 2015-08-06
#define PETSC_SKIP_COMPLEX

#include <petscconf.h>
PETSC_CUDA_EXTERN_C_BEGIN
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
PETSC_CUDA_EXTERN_C_END
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>


PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5;
  if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2;
  if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz);
  if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz);

  PetscLayoutSetUp(B->rmap);
  PetscLayoutSetUp(B->cmap);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    MatCreate(PETSC_COMM_SELF,&b->A);
    MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);
    MatSetType(b->A,MATSEQAIJCUSPARSE);
    PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);
    MatCreate(PETSC_COMM_SELF,&b->B);
    MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);
    MatSetType(b->B,MATSEQAIJCUSPARSE);
    PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);
  }
  MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);
  MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);
  MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);
  MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);
  MatCUSPARSESetHandle(b->A,cusparseStruct->handle);
  MatCUSPARSESetHandle(b->B,cusparseStruct->handle);
  MatCUSPARSESetStream(b->A,cusparseStruct->stream);
  MatCUSPARSESetStream(b->B,cusparseStruct->stream);

  B->preallocated = PETSC_TRUE;
  return(0);
}
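/* Usage sketch (editorial addition, not part of the original file): the
   routine above is reached through the public MatMPIAIJSetPreallocation()
   interface once the type is MATMPIAIJCUSPARSE.  The local size and the
   nonzero counts below are made-up values for illustration. */

  Mat            A;
  PetscErrorCode ierr;

  ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
  ierr = MatSetSizes(A,100,100,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetType(A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  /* 5 nonzeros per row in the diagonal block, 2 per row in the off-diagonal block */
  ierr = MatMPIAIJSetPreallocation(A,5,NULL,2,NULL);CHKERRQ(ierr);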

PetscErrorCode  MatCreateVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  PetscInt rbs,cbs;

  MatGetBlockSizes(mat,&rbs,&cbs);
  if (right) {
    VecCreate(PetscObjectComm((PetscObject)mat),right);
    VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);
    VecSetBlockSize(*right,cbs);
    VecSetType(*right,VECCUSP);
    VecSetLayout(*right,mat->cmap);
  }
  if (left) {
    VecCreate(PetscObjectComm((PetscObject)mat),left);
    VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);
    VecSetBlockSize(*left,rbs);
    VecSetType(*left,VECCUSP);
    VecSetLayout(*left,mat->rmap);
  }
  return(0);
}
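/* Usage sketch (editorial addition): MatCreateVecs() dispatches to the
   routine above and returns VECCUSP vectors with layouts compatible with
   the matrix.  Assumes A and ierr come from the earlier sketch. */

  Vec x,y;   /* x is compatible with A*x; y with y = A*x */

  ierr = MatCreateVecs(A,&x,&y);CHKERRQ(ierr);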


PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence differs from the CPU version. In
     particular, the diagonal block multiplication kernel is launched
     in one stream. Then, in a separate stream, the DeviceToHost data
     transfers (with MPI messaging in between) and the HostToDevice
     transfers are launched. Once the data transfer stream is
     synchronized, to ensure messaging is complete, the MatMultAdd
     kernel is launched in the original (MatMult) stream to protect
     against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscInt       nt;

  VecGetLocalSize(xx,&nt);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);
  (*a->A->ops->mult)(a->A,xx,yy);
  VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
  VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
  (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);
  VecScatterFinalizeForGPU(a->Mvctx);
  return(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence differs from the CPU version. In
     particular, the diagonal block multiplication kernel is launched
     in one stream. Then, in a separate stream, the DeviceToHost data
     transfers (with MPI messaging in between) and the HostToDevice
     transfers are launched. Once the data transfer stream is
     synchronized, to ensure messaging is complete, the MatMultTransposeAdd
     kernel is launched in the original stream to protect against race
     conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscInt       nt;

  VecGetLocalSize(xx,&nt);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);
  (*a->A->ops->multtranspose)(a->A,xx,yy);
  VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
  VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
  (*a->B->ops->multtransposeadd)(a->B,a->lvec,yy,yy);
  VecScatterFinalizeForGPU(a->Mvctx);
  return(0);
}
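/* Usage sketch (editorial addition): applying the operator and its
   transpose through the public API, which dispatches to the two routines
   above; with VECCUSP vectors the kernels run on the GPU.  Assumes A, x,
   y, and ierr come from the earlier sketches. */

  ierr = MatMult(A,x,y);CHKERRQ(ierr);           /* y = A*x   */
  ierr = MatMultTranspose(A,y,x);CHKERRQ(ierr);  /* x = A^T*y */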

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  return(0);
}
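/* Usage sketch (editorial addition): selecting a storage format for both
   blocks at once via the public MatCUSPARSESetFormat() interface;
   MAT_CUSPARSE_HYB is one of the values of MatCUSPARSEStorageFormat. */

  ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_HYB);CHKERRQ(ierr);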

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptions *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");
  PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype==MAT_FACTOR_NONE) {
    PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                     "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
    if (flg) {
      MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);
    }
    PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                     "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);
    if (flg) {
      MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);
    }
    PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                     "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
    if (flg) {
      MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
    }
  }
  PetscOptionsEnd();
  return(0);
}
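/* Usage sketch (editorial addition): the same formats can be chosen from
   the options database, either on the command line, e.g.
       ./app -mat_type mpiaijcusparse -mat_cusparse_storage_format hyb
   or programmatically before MatSetFromOptions() runs.  The two-argument
   PetscOptionsSetValue() below is the PETSc 3.6-era signature. */

  ierr = PetscOptionsSetValue("-mat_cusparse_mult_diag_storage_format","ell");CHKERRQ(ierr);
  ierr = MatSetFromOptions(A);CHKERRQ(ierr);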

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  try {
    MatCUSPARSEClearHandle(a->A);
    MatCUSPARSEClearHandle(a->B);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSP(stat);
    err  = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUSP(err);
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  cusparseStruct = 0;

  MatDestroy_MPIAIJ(A);
  return(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cudaError_t        err;
  cusparseStatus_t   stat;

  MatCreate_MPIAIJ(A);
  PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);
  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSP(stat);
  err  = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUSP(err);

  A->ops->getvecs        = MatCreateVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);
  PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);
  return(0);
}
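/* Usage sketch (editorial addition): this constructor is registered with
   the type system and is not called directly; it runs when the type is
   set on a matrix. */

  ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
  ierr = MatSetType(A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);  /* invokes MatCreate_MPIAIJCUSPARSE() */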

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the
           diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the
           off-diagonal block (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (or o_nnz) is given then d_nz (or o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either the nz or the nnz arguments
   (not both).  Set the nz arguments to PETSC_DEFAULT and the nnz arrays to
   NULL for PETSc to control dynamic memory allocation.  For large problems
   you MUST preallocate memory or you will get TERRIBLE performance; see the
   users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscMPIInt    size;

  MatCreate(comm,A);
  MatSetSizes(*A,m,n,M,N);
  MPI_Comm_size(comm,&size);
  if (size > 1) {
    MatSetType(*A,MATMPIAIJCUSPARSE);
    MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);
  } else {
    MatSetType(*A,MATSEQAIJCUSPARSE);
    MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);
  }
  return(0);
}
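/* Usage sketch (editorial addition): a one-call alternative to the
   MatCreate()/MatSetType()/preallocation sequence shown earlier.  The
   sizes and nonzero counts are made up for illustration. */

  Mat B;

  ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,
                              100,100,                         /* local rows/columns    */
                              PETSC_DETERMINE,PETSC_DETERMINE, /* global sizes computed */
                              5,NULL,                          /* diagonal block        */
                              2,NULL,                          /* off-diagonal block    */
                              &B);CHKERRQ(ierr);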

/*M
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/