Actual source code: mpiaijcusparse.cu
1: #define PETSC_SKIP_SPINLOCK
2: #define PETSC_SKIP_CXX_COMPLEX_FIX
3: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
5: #include <petscconf.h>
6: #include <../src/mat/impls/aij/mpi/mpiaij.h>
7: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
8: #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
9: #include <thrust/advance.h>
10: #include <petscsf.h>
12: struct VecCUDAEquals
13: {
14: template <typename Tuple>
15: __host__ __device__
16: void operator()(Tuple t)
17: {
18: thrust::get<1>(t) = thrust::get<0>(t);
19: }
20: };
22: static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
23: {
24: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
25: Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
26: PetscInt n = cusp->coo_nd + cusp->coo_no;
27: PetscErrorCode ierr;
28: cudaError_t cerr;
31: if (cusp->coo_p && v) {
32: thrust::device_ptr<const PetscScalar> d_v;
33: THRUSTARRAY *w = NULL;
35: if (isCudaMem(v)) {
36: d_v = thrust::device_pointer_cast(v);
37: } else {
38: w = new THRUSTARRAY(n);
39: w->assign(v,v+n);
40: PetscLogCpuToGpu(n*sizeof(PetscScalar));
41: d_v = w->data();
42: }
44: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
45: cusp->coo_pw->begin()));
46: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
47: cusp->coo_pw->end()));
48: PetscLogGpuTimeBegin();
49: thrust::for_each(zibit,zieit,VecCUDAEquals());
50: cerr = WaitForCUDA();CHKERRCUDA(cerr);
51: PetscLogGpuTimeEnd();
52: delete w;
53: MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);
54: MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);
55: } else {
56: MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);
57: MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);
58: }
59: PetscObjectStateIncrease((PetscObject)A);
60: A->num_ass++;
61: A->assembled = PETSC_TRUE;
62: A->ass_nonzerostate = A->nonzerostate;
63: A->offloadmask = PETSC_OFFLOAD_GPU;
64: return(0);
65: }
67: template <typename Tuple>
68: struct IsNotOffDiagT
69: {
70: PetscInt _cstart,_cend;
72: IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
73: __host__ __device__
74: inline bool operator()(Tuple t)
75: {
76: return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
77: }
78: };
80: struct IsOffDiag
81: {
82: PetscInt _cstart,_cend;
84: IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
85: __host__ __device__
86: inline bool operator() (const PetscInt &c)
87: {
88: return c < _cstart || c >= _cend;
89: }
90: };
92: struct GlobToLoc
93: {
94: PetscInt _start;
96: GlobToLoc(PetscInt start) : _start(start) {}
97: __host__ __device__
98: inline PetscInt operator() (const PetscInt &c)
99: {
100: return c - _start;
101: }
102: };
104: static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
105: {
106: Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data;
107: Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
108: PetscErrorCode ierr;
109: PetscInt *jj;
110: size_t noff = 0;
111: THRUSTINTARRAY d_i(n);
112: THRUSTINTARRAY d_j(n);
113: ISLocalToGlobalMapping l2g;
114: cudaError_t cerr;
117: PetscLayoutSetUp(B->rmap);
118: PetscLayoutSetUp(B->cmap);
119: if (b->A) { MatCUSPARSEClearHandle(b->A); }
120: if (b->B) { MatCUSPARSEClearHandle(b->B); }
121: PetscFree(b->garray);
122: VecDestroy(&b->lvec);
123: MatDestroy(&b->A);
124: MatDestroy(&b->B);
126: PetscLogCpuToGpu(2.*n*sizeof(PetscInt));
127: d_i.assign(coo_i,coo_i+n);
128: d_j.assign(coo_j,coo_j+n);
129: delete cusp->coo_p;
130: delete cusp->coo_pw;
131: cusp->coo_p = NULL;
132: cusp->coo_pw = NULL;
133: PetscLogGpuTimeBegin();
134: auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
135: auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
136: if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
137: cusp->coo_p = new THRUSTINTARRAY(n);
138: cusp->coo_pw = new THRUSTARRAY(n);
139: thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
140: auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
141: auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
142: auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
143: firstoffd = mzipp.get_iterator_tuple().get<1>();
144: }
145: cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
146: cusp->coo_no = thrust::distance(firstoffd,d_j.end());
148: /* from global to local */
149: thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
150: thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
151: cerr = WaitForCUDA();CHKERRCUDA(cerr);
152: PetscLogGpuTimeEnd();
154: /* copy offdiag column indices to map on the CPU */
155: PetscMalloc1(cusp->coo_no,&jj);
156: cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
157: auto o_j = d_j.begin();
158: PetscLogGpuTimeBegin();
159: thrust::advance(o_j,cusp->coo_nd);
160: thrust::sort(thrust::device,o_j,d_j.end());
161: auto wit = thrust::unique(thrust::device,o_j,d_j.end());
162: cerr = WaitForCUDA();CHKERRCUDA(cerr);
163: PetscLogGpuTimeEnd();
164: noff = thrust::distance(o_j,wit);
165: PetscMalloc1(noff+1,&b->garray);
166: cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
167: PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));
168: ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);
169: ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);
170: ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);
171: if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected IS size %D != %D COO size",n,cusp->coo_no);
172: ISLocalToGlobalMappingDestroy(&l2g);
174: MatCreate(PETSC_COMM_SELF,&b->A);
175: MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);
176: MatSetType(b->A,MATSEQAIJCUSPARSE);
177: PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);
178: MatCreate(PETSC_COMM_SELF,&b->B);
179: MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);
180: MatSetType(b->B,MATSEQAIJCUSPARSE);
181: PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);
183: /* GPU memory, cusparse specific call handles it internally */
184: MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());
185: MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);
186: PetscFree(jj);
188: MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);
189: MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);
190: MatCUSPARSESetHandle(b->A,cusp->handle);
191: MatCUSPARSESetHandle(b->B,cusp->handle);
192: /*
193: MatCUSPARSESetStream(b->A,cusp->stream);
194: MatCUSPARSESetStream(b->B,cusp->stream);
195: */
196: MatSetUpMultiply_MPIAIJ(B);
197: B->preallocated = PETSC_TRUE;
198: B->nonzerostate++;
200: MatBindToCPU(b->A,B->boundtocpu);
201: MatBindToCPU(b->B,B->boundtocpu);
202: B->offloadmask = PETSC_OFFLOAD_CPU;
203: B->assembled = PETSC_FALSE;
204: B->was_assembled = PETSC_FALSE;
205: return(0);
206: }
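/*
  Usage sketch (illustrative only, not part of this file): the two routines above implement the
  MPIAIJCUSPARSE backend of the generic MatSetPreallocationCOO()/MatSetValuesCOO() interface.
  A caller provides the global (i,j) pairs once and may then upload new values repeatedly; if
  the value array already lives in device memory it is used directly (see isCudaMem() above).
  The communicator, sizes, and entries below are assumptions of the example.

    Mat         A;
    PetscInt    coo_i[] = {0,0,1};                // global row indices of the COO entries
    PetscInt    coo_j[] = {0,1,1};                // global column indices of the COO entries
    PetscScalar coo_v[] = {2.0,-1.0,2.0};         // values, one per (i,j) pair

    MatCreate(PETSC_COMM_WORLD,&A);
    MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,N,N);
    MatSetType(A,MATMPIAIJCUSPARSE);
    MatSetPreallocationCOO(A,3,coo_i,coo_j);      // dispatches to MatSetPreallocationCOO_MPIAIJCUSPARSE()
    MatSetValuesCOO(A,coo_v,INSERT_VALUES);       // dispatches to MatSetValuesCOO_MPIAIJCUSPARSE()
*/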
208: static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
209: {
210: Mat Ad,Ao;
211: const PetscInt *cmap;
215: MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);
216: MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);
217: if (glob) {
218: PetscInt cst, i, dn, on, *gidx;
220: MatGetLocalSize(Ad,NULL,&dn);
221: MatGetLocalSize(Ao,NULL,&on);
222: MatGetOwnershipRangeColumn(A,&cst,NULL);
223: PetscMalloc1(dn+on,&gidx);
224: for (i=0; i<dn; i++) gidx[i] = cst + i;
225: for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
226: ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);
227: }
228: return(0);
229: }
231: PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
232: {
233: Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data;
234: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
235: PetscErrorCode ierr;
236: PetscInt i;
239: PetscLayoutSetUp(B->rmap);
240: PetscLayoutSetUp(B->cmap);
241: if (PetscDefined(USE_DEBUG) && d_nnz) {
242: for (i=0; i<B->rmap->n; i++) {
243: if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
244: }
245: }
246: if (PetscDefined(USE_DEBUG) && o_nnz) {
247: for (i=0; i<B->rmap->n; i++) {
248: if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
249: }
250: }
251: #if defined(PETSC_USE_CTABLE)
252: PetscTableDestroy(&b->colmap);
253: #else
254: PetscFree(b->colmap);
255: #endif
256: PetscFree(b->garray);
257: VecDestroy(&b->lvec);
258: VecScatterDestroy(&b->Mvctx);
259: /* Because the off-diagonal block B may have been resized we simply destroy it and create a new one each time */
260: MatDestroy(&b->B);
261: if (!b->A) {
262: MatCreate(PETSC_COMM_SELF,&b->A);
263: MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);
264: PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);
265: }
266: if (!b->B) {
267: PetscMPIInt size;
268: MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);
269: MatCreate(PETSC_COMM_SELF,&b->B);
270: MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);
271: PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);
272: }
273: MatSetType(b->A,MATSEQAIJCUSPARSE);
274: MatSetType(b->B,MATSEQAIJCUSPARSE);
275: MatBindToCPU(b->A,B->boundtocpu);
276: MatBindToCPU(b->B,B->boundtocpu);
277: MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);
278: MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);
279: MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);
280: MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);
281: MatCUSPARSESetHandle(b->A,cusparseStruct->handle);
282: MatCUSPARSESetHandle(b->B,cusparseStruct->handle);
283: /* Let A, B use b's handle with pre-set stream
284: MatCUSPARSESetStream(b->A,cusparseStruct->stream);
285: MatCUSPARSESetStream(b->B,cusparseStruct->stream);
286: */
287: B->preallocated = PETSC_TRUE;
288: return(0);
289: }
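/*
  The multiplication routines below follow the standard MPIAIJ split: the scatter of the halo
  entries of xx into a->lvec is started first, the diagonal block a->A is applied while that
  communication is in flight, and the off-diagonal block a->B is applied to the received halo
  values afterwards (MatMultTranspose uses the same pieces with a reverse, adding scatter).
  Since both blocks are MATSEQAIJCUSPARSE, the local products run on the GPU unless the matrix
  has been bound to the CPU.
*/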
291: PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
292: {
293: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
295: PetscInt nt;
298: VecGetLocalSize(xx,&nt);
299: if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
300: VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
301: (*a->A->ops->mult)(a->A,xx,yy);
302: VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
303: (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);
304: return(0);
305: }
307: PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
308: {
309: Mat_MPIAIJ *l = (Mat_MPIAIJ*)A->data;
313: if (A->factortype == MAT_FACTOR_NONE) {
314: Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
315: PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
316: if (d_mat) {
317: Mat_SeqAIJ *a = (Mat_SeqAIJ*)l->A->data;
318: Mat_SeqAIJ *b = (Mat_SeqAIJ*)l->B->data;
319: PetscInt n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
320: cudaError_t err;
321: PetscScalar *vals;
322: PetscInfo(A,"Zero device matrix diag and offdiag\n");
323: err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
324: err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
325: err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
326: err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
327: }
328: }
329: MatZeroEntries(l->A);
330: MatZeroEntries(l->B);
332: return(0);
333: }
334: PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
335: {
336: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
338: PetscInt nt;
341: VecGetLocalSize(xx,&nt);
342: if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
343: VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
344: (*a->A->ops->multadd)(a->A,xx,yy,zz);
345: VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
346: (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);
347: return(0);
348: }
350: PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
351: {
352: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
354: PetscInt nt;
357: VecGetLocalSize(xx,&nt);
358: if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
359: (*a->B->ops->multtranspose)(a->B,xx,a->lvec);
360: (*a->A->ops->multtranspose)(a->A,xx,yy);
361: VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
362: VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
363: return(0);
364: }
366: PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
367: {
368: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
369: Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
372: switch (op) {
373: case MAT_CUSPARSE_MULT_DIAG:
374: cusparseStruct->diagGPUMatFormat = format;
375: break;
376: case MAT_CUSPARSE_MULT_OFFDIAG:
377: cusparseStruct->offdiagGPUMatFormat = format;
378: break;
379: case MAT_CUSPARSE_ALL:
380: cusparseStruct->diagGPUMatFormat = format;
381: cusparseStruct->offdiagGPUMatFormat = format;
382: break;
383: default:
384: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
385: }
386: return(0);
387: }
389: PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
390: {
391: MatCUSPARSEStorageFormat format;
392: PetscErrorCode ierr;
393: PetscBool flg;
394: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
395: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
398: PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");
399: if (A->factortype==MAT_FACTOR_NONE) {
400: PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
401: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
402: if (flg) {
403: MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);
404: }
405: PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
406: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);
407: if (flg) {
408: MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);
409: }
410: PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
411: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
412: if (flg) {
413: MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
414: }
415: }
416: PetscOptionsTail();
417: return(0);
418: }
420: PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
421: {
422: PetscErrorCode ierr;
423: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data;
424: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
425: PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;
427: MatAssemblyEnd_MPIAIJ(A,mode);
428: if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
429: VecSetType(mpiaij->lvec,VECSEQCUDA);
430: #if defined(PETSC_HAVE_NVSHMEM)
431: {
432: PetscMPIInt result;
433: PetscBool useNvshmem = PETSC_FALSE;
434: PetscOptionsGetBool(NULL,NULL,"-use_nvshmem",&useNvshmem,NULL);
435: if (useNvshmem) {
436: MPI_Comm_compare(PETSC_COMM_WORLD,PetscObjectComm((PetscObject)A),&result);
437: if (result == MPI_IDENT || result == MPI_CONGRUENT) {VecAllocateNVSHMEM_SeqCUDA(mpiaij->lvec);}
438: }
439: }
440: #endif
441: }
442: if (d_mat) {
443: A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
444: }
445: return(0);
446: }
448: PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
449: {
450: PetscErrorCode ierr;
451: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
452: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
453: cudaError_t err;
454: cusparseStatus_t stat;
457: if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
458: if (cusparseStruct->deviceMat) {
459: Mat_SeqAIJ *jaca = (Mat_SeqAIJ*)aij->A->data;
460: Mat_SeqAIJ *jacb = (Mat_SeqAIJ*)aij->B->data;
461: PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
462: PetscInfo(A,"Have device matrix\n");
463: err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
464: if (jaca->compressedrow.use) {
465: err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
466: }
467: if (jacb->compressedrow.use) {
468: err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
469: }
470: err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
471: err = cudaFree(d_mat);CHKERRCUDA(err);
472: }
473: try {
474: if (aij->A) { MatCUSPARSEClearHandle(aij->A); }
475: if (aij->B) { MatCUSPARSEClearHandle(aij->B); }
476: stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
477: /* We want cusparseStruct to use PetscDefaultCudaStream
478: if (cusparseStruct->stream) {
479: err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
480: }
481: */
482: delete cusparseStruct->coo_p;
483: delete cusparseStruct->coo_pw;
484: delete cusparseStruct;
485: } catch(char *ex) {
486: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
487: }
488: PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);
489: PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);
490: PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);
491: PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);
492: PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);
493: MatDestroy_MPIAIJ(A);
494: return(0);
495: }
497: PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
498: {
499: PetscErrorCode ierr;
500: Mat_MPIAIJ *a;
501: Mat_MPIAIJCUSPARSE *cusparseStruct;
502: cusparseStatus_t stat;
503: Mat A;
506: if (reuse == MAT_INITIAL_MATRIX) {
507: MatDuplicate(B,MAT_COPY_VALUES,newmat);
508: } else if (reuse == MAT_REUSE_MATRIX) {
509: MatCopy(B,*newmat,SAME_NONZERO_PATTERN);
510: }
511: A = *newmat;
512: A->boundtocpu = PETSC_FALSE;
513: PetscFree(A->defaultvectype);
514: PetscStrallocpy(VECCUDA,&A->defaultvectype);
516: a = (Mat_MPIAIJ*)A->data;
517: if (a->A) { MatSetType(a->A,MATSEQAIJCUSPARSE); }
518: if (a->B) { MatSetType(a->B,MATSEQAIJCUSPARSE); }
519: if (a->lvec) {
520: VecSetType(a->lvec,VECSEQCUDA);
521: }
523: if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
524: a->spptr = new Mat_MPIAIJCUSPARSE;
526: cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
527: cusparseStruct->diagGPUMatFormat = MAT_CUSPARSE_CSR;
528: cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
529: cusparseStruct->coo_p = NULL;
530: cusparseStruct->coo_pw = NULL;
531: cusparseStruct->stream = 0; /* We should not need cusparseStruct->stream */
532: stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
533: stat = cusparseSetStream(cusparseStruct->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
534: cusparseStruct->deviceMat = NULL;
535: }
537: A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE;
538: A->ops->mult = MatMult_MPIAIJCUSPARSE;
539: A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE;
540: A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE;
541: A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
542: A->ops->destroy = MatDestroy_MPIAIJCUSPARSE;
543: A->ops->zeroentries = MatZeroEntries_MPIAIJCUSPARSE;
544: A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
546: PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);
547: PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);
548: PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);
549: PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);
550: PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);
551: PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);
552: return(0);
553: }
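/*
  Usage sketch (illustrative, variable names are assumptions): an existing MPIAIJ matrix can be
  converted to this type in place or into a new matrix, e.g.

    MatConvert(B,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);        // convert B itself
    MatConvert(B,MATMPIAIJCUSPARSE,MAT_INITIAL_MATRIX,&Bgpu);     // or create a separate GPU copy

  The same conversion is reached at runtime with -mat_type mpiaijcusparse on a matrix that
  calls MatSetFromOptions().
*/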
555: PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
556: {
560: PetscCUDAInitializeCheck();
561: MatCreate_MPIAIJ(A);
562: MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);
563: return(0);
564: }
566: /*@
567: MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
568: (the default parallel PETSc format). This matrix will ultimately be pushed down
569: to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
570: assembly performance the user should preallocate the matrix storage by setting
571: the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these
572: parameters accurately, performance during matrix assembly can be increased by more than a factor of 50.
574: Collective
576: Input Parameters:
577: + comm - MPI communicator
578: . m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
579: . n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
580: . M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
581: . N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
582: . d_nz - number of nonzeros per row in the diagonal portion of the local submatrix (same for all local rows)
. d_nnz - array containing the number of nonzeros in the various rows of the diagonal portion of the local submatrix, or NULL
. o_nz - number of nonzeros per row in the off-diagonal portion of the local submatrix (same for all local rows)
- o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal portion of the local submatrix, or NULL
584: Output Parameter:
585: . A - the matrix
587: It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
588: MatXXXXSetPreallocation() paradigm instead of this routine directly.
589: [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation() or MatMPIAIJSetPreallocation()]
591: Notes:
592: If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored
594: The AIJ format (also called the Yale sparse matrix format or
595: compressed row storage), is fully compatible with standard Fortran 77
596: storage. That is, the stored row and column indices can begin at
597: either one (as in Fortran) or zero. See the users' manual for details.
599: Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
600: Set d_nz=PETSC_DEFAULT, o_nz=PETSC_DEFAULT and d_nnz=NULL, o_nnz=NULL for PETSc to
601: control dynamic memory allocation. For large problems you MUST preallocate memory
602: or you will get TERRIBLE performance; see the users' manual chapter on matrices.
604: By default, this format uses inodes (identical nodes) when possible, to
605: improve numerical efficiency of matrix-vector products and solves. We
606: search for consecutive rows with the same nonzero structure, thereby
607: reusing matrix information to achieve increased efficiency.
609: Level: intermediate
611: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
612: @*/
613: PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
614: {
616: PetscMPIInt size;
619: MatCreate(comm,A);
620: MatSetSizes(*A,m,n,M,N);
621: MPI_Comm_size(comm,&size);
622: if (size > 1) {
623: MatSetType(*A,MATMPIAIJCUSPARSE);
624: MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);
625: } else {
626: MatSetType(*A,MATSEQAIJCUSPARSE);
627: MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);
628: }
629: return(0);
630: }
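/*
  Example (illustrative only; communicator, sizes, and preallocation values are assumptions):

    Mat A;
    MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,100,100,5,NULL,2,NULL,&A);
    // ... insert entries with MatSetValues() (or the COO interface), then assemble ...
    MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
    MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
    MatDestroy(&A);
*/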
632: /*MC
633: MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.
635: A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
636: CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
637: All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.
639: This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
640: and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
641: MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
642: for communicators controlling multiple processes. It is recommended that you call both of
643: the above preallocation routines for simplicity.
645: Options Database Keys:
646: + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
647: . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
648: . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
649: - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
651: Level: beginner
653: .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
654: M*/
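/*
  Usage sketch for the options listed above (illustrative; A, M, and N are assumed to be
  declared by the caller):

    MatCreate(PETSC_COMM_WORLD,&A);
    MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,M,N);
    MatSetFromOptions(A);   // honors e.g. -mat_type aijcusparse -mat_cusparse_storage_format csr
    MatSetUp(A);
*/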
656: /*MC
657: MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.
659: Level: beginner
661: .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
662: M*/
664: // Get the GPU pointer to a stripped-down device representation of the Mat; works for both Seq and MPI Mat.
665: PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
666: {
667: #if defined(PETSC_USE_CTABLE)
668: SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
669: #else
670: PetscSplitCSRDataStructure **p_d_mat;
671: PetscMPIInt size,rank;
672: MPI_Comm comm;
673: PetscErrorCode ierr;
674: int *ai,*bi,*aj,*bj;
675: PetscScalar *aa,*ba;
678: PetscObjectGetComm((PetscObject)A,&comm);
679: MPI_Comm_size(comm,&size);
680: MPI_Comm_rank(comm,&rank);
681: if (A->factortype == MAT_FACTOR_NONE) {
682: CsrMatrix *matrixA,*matrixB=NULL;
683: if (size == 1) {
684: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
685: p_d_mat = &cusparsestruct->deviceMat;
686: Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
687: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
688: matrixA = (CsrMatrix*)matstruct->mat;
689: bi = bj = NULL; ba = NULL;
690: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
691: } else {
692: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
693: Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
694: p_d_mat = &spptr->deviceMat;
695: Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
696: Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
697: Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
698: Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
699: if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
700: if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
701: matrixA = (CsrMatrix*)matstructA->mat;
702: matrixB = (CsrMatrix*)matstructB->mat;
703: bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
704: bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
705: ba = thrust::raw_pointer_cast(matrixB->values->data());
706: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
707: }
708: ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
709: aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
710: aa = thrust::raw_pointer_cast(matrixA->values->data());
711: } else {
712: *B = NULL;
713: return(0);
714: }
715: // mimic the bookkeeping MatSetValues() would do on the host, since it is not called for device assembly
716: if (A->assembled) {
717: if (A->was_assembled) {
718: PetscInfo(A,"Assemble more than once already\n");
719: }
720: A->was_assembled = PETSC_TRUE; // this is done (lazily) in MatAssemble, which we no longer call; it is done in AIJ AssemblyEnd, is it still needed here?
721: } else {
722: SETERRQ(comm,PETSC_ERR_SUP,"Matrix must be assembled");
723: }
724: if (!*p_d_mat) {
725: cudaError_t err;
726: PetscSplitCSRDataStructure *d_mat, h_mat;
727: Mat_SeqAIJ *jaca;
728: PetscInt n = A->rmap->n, nnz;
729: // create and copy
730: PetscInfo(A,"Create device matrix\n");
731: err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
732: err = cudaMemset( d_mat, 0, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
733: *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
734: if (size == 1) {
735: jaca = (Mat_SeqAIJ*)A->data;
736: h_mat.rstart = 0; h_mat.rend = A->rmap->n;
737: h_mat.cstart = 0; h_mat.cend = A->cmap->n;
738: h_mat.offdiag.i = h_mat.offdiag.j = NULL;
739: h_mat.offdiag.a = NULL;
740: } else {
741: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
742: Mat_SeqAIJ *jacb;
743: jaca = (Mat_SeqAIJ*)aij->A->data;
744: jacb = (Mat_SeqAIJ*)aij->B->data;
745: if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
746: if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
747: // create colmap - this is usually done (lazily) in MatSetValues
748: aij->donotstash = PETSC_TRUE;
749: aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
750: jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
751: PetscCalloc1(A->cmap->N+1,&aij->colmap);
752: aij->colmap[A->cmap->N] = -9;
753: PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));
754: {
755: PetscInt ii;
756: for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
757: }
758: if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
759: // allocate B copy data
760: h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
761: h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
762: nnz = jacb->i[n];
764: if (jacb->compressedrow.use) {
765: err = cudaMalloc((void **)&h_mat.offdiag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
766: err = cudaMemcpy( h_mat.offdiag.i, jacb->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
767: } else h_mat.offdiag.i = bi;
768: h_mat.offdiag.j = bj;
769: h_mat.offdiag.a = ba;
771: err = cudaMalloc((void **)&h_mat.colmap, (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
772: err = cudaMemcpy( h_mat.colmap, aij->colmap, (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
773: h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
774: h_mat.offdiag.n = n;
775: }
776: // allocate A copy data
777: nnz = jaca->i[n];
778: h_mat.diag.n = n;
779: h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
780: MPI_Comm_rank(comm,&h_mat.rank);
781: if (jaca->compressedrow.use) {
782: err = cudaMalloc((void **)&h_mat.diag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
783: err = cudaMemcpy( h_mat.diag.i, jaca->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
784: } else {
785: h_mat.diag.i = ai;
786: }
787: h_mat.diag.j = aj;
788: h_mat.diag.a = aa;
789: // copy pointers and metadata to device
790: err = cudaMemcpy( d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
791: PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);
792: } else {
793: *B = *p_d_mat;
794: }
795: A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
796: return(0);
797: #endif
798: }
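/*
  Device-side usage sketch (an illustration under assumptions, not a PETSc API): the pointer
  returned in B can be dereferenced from a user CUDA kernel. The fields used below (diag.n,
  diag.i, diag.a) are the ones populated by this routine; the kernel itself is hypothetical.

    __global__ void ScaleDiagBlock(PetscSplitCSRDataStructure *d_mat,PetscScalar alpha)
    {
      const int nnz = d_mat->diag.i[d_mat->diag.n];   // the row-offset array has diag.n+1 entries
      for (int k = blockIdx.x*blockDim.x + threadIdx.x; k < nnz; k += gridDim.x*blockDim.x) {
        d_mat->diag.a[k] *= alpha;                    // scale every stored value of the diagonal block
      }
    }
*/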