Actual source code: mpiaijcusparse.cu
1: #define PETSC_SKIP_SPINLOCK
2: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
4: #include <petscconf.h>
5: #include <../src/mat/impls/aij/mpi/mpiaij.h>
6: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
7: #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
8: #include <thrust/advance.h>
9: #include <petscsf.h>
11: struct VecCUDAEquals
12: {
13: template <typename Tuple>
14: __host__ __device__
15: void operator()(Tuple t)
16: {
17: thrust::get<1>(t) = thrust::get<0>(t);
18: }
19: };
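/* The two routines below implement the generic MatSetPreallocationCOO()/MatSetValuesCOO()
   interface (composed as "MatSetPreallocationCOO_C" and "MatSetValuesCOO_C" further down).
   A caller-side sketch, with error checking omitted and array names being placeholders:

     MatSetType(A,MATMPIAIJCUSPARSE);
     MatSetPreallocationCOO(A,n,coo_i,coo_j);  // global row/col indices of the n local COO entries
     MatSetValuesCOO(A,v,INSERT_VALUES);       // v may live on the host or on the device (see isCudaMem() below)
*/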
21: static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
22: {
23: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
24: Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
25: PetscInt n = cusp->coo_nd + cusp->coo_no;
26: PetscErrorCode ierr;
29: if (cusp->coo_p && v) {
30: thrust::device_ptr<const PetscScalar> d_v;
31: THRUSTARRAY *w = NULL;
33: if (isCudaMem(v)) {
34: d_v = thrust::device_pointer_cast(v);
35: } else {
36: w = new THRUSTARRAY(n);
37: w->assign(v,v+n);
38: PetscLogCpuToGpu(n*sizeof(PetscScalar));
39: d_v = w->data();
40: }
42: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
43: cusp->coo_pw->begin()));
44: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
45: cusp->coo_pw->end()));
46: PetscLogGpuTimeBegin();
47: thrust::for_each(zibit,zieit,VecCUDAEquals());
48: PetscLogGpuTimeEnd();
49: delete w;
50: MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);
51: MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);
52: } else {
53: MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);
54: MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);
55: }
56: PetscObjectStateIncrease((PetscObject)A);
57: A->num_ass++;
58: A->assembled = PETSC_TRUE;
59: A->ass_nonzerostate = A->nonzerostate;
60: A->offloadmask = PETSC_OFFLOAD_GPU;
61: return(0);
62: }
64: template <typename Tuple>
65: struct IsNotOffDiagT
66: {
67: PetscInt _cstart,_cend;
69: IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
70: __host__ __device__
71: inline bool operator()(Tuple t)
72: {
73: return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
74: }
75: };
77: struct IsOffDiag
78: {
79: PetscInt _cstart,_cend;
81: IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
82: __host__ __device__
83: inline bool operator() (const PetscInt &c)
84: {
85: return c < _cstart || c >= _cend;
86: }
87: };
89: struct GlobToLoc
90: {
91: PetscInt _start;
93: GlobToLoc(PetscInt start) : _start(start) {}
94: __host__ __device__
95: inline PetscInt operator() (const PetscInt &c)
96: {
97: return c - _start;
98: }
99: };
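/* Preallocation from a list of COO entries: the entries are partitioned on the GPU into the
   diagonal block (columns in [cstart,cend)) followed by the off-diagonal block, indices are
   converted from global to local, the compacted off-diagonal column ids are gathered into
   b->garray, and the two blocks are handed to the SEQAIJCUSPARSE preallocation routine as
   b->A and b->B. */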
101: static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
102: {
103: Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data;
104: Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
105: PetscErrorCode ierr;
106: PetscInt *jj;
107: size_t noff = 0;
108:   THRUSTINTARRAY d_i(n); /* on device; stores the partitioned coo_i with diagonal entries first and off-diagonal entries next */
109: THRUSTINTARRAY d_j(n);
110: ISLocalToGlobalMapping l2g;
111: cudaError_t cerr;
114: PetscLayoutSetUp(B->rmap);
115: PetscLayoutSetUp(B->cmap);
116: if (b->A) { MatCUSPARSEClearHandle(b->A); }
117: if (b->B) { MatCUSPARSEClearHandle(b->B); }
118: PetscFree(b->garray);
119: VecDestroy(&b->lvec);
120: MatDestroy(&b->A);
121: MatDestroy(&b->B);
123: PetscLogCpuToGpu(2.*n*sizeof(PetscInt));
124: d_i.assign(coo_i,coo_i+n);
125: d_j.assign(coo_j,coo_j+n);
126: delete cusp->coo_p;
127: delete cusp->coo_pw;
128: cusp->coo_p = NULL;
129: cusp->coo_pw = NULL;
130: PetscLogGpuTimeBegin();
131: auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
132: auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
133: if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
134: cusp->coo_p = new THRUSTINTARRAY(n);
135: cusp->coo_pw = new THRUSTARRAY(n);
136: thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
137: auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
138: auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
139: auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
140: firstoffd = mzipp.get_iterator_tuple().get<1>();
141: }
142: cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
143: cusp->coo_no = thrust::distance(firstoffd,d_j.end());
145: /* from global to local */
146: thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
147: thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
148: PetscLogGpuTimeEnd();
150:   /* copy the off-diagonal column indices to the CPU to build the local-to-global mapping */
151: PetscMalloc1(cusp->coo_no,&jj); /* jj[] will store compacted col ids of the offdiag part */
152: cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
153: auto o_j = d_j.begin();
154: PetscLogGpuTimeBegin();
155:   thrust::advance(o_j,cusp->coo_nd); /* o_j now points to the start of the off-diagonal col ids; sort them and remove duplicates below */
156: thrust::sort(thrust::device,o_j,d_j.end());
157: auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
158: PetscLogGpuTimeEnd();
159: noff = thrust::distance(o_j,wit);
160: PetscMalloc1(noff,&b->garray);
161: cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
162: PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));
163: ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);
164: ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);
165: ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);
166:   if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected IS size %D != %D (COO size)",n,cusp->coo_no);
167: ISLocalToGlobalMappingDestroy(&l2g);
169: MatCreate(PETSC_COMM_SELF,&b->A);
170: MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);
171: MatSetType(b->A,MATSEQAIJCUSPARSE);
172: PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);
173: MatCreate(PETSC_COMM_SELF,&b->B);
174: MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);
175: MatSetType(b->B,MATSEQAIJCUSPARSE);
176: PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);
178:   /* GPU memory; the cusparse-specific call handles it internally */
179: MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());
180: MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);
181: PetscFree(jj);
183: MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);
184: MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);
185: MatCUSPARSESetHandle(b->A,cusp->handle);
186: MatCUSPARSESetHandle(b->B,cusp->handle);
187: /*
188: MatCUSPARSESetStream(b->A,cusp->stream);
189: MatCUSPARSESetStream(b->B,cusp->stream);
190: */
191: MatSetUpMultiply_MPIAIJ(B);
192: B->preallocated = PETSC_TRUE;
193: B->nonzerostate++;
195: MatBindToCPU(b->A,B->boundtocpu);
196: MatBindToCPU(b->B,B->boundtocpu);
197: B->offloadmask = PETSC_OFFLOAD_CPU;
198: B->assembled = PETSC_FALSE;
199: B->was_assembled = PETSC_FALSE;
200: return(0);
201: }
203: static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
204: {
205: Mat Ad,Ao;
206: const PetscInt *cmap;
210: MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);
211: MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);
212: if (glob) {
213: PetscInt cst, i, dn, on, *gidx;
215: MatGetLocalSize(Ad,NULL,&dn);
216: MatGetLocalSize(Ao,NULL,&on);
217: MatGetOwnershipRangeColumn(A,&cst,NULL);
218: PetscMalloc1(dn+on,&gidx);
219: for (i=0; i<dn; i++) gidx[i] = cst + i;
220: for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
221: ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);
222: }
223: return(0);
224: }
226: PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
227: {
228: Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data;
229: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
230: PetscErrorCode ierr;
231: PetscInt i;
234: PetscLayoutSetUp(B->rmap);
235: PetscLayoutSetUp(B->cmap);
236: if (PetscDefined(USE_DEBUG) && d_nnz) {
237: for (i=0; i<B->rmap->n; i++) {
238: if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
239: }
240: }
241: if (PetscDefined(USE_DEBUG) && o_nnz) {
242: for (i=0; i<B->rmap->n; i++) {
243: if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
244: }
245: }
246: #if defined(PETSC_USE_CTABLE)
247: PetscTableDestroy(&b->colmap);
248: #else
249: PetscFree(b->colmap);
250: #endif
251: PetscFree(b->garray);
252: VecDestroy(&b->lvec);
253: VecScatterDestroy(&b->Mvctx);
254:   /* Because B will have been resized we simply destroy it and create a new one each time */
255: MatDestroy(&b->B);
256: if (!b->A) {
257: MatCreate(PETSC_COMM_SELF,&b->A);
258: MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);
259: PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);
260: }
261: if (!b->B) {
262: PetscMPIInt size;
263: MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);
264: MatCreate(PETSC_COMM_SELF,&b->B);
265: MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);
266: PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);
267: }
268: MatSetType(b->A,MATSEQAIJCUSPARSE);
269: MatSetType(b->B,MATSEQAIJCUSPARSE);
270: MatBindToCPU(b->A,B->boundtocpu);
271: MatBindToCPU(b->B,B->boundtocpu);
272: MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);
273: MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);
274: MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);
275: MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);
276: MatCUSPARSESetHandle(b->A,cusparseStruct->handle);
277: MatCUSPARSESetHandle(b->B,cusparseStruct->handle);
278: /* Let A, B use b's handle with pre-set stream
279: MatCUSPARSESetStream(b->A,cusparseStruct->stream);
280: MatCUSPARSESetStream(b->B,cusparseStruct->stream);
281: */
282: B->preallocated = PETSC_TRUE;
283: return(0);
284: }
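/* The MatMult/MatMultAdd implementations below follow the usual MPIAIJ pattern: start the
   scatter that gathers the needed off-process entries of xx into a->lvec, apply the diagonal
   block while that communication is in flight, complete the scatter, and finally add the
   off-diagonal block contribution computed from a->lvec. */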
286: PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
287: {
288: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
290: PetscInt nt;
293: VecGetLocalSize(xx,&nt);
294: if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
295: VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
296: (*a->A->ops->mult)(a->A,xx,yy);
297: VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
298: (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);
299: return(0);
300: }
302: PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
303: {
304: Mat_MPIAIJ *l = (Mat_MPIAIJ*)A->data;
308: MatZeroEntries(l->A);
309: MatZeroEntries(l->B);
310: return(0);
311: }
313: PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
314: {
315: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
317: PetscInt nt;
320: VecGetLocalSize(xx,&nt);
321: if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
322: VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
323: (*a->A->ops->multadd)(a->A,xx,yy,zz);
324: VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
325: (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);
326: return(0);
327: }
329: PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
330: {
331: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
333: PetscInt nt;
336: VecGetLocalSize(xx,&nt);
337: if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
338: (*a->B->ops->multtranspose)(a->B,xx,a->lvec);
339: (*a->A->ops->multtranspose)(a->A,xx,yy);
340: VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
341: VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
342: return(0);
343: }
345: PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
346: {
347: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
348: Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
351: switch (op) {
352: case MAT_CUSPARSE_MULT_DIAG:
353: cusparseStruct->diagGPUMatFormat = format;
354: break;
355: case MAT_CUSPARSE_MULT_OFFDIAG:
356: cusparseStruct->offdiagGPUMatFormat = format;
357: break;
358: case MAT_CUSPARSE_ALL:
359: cusparseStruct->diagGPUMatFormat = format;
360: cusparseStruct->offdiagGPUMatFormat = format;
361: break;
362: default:
363:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
364: }
365: return(0);
366: }
368: PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
369: {
370: MatCUSPARSEStorageFormat format;
371: PetscErrorCode ierr;
372: PetscBool flg;
373: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
374: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
377: PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");
378: if (A->factortype==MAT_FACTOR_NONE) {
379: PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
380: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
381: if (flg) {
382: MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);
383: }
384: PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
385: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);
386: if (flg) {
387: MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);
388: }
389: PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
390: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
391: if (flg) {
392: MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
393: }
394: }
395: PetscOptionsTail();
396: return(0);
397: }
399: PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
400: {
401: PetscErrorCode ierr;
402: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data;
403: Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
404: PetscObjectState onnz = A->nonzerostate;
407: MatAssemblyEnd_MPIAIJ(A,mode);
408: if (mpiaij->lvec) { VecSetType(mpiaij->lvec,VECSEQCUDA); }
409: if (onnz != A->nonzerostate && cusp->deviceMat) {
410: PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
411: cudaError_t cerr;
413: PetscInfo(A,"Destroy device mat since nonzerostate changed\n");
414: PetscNew(&h_mat);
415: cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
416: cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
417: cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
418: PetscFree(h_mat);
419: cusp->deviceMat = NULL;
420: }
421: return(0);
422: }
424: PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
425: {
426: PetscErrorCode ierr;
427: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
428: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
429: cusparseStatus_t stat;
432: if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
433: if (cusparseStruct->deviceMat) {
434: PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;
435: cudaError_t cerr;
437: PetscInfo(A,"Have device matrix\n");
438: PetscNew(&h_mat);
439: cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
440: cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
441: cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
442: PetscFree(h_mat);
443: }
444: try {
445: if (aij->A) { MatCUSPARSEClearHandle(aij->A); }
446: if (aij->B) { MatCUSPARSEClearHandle(aij->B); }
447: stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
448: /* We want cusparseStruct to use PetscDefaultCudaStream
449: if (cusparseStruct->stream) {
450: cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
451: }
452: */
453: delete cusparseStruct->coo_p;
454: delete cusparseStruct->coo_pw;
455: delete cusparseStruct;
456: } catch(char *ex) {
457: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
458: }
459: PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);
460: PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);
461: PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);
462: PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);
463: PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);
464: PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);
465: MatDestroy_MPIAIJ(A);
466: return(0);
467: }
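/* Conversion from MPIAIJ: switches the sub-matrices and the local work vector to their CUDA
   types, allocates the Mat_MPIAIJCUSPARSE spptr with a cusparse handle, and installs the
   CUSPARSE-specific operations and composed functions. MatCreate_MPIAIJCUSPARSE below reuses
   this routine for the in-place case. */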
469: PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
470: {
471: PetscErrorCode ierr;
472: Mat_MPIAIJ *a;
473: Mat_MPIAIJCUSPARSE *cusparseStruct;
474: cusparseStatus_t stat;
475: Mat A;
478: if (reuse == MAT_INITIAL_MATRIX) {
479: MatDuplicate(B,MAT_COPY_VALUES,newmat);
480: } else if (reuse == MAT_REUSE_MATRIX) {
481: MatCopy(B,*newmat,SAME_NONZERO_PATTERN);
482: }
483: A = *newmat;
484: A->boundtocpu = PETSC_FALSE;
485: PetscFree(A->defaultvectype);
486: PetscStrallocpy(VECCUDA,&A->defaultvectype);
488: a = (Mat_MPIAIJ*)A->data;
489: if (a->A) { MatSetType(a->A,MATSEQAIJCUSPARSE); }
490: if (a->B) { MatSetType(a->B,MATSEQAIJCUSPARSE); }
491: if (a->lvec) {
492: VecSetType(a->lvec,VECSEQCUDA);
493: }
495: if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
496: a->spptr = new Mat_MPIAIJCUSPARSE;
498: cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
499: cusparseStruct->diagGPUMatFormat = MAT_CUSPARSE_CSR;
500: cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
501: cusparseStruct->coo_p = NULL;
502: cusparseStruct->coo_pw = NULL;
503: cusparseStruct->stream = 0;
504: cusparseStruct->deviceMat = NULL;
505: stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
506: }
508: A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE;
509: A->ops->mult = MatMult_MPIAIJCUSPARSE;
510: A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE;
511: A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE;
512: A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
513: A->ops->destroy = MatDestroy_MPIAIJCUSPARSE;
514: A->ops->zeroentries = MatZeroEntries_MPIAIJCUSPARSE;
515: A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
517: PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);
518: PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);
519: PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);
520: PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);
521: PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);
522: PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);
523: #if defined(PETSC_HAVE_HYPRE)
524: PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);
525: #endif
526: return(0);
527: }
529: PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
530: {
534: PetscCUDAInitializeCheck();
535: MatCreate_MPIAIJ(A);
536: MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);
537: return(0);
538: }
540: /*@
541: MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
542:    (the default parallel PETSc format). This matrix will ultimately be pushed down
543:    to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
544:    assembly performance the user should preallocate the matrix storage by setting
545:    the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these
546:    parameters accurately, performance during matrix assembly can be increased by more than a factor of 50.
548: Collective
550: Input Parameters:
551: +  comm - MPI communicator
552: .  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
553: .  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
554: .  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
555: .  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
556: .  d_nz - number of nonzeros per row in the diagonal portion of the local submatrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros per row in the diagonal portion of the local submatrix, or NULL
.  o_nz - number of nonzeros per row in the off-diagonal portion of the local submatrix (same for all local rows)
-  o_nnz - array containing the number of nonzeros per row in the off-diagonal portion of the local submatrix, or NULL
558: Output Parameter:
559: . A - the matrix
561: It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
562: MatXXXXSetPreallocation() paradigm instead of this routine directly.
563:    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation(); see the example usage below]
565: Notes:
566:    If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored
568: The AIJ format (also called the Yale sparse matrix format or
569: compressed row storage), is fully compatible with standard Fortran 77
570: storage. That is, the stored row and column indices can begin at
571: either one (as in Fortran) or zero. See the users' manual for details.
573:    Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
574:    Set d_nz=PETSC_DEFAULT, o_nz=PETSC_DEFAULT, d_nnz=NULL, and o_nnz=NULL for PETSc to control dynamic memory
575: allocation. For large problems you MUST preallocate memory or you
576: will get TERRIBLE performance, see the users' manual chapter on matrices.
578: By default, this format uses inodes (identical nodes) when possible, to
579: improve numerical efficiency of matrix-vector products and solves. We
580: search for consecutive rows with the same nonzero structure, thereby
581: reusing matrix information to achieve increased efficiency.
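   Example usage (a minimal sketch of the recommended MatCreate()/MatSetType()/preallocation
   paradigm described above; sizes and preallocation arguments are placeholders):
.vb
   Mat A;
   MatCreate(comm,&A);
   MatSetSizes(A,m,n,M,N);
   MatSetType(A,MATAIJCUSPARSE);
   MatSetFromOptions(A);
   MatMPIAIJSetPreallocation(A,d_nz,d_nnz,o_nz,o_nnz);
   MatSeqAIJSetPreallocation(A,d_nz,d_nnz);
.ve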
583: Level: intermediate
585: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
586: @*/
587: PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
588: {
590: PetscMPIInt size;
593: MatCreate(comm,A);
594: MatSetSizes(*A,m,n,M,N);
595: MPI_Comm_size(comm,&size);
596: if (size > 1) {
597: MatSetType(*A,MATMPIAIJCUSPARSE);
598: MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);
599: } else {
600: MatSetType(*A,MATSEQAIJCUSPARSE);
601: MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);
602: }
603: return(0);
604: }
606: /*MC
607:    MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.
609:    A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either the
610:    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
611:    All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.
613: This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
614: and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
615: MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
616: for communicators controlling multiple processes. It is recommended that you call both of
617: the above preallocation routines for simplicity.
619: Options Database Keys:
620: + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
621: . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
622: . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
623: - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
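   For example, to select this type and the hybrid storage format for both blocks from the
   command line (the program name is a placeholder):
.vb
   ./myapp -mat_type mpiaijcusparse -mat_cusparse_storage_format hyb
.ve
   The same choice can be made programmatically with MatCUSPARSESetFormat().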
625: Level: beginner
627: .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
628: M*/
630: /*MC
631:    MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.
633: Level: beginner
635: .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
636: M*/
638: PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
640: // Get GPU pointers to a stripped-down Mat; works for both sequential and MPI Mat.
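// Illustrative sketch (not part of the PETSc API; kernel name and launch configuration are
// placeholders): a user CUDA kernel can traverse the diagonal block of the returned split
// CSR structure through the fields filled in by this routine, e.g.
//
//   __global__ void RowSumsDiag(PetscSplitCSRDataStructure d_mat, PetscScalar *sums)
//   {
//     int r = blockIdx.x*blockDim.x + threadIdx.x;
//     if (r < d_mat->diag.n) {
//       PetscScalar s = 0;
//       for (int k = d_mat->diag.i[r]; k < d_mat->diag.i[r+1]; k++) s += d_mat->diag.a[k];
//       sums[r] = s;
//     }
//   }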
641: PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
642: {
643: PetscSplitCSRDataStructure d_mat;
644: PetscMPIInt size;
645: PetscErrorCode ierr;
646: int *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL;
647: PetscScalar *aa = NULL,*ba = NULL;
648: Mat_SeqAIJ *jaca = NULL;
649: Mat_SeqAIJCUSPARSE *cusparsestructA = NULL;
650: CsrMatrix *matrixA = NULL,*matrixB = NULL;
653: if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
654: if (A->factortype != MAT_FACTOR_NONE) {
655: *B = NULL;
656: return(0);
657: }
658: MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
659: if (size == 1) {
660: PetscBool isseqaij;
662: PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);
663: if (isseqaij) {
664: jaca = (Mat_SeqAIJ*)A->data;
665: if (!jaca->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
666: cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
667: d_mat = cusparsestructA->deviceMat;
668: MatSeqAIJCUSPARSECopyToGPU(A);
669: } else {
670: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
671: if (!aij->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
672: Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
673: jaca = (Mat_SeqAIJ*)aij->A->data;
674: cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
675: d_mat = spptr->deviceMat;
676: MatSeqAIJCUSPARSECopyToGPU(aij->A);
677: }
678: if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
679: Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
680: if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
681: matrixA = (CsrMatrix*)matstruct->mat;
682: bi = NULL;
683: bj = NULL;
684: ba = NULL;
685: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
686: } else {
687: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
688: if (!aij->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
689: jaca = (Mat_SeqAIJ*)aij->A->data;
690: Mat_SeqAIJ *jacb = (Mat_SeqAIJ*)aij->B->data;
691: Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
693: if (!A->nooffprocentries && !aij->donotstash) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
694: cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
695: Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
696: if (cusparsestructA->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
697: if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
698: MatSeqAIJCUSPARSECopyToGPU(aij->A);
699: MatSeqAIJCUSPARSECopyToGPU(aij->B);
700: Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
701: Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
702: if (!matstructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
703: if (!matstructB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
704: matrixA = (CsrMatrix*)matstructA->mat;
705: matrixB = (CsrMatrix*)matstructB->mat;
706: if (jacb->compressedrow.use) {
707: if (!cusparsestructB->rowoffsets_gpu) {
708: cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
709: cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
710: }
711: bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
712: } else {
713: bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
714: }
715: bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
716: ba = thrust::raw_pointer_cast(matrixB->values->data());
717: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
718: d_mat = spptr->deviceMat;
719: }
720: if (jaca->compressedrow.use) {
721: if (!cusparsestructA->rowoffsets_gpu) {
722: cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
723: cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
724: }
725: ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
726: } else {
727: ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
728: }
729: aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
730: aa = thrust::raw_pointer_cast(matrixA->values->data());
732: if (!d_mat) {
733: cudaError_t cerr;
734: PetscSplitCSRDataStructure h_mat;
736:     // create and populate the struct on the host, then copy it to the device
737: PetscInfo(A,"Create device matrix\n");
738: PetscNew(&h_mat);
739: cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr);
740: if (size > 1) { /* need the colmap array */
741: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
742: int *colmap;
743: PetscInt ii,n = aij->B->cmap->n,N = A->cmap->N;
745: if (n && !aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
747: PetscCalloc1(N+1,&colmap);
748: for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);
750: h_mat->offdiag.i = bi;
751: h_mat->offdiag.j = bj;
752: h_mat->offdiag.a = ba;
753: h_mat->offdiag.n = A->rmap->n;
755: cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(int));CHKERRCUDA(cerr);
756: cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(int),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
757: PetscFree(colmap);
758: }
759: h_mat->rstart = A->rmap->rstart;
760: h_mat->rend = A->rmap->rend;
761: h_mat->cstart = A->cmap->rstart;
762: h_mat->cend = A->cmap->rend;
763: h_mat->N = A->cmap->N;
764: h_mat->diag.i = ai;
765: h_mat->diag.j = aj;
766: h_mat->diag.a = aa;
767: h_mat->diag.n = A->rmap->n;
768: h_mat->rank = PetscGlobalRank;
769: // copy pointers and metadata to device
770: cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
771: PetscFree(h_mat);
772: } else {
773: PetscInfo(A,"Reusing device matrix\n");
774: }
775: *B = d_mat;
776: A->offloadmask = PETSC_OFFLOAD_GPU;
777: return(0);
778: }