Actual source code: mpiaijcusparse.cu

  1: #define PETSC_SKIP_SPINLOCK
  2: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

  4: #include <petscconf.h>
  5: #include <../src/mat/impls/aij/mpi/mpiaij.h>
  6: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
  7: #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
  8: #include <thrust/advance.h>
  9: #include <thrust/partition.h>
 10: #include <thrust/sort.h>
 11: #include <thrust/unique.h>
 12: #include <petscsf.h>

 14: struct VecCUDAEquals
 15: {
 16:   template <typename Tuple>
 17:   __host__ __device__
 18:   void operator()(Tuple t)
 19:   {
 20:     thrust::get<1>(t) = thrust::get<0>(t);
 21:   }
 22: };

 24: static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
 25: {
 26:   Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)mat->data;
 27:   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;

 29:   if (!cusparseStruct) return 0;
 30:   if (cusparseStruct->use_extended_coo) {
 31:     cudaFree(cusparseStruct->Aimap1_d);
 32:     cudaFree(cusparseStruct->Ajmap1_d);
 33:     cudaFree(cusparseStruct->Aperm1_d);
 34:     cudaFree(cusparseStruct->Bimap1_d);
 35:     cudaFree(cusparseStruct->Bjmap1_d);
 36:     cudaFree(cusparseStruct->Bperm1_d);
 37:     cudaFree(cusparseStruct->Aimap2_d);
 38:     cudaFree(cusparseStruct->Ajmap2_d);
 39:     cudaFree(cusparseStruct->Aperm2_d);
 40:     cudaFree(cusparseStruct->Bimap2_d);
 41:     cudaFree(cusparseStruct->Bjmap2_d);
 42:     cudaFree(cusparseStruct->Bperm2_d);
 43:     cudaFree(cusparseStruct->Cperm1_d);
 44:     cudaFree(cusparseStruct->sendbuf_d);
 45:     cudaFree(cusparseStruct->recvbuf_d);
 46:   }
 47:   cusparseStruct->use_extended_coo = PETSC_FALSE;
 48:   delete cusparseStruct->coo_p;
 49:   delete cusparseStruct->coo_pw;
 50:   cusparseStruct->coo_p  = NULL;
 51:   cusparseStruct->coo_pw = NULL;
 52:   return 0;
 53: }

 55: static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
 56: {
 57:   Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
 58:   Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
 59:   PetscInt           n = cusp->coo_nd + cusp->coo_no;

 61:   if (cusp->coo_p && v) {
 62:     thrust::device_ptr<const PetscScalar> d_v;
 63:     THRUSTARRAY                           *w = NULL;

 65:     if (isCudaMem(v)) {
 66:       d_v = thrust::device_pointer_cast(v);
 67:     } else {
 68:       w = new THRUSTARRAY(n);
 69:       w->assign(v,v+n);
 70:       PetscLogCpuToGpu(n*sizeof(PetscScalar));
 71:       d_v = w->data();
 72:     }

 74:     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
 75:                                                               cusp->coo_pw->begin()));
 76:     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
 77:                                                               cusp->coo_pw->end()));
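    /* gather the user-provided values into the split (diagonal-first) ordering computed at
       preallocation time: cusp->coo_pw[k] = d_v[cusp->coo_p[k]] for each COO entry k */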
 78:     PetscLogGpuTimeBegin();
 79:     thrust::for_each(zibit,zieit,VecCUDAEquals());
 80:     PetscLogGpuTimeEnd();
 81:     delete w;
 82:     MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,cusp->coo_pw->data().get(),imode);
 83:     MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);
 84:   } else {
 85:     MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,v,imode);
 86:     MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,v ? v+cusp->coo_nd : NULL,imode);
 87:   }
 88:   return 0;
 89: }

 91: template <typename Tuple>
 92: struct IsNotOffDiagT
 93: {
 94:   PetscInt _cstart,_cend;

 96:   IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
 97:   __host__ __device__
 98:   inline bool operator()(Tuple t)
 99:   {
100:     return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
101:   }
102: };

104: struct IsOffDiag
105: {
106:   PetscInt _cstart,_cend;

108:   IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
109:   __host__ __device__
110:   inline bool operator() (const PetscInt &c)
111:   {
112:     return c < _cstart || c >= _cend;
113:   }
114: };

116: struct GlobToLoc
117: {
118:   PetscInt _start;

120:   GlobToLoc(PetscInt start) : _start(start) {}
121:   __host__ __device__
122:   inline PetscInt operator() (const PetscInt &c)
123:   {
124:     return c - _start;
125:   }
126: };
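/* Worked example for the three helpers above, assuming this rank owns columns [cstart,cend) = [10,20):
   IsOffDiag(10,20)(c)  is true for c = 3 or c = 25 and false for c = 12 (12 falls in the diagonal block);
   IsNotOffDiagT(10,20) is the complement, applied to (i,j,perm) tuples so that thrust::partition below can
                        move diagonal-block entries (together with their permutation slots) to the front;
   GlobToLoc(10)(c)     returns c - 10, turning global column 12 into local column 2. */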

128: static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
129: {
130:   Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
131:   Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
132:   PetscInt               N,*jj;
133:   size_t                 noff = 0;
134:   THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
135:   THRUSTINTARRAY         d_j(n);
136:   ISLocalToGlobalMapping l2g;

138:   MatDestroy(&b->A);
139:   MatDestroy(&b->B);

141:   PetscLogCpuToGpu(2.*n*sizeof(PetscInt));
142:   d_i.assign(coo_i,coo_i+n);
143:   d_j.assign(coo_j,coo_j+n);
144:   PetscLogGpuTimeBegin();
145:   auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
146:   auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
147:   if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
148:     cusp->coo_p = new THRUSTINTARRAY(n);
149:     cusp->coo_pw = new THRUSTARRAY(n);
150:     thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
151:     auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
152:     auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
153:     auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
154:     firstoffd = mzipp.get_iterator_tuple().get<1>();
155:   }
156:   cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
157:   cusp->coo_no = thrust::distance(firstoffd,d_j.end());

159:   /* from global to local */
160:   thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
161:   thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
162:   PetscLogGpuTimeEnd();

164:   /* copy offdiag column indices to map on the CPU */
165:   PetscMalloc1(cusp->coo_no,&jj); /* jj[] will store compacted col ids of the offdiag part */
166:   cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);
167:   auto o_j = d_j.begin();
168:   PetscLogGpuTimeBegin();
169:   thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
170:   thrust::sort(thrust::device,o_j,d_j.end());
171:   auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
172:   PetscLogGpuTimeEnd();
173:   noff = thrust::distance(o_j,wit);
174:   /* allocate the garray, the columns of B will not be mapped in the multiply setup */
175:   PetscMalloc1(noff,&b->garray);
176:   cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);
177:   PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));
178:   ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);
179:   ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);
180:   ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj);
182:   ISLocalToGlobalMappingDestroy(&l2g);

184:   MatCreate(PETSC_COMM_SELF,&b->A);
185:   MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);
186:   MatSetType(b->A,MATSEQAIJCUSPARSE);
187:   PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);
188:   MatCreate(PETSC_COMM_SELF,&b->B);
189:   MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);
190:   MatSetType(b->B,MATSEQAIJCUSPARSE);
191:   PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);

193:   /* GPU memory, cusparse specific call handles it internally */
194:   MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());
195:   MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);
196:   PetscFree(jj);

198:   MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);
199:   MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);

201:   MatBindToCPU(b->A,B->boundtocpu);
202:   MatBindToCPU(b->B,B->boundtocpu);
203:   MatSetUpMultiply_MPIAIJ(B);
204:   return 0;
205: }

207: static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
208: {
209:   Mat_MPIAIJ         *mpiaij    = (Mat_MPIAIJ*)mat->data;
210:   Mat_MPIAIJCUSPARSE *mpidev;
211:   PetscBool           coo_basic = PETSC_TRUE;
212:   PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;
213:   PetscInt            rstart,rend;

215:   PetscFree(mpiaij->garray);
216:   VecDestroy(&mpiaij->lvec);
217: #if defined(PETSC_USE_CTABLE)
218:   PetscTableDestroy(&mpiaij->colmap);
219: #else
220:   PetscFree(mpiaij->colmap);
221: #endif
222:   VecScatterDestroy(&mpiaij->Mvctx);
223:   mat->assembled     = PETSC_FALSE;
224:   mat->was_assembled = PETSC_FALSE;
225:   MatResetPreallocationCOO_MPIAIJ(mat);
226:   MatResetPreallocationCOO_MPIAIJCUSPARSE(mat);
227:   if (coo_i) {
228:     PetscLayoutGetRange(mat->rmap,&rstart,&rend);
229:     PetscGetMemType(coo_i,&mtype);
230:     if (PetscMemTypeHost(mtype)) {
231:       for (PetscCount k=0; k<coo_n; k++) { /* Are there negative indices or remote entries? */
232:         if (coo_i[k]<0 || coo_i[k]<rstart || coo_i[k]>=rend || coo_j[k]<0) {coo_basic = PETSC_FALSE; break;}
233:       }
234:     }
235:   }
236:   /* All ranks must agree on the value of coo_basic */
237:   MPI_Allreduce(MPI_IN_PLACE,&coo_basic,1,MPIU_BOOL,MPI_LAND,PetscObjectComm((PetscObject)mat));
238:   if (coo_basic) {
239:     MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j);
240:   } else {
241:     MatSetPreallocationCOO_MPIAIJ(mat,coo_n,coo_i,coo_j);
242:     mat->offloadmask = PETSC_OFFLOAD_CPU;
243:     /* creates the GPU memory */
244:     MatSeqAIJCUSPARSECopyToGPU(mpiaij->A);
245:     MatSeqAIJCUSPARSECopyToGPU(mpiaij->B);
246:     mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
247:     mpidev->use_extended_coo = PETSC_TRUE;

249:     cudaMalloc((void**)&mpidev->Aimap1_d,mpiaij->Annz1*sizeof(PetscCount));
250:     cudaMalloc((void**)&mpidev->Ajmap1_d,(mpiaij->Annz1+1)*sizeof(PetscCount));
251:     cudaMalloc((void**)&mpidev->Aperm1_d,mpiaij->Atot1*sizeof(PetscCount));

253:     cudaMalloc((void**)&mpidev->Bimap1_d,mpiaij->Bnnz1*sizeof(PetscCount));
254:     cudaMalloc((void**)&mpidev->Bjmap1_d,(mpiaij->Bnnz1+1)*sizeof(PetscCount));
255:     cudaMalloc((void**)&mpidev->Bperm1_d,mpiaij->Btot1*sizeof(PetscCount));

257:     cudaMalloc((void**)&mpidev->Aimap2_d,mpiaij->Annz2*sizeof(PetscCount));
258:     cudaMalloc((void**)&mpidev->Ajmap2_d,(mpiaij->Annz2+1)*sizeof(PetscCount));
259:     cudaMalloc((void**)&mpidev->Aperm2_d,mpiaij->Atot2*sizeof(PetscCount));

261:     cudaMalloc((void**)&mpidev->Bimap2_d,mpiaij->Bnnz2*sizeof(PetscCount));
262:     cudaMalloc((void**)&mpidev->Bjmap2_d,(mpiaij->Bnnz2+1)*sizeof(PetscCount));
263:     cudaMalloc((void**)&mpidev->Bperm2_d,mpiaij->Btot2*sizeof(PetscCount));

265:     cudaMalloc((void**)&mpidev->Cperm1_d,mpiaij->sendlen*sizeof(PetscCount));
266:     cudaMalloc((void**)&mpidev->sendbuf_d,mpiaij->sendlen*sizeof(PetscScalar));
267:     cudaMalloc((void**)&mpidev->recvbuf_d,mpiaij->recvlen*sizeof(PetscScalar));

269:     cudaMemcpy(mpidev->Aimap1_d,mpiaij->Aimap1,mpiaij->Annz1*sizeof(PetscCount),cudaMemcpyHostToDevice);
270:     cudaMemcpy(mpidev->Ajmap1_d,mpiaij->Ajmap1,(mpiaij->Annz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);
271:     cudaMemcpy(mpidev->Aperm1_d,mpiaij->Aperm1,mpiaij->Atot1*sizeof(PetscCount),cudaMemcpyHostToDevice);

273:     cudaMemcpy(mpidev->Bimap1_d,mpiaij->Bimap1,mpiaij->Bnnz1*sizeof(PetscCount),cudaMemcpyHostToDevice);
274:     cudaMemcpy(mpidev->Bjmap1_d,mpiaij->Bjmap1,(mpiaij->Bnnz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);
275:     cudaMemcpy(mpidev->Bperm1_d,mpiaij->Bperm1,mpiaij->Btot1*sizeof(PetscCount),cudaMemcpyHostToDevice);

277:     cudaMemcpy(mpidev->Aimap2_d,mpiaij->Aimap2,mpiaij->Annz2*sizeof(PetscCount),cudaMemcpyHostToDevice);
278:     cudaMemcpy(mpidev->Ajmap2_d,mpiaij->Ajmap2,(mpiaij->Annz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);
279:     cudaMemcpy(mpidev->Aperm2_d,mpiaij->Aperm2,mpiaij->Atot2*sizeof(PetscCount),cudaMemcpyHostToDevice);

281:     cudaMemcpy(mpidev->Bimap2_d,mpiaij->Bimap2,mpiaij->Bnnz2*sizeof(PetscCount),cudaMemcpyHostToDevice);
282:     cudaMemcpy(mpidev->Bjmap2_d,mpiaij->Bjmap2,(mpiaij->Bnnz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);
283:     cudaMemcpy(mpidev->Bperm2_d,mpiaij->Bperm2,mpiaij->Btot2*sizeof(PetscCount),cudaMemcpyHostToDevice);

285:     cudaMemcpy(mpidev->Cperm1_d,mpiaij->Cperm1,mpiaij->sendlen*sizeof(PetscCount),cudaMemcpyHostToDevice);
286:   }
287:   return 0;
288: }

290: __global__ void MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[])
291: {
292:   PetscCount       i = blockIdx.x*blockDim.x + threadIdx.x;
293:   const PetscCount grid_size = gridDim.x * blockDim.x;
294:   for (; i<nnz; i+= grid_size) buf[i] = kv[perm[i]];
295: }

297: __global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount imap[],const PetscCount jmap[],const PetscCount perm[],PetscScalar a[])
298: {
299:   PetscCount       i = blockIdx.x*blockDim.x + threadIdx.x;
300:   const PetscCount grid_size  = gridDim.x * blockDim.x;
301:   for (; i<nnz; i += grid_size) {for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) a[imap[i]] += kv[perm[k]];}
302: }
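/* Indexing sketch for the two kernels above (a small, hypothetical instance).  Both use a grid-stride
   loop, so any <<<blocks,threads>>> launch covers all nnz entries.  MatPackCOOValues is a plain gather,
   buf[i] = kv[perm[i]].  MatAddCOOValues folds every repeated COO input into its unique nonzero: entry i
   of the CSR value array receives the sum of kv[perm[k]] for k in [jmap[i], jmap[i+1]).  For example,
   with jmap = {0,2,3}, imap = {4,7}, perm = {0,5,2} it performs a[4] += kv[0] + kv[5] and a[7] += kv[2]. */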

304: static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat,const PetscScalar v[],InsertMode imode)
305: {
306:   Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
307:   Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
308:   Mat                 A      = mpiaij->A,B = mpiaij->B;
309:   PetscCount          Annz1  = mpiaij->Annz1,Annz2 = mpiaij->Annz2,Bnnz1 = mpiaij->Bnnz1,Bnnz2 = mpiaij->Bnnz2;
310:   PetscScalar        *Aa,*Ba = NULL;
311:   PetscScalar        *vsend  = mpidev->sendbuf_d,*v2 = mpidev->recvbuf_d;
312:   const PetscScalar  *v1     = v;
313:   const PetscCount   *Ajmap1 = mpidev->Ajmap1_d,*Ajmap2 = mpidev->Ajmap2_d,*Aimap1 = mpidev->Aimap1_d,*Aimap2 = mpidev->Aimap2_d;
314:   const PetscCount   *Bjmap1 = mpidev->Bjmap1_d,*Bjmap2 = mpidev->Bjmap2_d,*Bimap1 = mpidev->Bimap1_d,*Bimap2 = mpidev->Bimap2_d;
315:   const PetscCount   *Aperm1 = mpidev->Aperm1_d,*Aperm2 = mpidev->Aperm2_d,*Bperm1 = mpidev->Bperm1_d,*Bperm2 = mpidev->Bperm2_d;
316:   const PetscCount   *Cperm1 = mpidev->Cperm1_d;
317:   PetscMemType        memtype;

319:   if (mpidev->use_extended_coo) {
320:     PetscMPIInt size;

322:     MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size);
323:     PetscGetMemType(v,&memtype);
324:     if (PetscMemTypeHost(memtype)) { /* if the user passed v[] in host memory, copy it to the device */
325:       cudaMalloc((void**)&v1,mpiaij->coo_n*sizeof(PetscScalar));
326:       cudaMemcpy((void*)v1,v,mpiaij->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice);
327:     }

329:     if (imode == INSERT_VALUES) {
330:       MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa); /* write matrix values */
331:       cudaMemset(Aa,0,((Mat_SeqAIJ*)A->data)->nz*sizeof(PetscScalar));
332:       if (size > 1) {
333:         MatSeqAIJCUSPARSEGetArrayWrite(B,&Ba);
334:         cudaMemset(Ba,0,((Mat_SeqAIJ*)B->data)->nz*sizeof(PetscScalar));
335:       }
336:     } else {
337:       MatSeqAIJCUSPARSEGetArray(A,&Aa); /* read & write matrix values */
338:       if (size > 1) MatSeqAIJCUSPARSEGetArray(B,&Ba);
339:     }

341:     /* Pack entries to be sent to remote */
342:     if (mpiaij->sendlen) {
343:       MatPackCOOValues<<<(mpiaij->sendlen+255)/256,256>>>(v1,mpiaij->sendlen,Cperm1,vsend);
344:       cudaPeekAtLastError();
345:     }

347:     /* Send remote entries to their owner and overlap the communication with local computation */
348:     if (size > 1) PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_CUDA,vsend,PETSC_MEMTYPE_CUDA,v2,MPI_REPLACE);
349:     /* Add local entries to A and B */
350:     if (Annz1) {
351:       MatAddCOOValues<<<(Annz1+255)/256,256>>>(v1,Annz1,Aimap1,Ajmap1,Aperm1,Aa);
352:       cudaPeekAtLastError();
353:     }
354:     if (Bnnz1) {
355:       MatAddCOOValues<<<(Bnnz1+255)/256,256>>>(v1,Bnnz1,Bimap1,Bjmap1,Bperm1,Ba);
356:       cudaPeekAtLastError();
357:     }
358:     if (size > 1) PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend,v2,MPI_REPLACE);

360:     /* Add received remote entries to A and B */
361:     if (Annz2) {
362:       MatAddCOOValues<<<(Annz2+255)/256,256>>>(v2,Annz2,Aimap2,Ajmap2,Aperm2,Aa);
363:       cudaPeekAtLastError();
364:     }
365:     if (Bnnz2) {
366:       MatAddCOOValues<<<(Bnnz2+255)/256,256>>>(v2,Bnnz2,Bimap2,Bjmap2,Bperm2,Ba);
367:       cudaPeekAtLastError();
368:     }

370:     if (imode == INSERT_VALUES) {
371:       MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa);
372:       if (size > 1) MatSeqAIJCUSPARSERestoreArrayWrite(B,&Ba);
373:     } else {
374:       MatSeqAIJCUSPARSERestoreArray(A,&Aa);
375:       if (size > 1) MatSeqAIJCUSPARSERestoreArray(B,&Ba);
376:     }
377:     if (PetscMemTypeHost(memtype)) cudaFree((void*)v1);
378:   } else {
379:     MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat,v,imode);
380:   }
381:   mat->offloadmask = PETSC_OFFLOAD_GPU;
382:   return 0;
383: }
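/* Minimal caller-side sketch of the COO assembly path implemented above.  The names mlocal, nlocal,
   ncoo, coo_i, coo_j, and vals are hypothetical user data; vals may live in host or CUDA memory,
   which PetscGetMemType() above detects.

     Mat A;
     MatCreate(PETSC_COMM_WORLD,&A);
     MatSetSizes(A,mlocal,nlocal,PETSC_DETERMINE,PETSC_DETERMINE);
     MatSetType(A,MATMPIAIJCUSPARSE);
     MatSetPreallocationCOO(A,ncoo,coo_i,coo_j);   // builds the imap/jmap/perm arrays mirrored to the GPU above
     MatSetValuesCOO(A,vals,ADD_VALUES);           // dispatches to MatSetValuesCOO_MPIAIJCUSPARSE
     MatSetValuesCOO(A,vals,INSERT_VALUES);        // values may be reset repeatedly for the same pattern
*/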

385: static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
386: {
387:   Mat             Ad,Ao;
388:   const PetscInt *cmap;

390:   MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);
391:   MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);
392:   if (glob) {
393:     PetscInt cst, i, dn, on, *gidx;

395:     MatGetLocalSize(Ad,NULL,&dn);
396:     MatGetLocalSize(Ao,NULL,&on);
397:     MatGetOwnershipRangeColumn(A,&cst,NULL);
398:     PetscMalloc1(dn+on,&gidx);
399:     for (i = 0; i<dn; i++) gidx[i]    = cst + i;
400:     for (i = 0; i<on; i++) gidx[i+dn] = cmap[i];
401:     ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);
402:   }
403:   return 0;
404: }

406: PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
407: {
408:   Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data;
409:   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
410:   PetscInt           i;

412:   PetscLayoutSetUp(B->rmap);
413:   PetscLayoutSetUp(B->cmap);
414:   if (PetscDefined(USE_DEBUG) && d_nnz) {
415:     for (i=0; i<B->rmap->n; i++) {
416:       PetscCheck(d_nnz[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
417:     }
418:   }
419:   if (PetscDefined(USE_DEBUG) && o_nnz) {
420:     for (i=0; i<B->rmap->n; i++) {
421:       PetscCheck(o_nnz[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
422:     }
423:   }
424: #if defined(PETSC_USE_CTABLE)
425:   PetscTableDestroy(&b->colmap);
426: #else
427:   PetscFree(b->colmap);
428: #endif
429:   PetscFree(b->garray);
430:   VecDestroy(&b->lvec);
431:   VecScatterDestroy(&b->Mvctx);
432:   /* Because the off-diagonal matrix b->B will have been resized we simply destroy it and create a new one each time */
433:   MatDestroy(&b->B);
434:   if (!b->A) {
435:     MatCreate(PETSC_COMM_SELF,&b->A);
436:     MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);
437:     PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);
438:   }
439:   if (!b->B) {
440:     PetscMPIInt size;
441:     MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);
442:     MatCreate(PETSC_COMM_SELF,&b->B);
443:     MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);
444:     PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);
445:   }
446:   MatSetType(b->A,MATSEQAIJCUSPARSE);
447:   MatSetType(b->B,MATSEQAIJCUSPARSE);
448:   MatBindToCPU(b->A,B->boundtocpu);
449:   MatBindToCPU(b->B,B->boundtocpu);
450:   MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);
451:   MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);
452:   MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);
453:   MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);
454:   B->preallocated = PETSC_TRUE;
455:   return 0;
456: }

458: PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
459: {
460:   Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;

462:   VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
463:   (*a->A->ops->mult)(a->A,xx,yy);
464:   VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
465:   (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);
466:   return 0;
467: }

469: PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
470: {
471:   Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;

473:   MatZeroEntries(l->A);
474:   MatZeroEntries(l->B);
475:   return 0;
476: }

478: PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
479: {
480:   Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;

482:   VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
483:   (*a->A->ops->multadd)(a->A,xx,yy,zz);
484:   VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
485:   (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);
486:   return 0;
487: }

489: PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
490: {
491:   Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;

493:   (*a->B->ops->multtranspose)(a->B,xx,a->lvec);
494:   (*a->A->ops->multtranspose)(a->A,xx,yy);
495:   VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
496:   VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
497:   return 0;
498: }

500: PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
501: {
502:   Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
503:   Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

505:   switch (op) {
506:   case MAT_CUSPARSE_MULT_DIAG:
507:     cusparseStruct->diagGPUMatFormat = format;
508:     break;
509:   case MAT_CUSPARSE_MULT_OFFDIAG:
510:     cusparseStruct->offdiagGPUMatFormat = format;
511:     break;
512:   case MAT_CUSPARSE_ALL:
513:     cusparseStruct->diagGPUMatFormat    = format;
514:     cusparseStruct->offdiagGPUMatFormat = format;
515:     break;
516:   default:
517:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op);
518:   }
519:   return 0;
520: }

522: PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
523: {
524:   MatCUSPARSEStorageFormat format;
525:   PetscBool                flg;
526:   Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
527:   Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

529:   PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");
530:   if (A->factortype==MAT_FACTOR_NONE) {
531:     PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
532:                              "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg));
533:     if (flg) MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);
534:     PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
535:                              "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg));
536:     if (flg) MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);
537:     PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
538:                              "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg));
539:     if (flg) MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
540:   }
541:   PetscOptionsTail();
542:   return 0;
543: }

545: PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
546: {
547:   Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
548:   Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
549:   PetscObjectState   onnz = A->nonzerostate;

551:   MatAssemblyEnd_MPIAIJ(A,mode);
552:   if (mpiaij->lvec) VecSetType(mpiaij->lvec,VECSEQCUDA);
553:   if (onnz != A->nonzerostate && cusp->deviceMat) {
554:     PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;

556:     PetscInfo(A,"Destroy device mat since nonzerostate changed\n");
557:     PetscNew(&h_mat);
558:     cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);
559:     cudaFree(h_mat->colmap);
560:     if (h_mat->allocated_indices) {
561:       cudaFree(h_mat->diag.i);
562:       cudaFree(h_mat->diag.j);
563:       if (h_mat->offdiag.j) {
564:         cudaFree(h_mat->offdiag.i);
565:         cudaFree(h_mat->offdiag.j);
566:       }
567:     }
568:     cudaFree(d_mat);
569:     PetscFree(h_mat);
570:     cusp->deviceMat = NULL;
571:   }
572:   return 0;
573: }

575: PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
576: {
577:   Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
578:   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;

581:   if (cusparseStruct->deviceMat) {
582:     PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

584:     PetscInfo(A,"Have device matrix\n");
585:     PetscNew(&h_mat);
586:     cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);
587:     cudaFree(h_mat->colmap);
588:     if (h_mat->allocated_indices) {
589:       cudaFree(h_mat->diag.i);
590:       cudaFree(h_mat->diag.j);
591:       if (h_mat->offdiag.j) {
592:         cudaFree(h_mat->offdiag.i);
593:         cudaFree(h_mat->offdiag.j);
594:       }
595:     }
596:     cudaFree(d_mat);
597:     PetscFree(h_mat);
598:   }
599:   /* Free COO */
600:   MatResetPreallocationCOO_MPIAIJCUSPARSE(A);
601:   delete cusparseStruct;
602:   PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);
603:   PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);
604:   PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);
605:   PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);
606:   PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);
607:   PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);
608:   MatDestroy_MPIAIJ(A);
609:   return 0;
610: }

612: PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
613: {
614:   Mat_MPIAIJ *a;
615:   Mat         A;

617:   PetscDeviceInitialize(PETSC_DEVICE_CUDA);
618:   if (reuse == MAT_INITIAL_MATRIX) MatDuplicate(B,MAT_COPY_VALUES,newmat);
619:   else if (reuse == MAT_REUSE_MATRIX) MatCopy(B,*newmat,SAME_NONZERO_PATTERN);
620:   A = *newmat;
621:   A->boundtocpu = PETSC_FALSE;
622:   PetscFree(A->defaultvectype);
623:   PetscStrallocpy(VECCUDA,&A->defaultvectype);

625:   a = (Mat_MPIAIJ*)A->data;
626:   if (a->A) MatSetType(a->A,MATSEQAIJCUSPARSE);
627:   if (a->B) MatSetType(a->B,MATSEQAIJCUSPARSE);
628:   if (a->lvec) VecSetType(a->lvec,VECSEQCUDA);

630:   if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
631:     a->spptr = new Mat_MPIAIJCUSPARSE;
632:   }

634:   A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
635:   A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
636:   A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
637:   A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
638:   A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
639:   A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
640:   A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
641:   A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

643:   PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);
644:   PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);
645:   PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);
646:   PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);
647:   PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);
648:   PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);
649: #if defined(PETSC_HAVE_HYPRE)
650:   PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);
651: #endif
652:   return 0;
653: }

655: PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
656: {
657:   PetscDeviceInitialize(PETSC_DEVICE_CUDA);
658:   MatCreate_MPIAIJ(A);
659:   MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);
660:   return 0;
661: }

663: /*@
664:    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
665:    (the default parallel PETSc format).  This matrix will ultimately be pushed down
666:    to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
667:    assembly performance the user should preallocate the matrix storage by setting
668:    the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By setting these
669:    parameters accurately, performance during matrix assembly can be increased by more than a factor of 50.

671:    Collective

673:    Input Parameters:
674: +  comm - MPI communicator
675: .  m,n - number of local rows and columns (or PETSC_DECIDE)
676: .  M,N - number of global rows and columns (or PETSC_DETERMINE)
677: .  d_nz,d_nnz - number of nonzeros per row of the DIAGONAL portion of the local submatrix (same for all local rows),
678:          or an array containing the number of nonzeros in the various local rows (possibly different for each row) or NULL
679: -  o_nz,o_nnz - as above, but for the OFF-DIAGONAL portion of the local submatrix

681:    Output Parameter:
682: .  A - the matrix

684:    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
685:    MatXXXXSetPreallocation() paradigm instead of this routine directly.
686:    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

688:    Notes:
689:    If d_nnz (respectively o_nnz) is given then d_nz (respectively o_nz) is ignored

691:    The AIJ format (also called the Yale sparse matrix format or
692:    compressed row storage), is fully compatible with standard Fortran 77
693:    storage.  That is, the stored row and column indices can begin at
694:    either one (as in Fortran) or zero.  See the users' manual for details.

696:    Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
697:    Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic memory
698:    allocation.  For large problems you MUST preallocate memory or you
699:    will get TERRIBLE performance; see the users' manual chapter on matrices.

701:    By default, this format uses inodes (identical nodes) when possible, to
702:    improve numerical efficiency of matrix-vector products and solves. We
703:    search for consecutive rows with the same nonzero structure, thereby
704:    reusing matrix information to achieve increased efficiency.

706:    Level: intermediate

708: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
709: @*/
710: PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
711: {
712:   PetscMPIInt size;

714:   MatCreate(comm,A);
715:   MatSetSizes(*A,m,n,M,N);
716:   MPI_Comm_size(comm,&size);
717:   if (size > 1) {
718:     MatSetType(*A,MATMPIAIJCUSPARSE);
719:     MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);
720:   } else {
721:     MatSetType(*A,MATSEQAIJCUSPARSE);
722:     MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);
723:   }
724:   return 0;
725: }
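/* Sketch of the MatCreate()/MatSetType()/MatXXXXSetPreallocation() paradigm recommended in the manual
   page above (comm, nrows, ncols, d_nnz, and o_nnz are hypothetical caller data):

     Mat A;
     MatCreate(comm,&A);
     MatSetSizes(A,nrows,ncols,PETSC_DETERMINE,PETSC_DETERMINE);
     MatSetType(A,MATAIJCUSPARSE);                   // resolves to MATSEQAIJCUSPARSE or MATMPIAIJCUSPARSE
     MatSetFromOptions(A);
     MatSeqAIJSetPreallocation(A,0,d_nnz);           // calling both preallocation routines is safe;
     MatMPIAIJSetPreallocation(A,0,d_nnz,0,o_nnz);   // the one that does not match the type is ignored
*/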

727: /*MC
728:    MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.

730:    A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
731:    CSR, ELL, or hybrid (HYB) format. The ELL and HYB formats require CUDA 4.2 or later.
732:    All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

734:    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
735:    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
736:    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
737:    for communicators controlling multiple processes.  It is recommended that you call both of
738:    the above preallocation routines for simplicity.

740:    Options Database Keys:
741: +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
742: .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
743: .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
744: -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

746:   Level: beginner

748: .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
749: M*/
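/* The options database keys listed above have programmatic equivalents, e.g. (a sketch, assuming A is
   a MATMPIAIJCUSPARSE matrix):

     MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,MAT_CUSPARSE_ELL);     // diagonal block in ELL
     MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,MAT_CUSPARSE_HYB);  // off-diagonal block in HYB
     MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_CSR);           // both blocks in CSR
*/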

751: /*MC
752:    MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.

754:   Level: beginner

756: .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
757: M*/

759: // get GPU pointers to a stripped-down Mat, for both Seq and MPI Mat
760: PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
761: {
762:   PetscSplitCSRDataStructure d_mat;
763:   PetscMPIInt                size;
764:   int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL;
765:   PetscScalar                *aa = NULL,*ba = NULL;
766:   Mat_SeqAIJ                 *jaca = NULL, *jacb = NULL;
767:   Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
768:   CsrMatrix                  *matrixA = NULL,*matrixB = NULL;

771:   if (A->factortype != MAT_FACTOR_NONE) {
772:     *B = NULL;
773:     return 0;
774:   }
775:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
776:   // get jaca
777:   if (size == 1) {
778:     PetscBool isseqaij;

780:     PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);
781:     if (isseqaij) {
782:       jaca = (Mat_SeqAIJ*)A->data;
784:       cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
785:       d_mat = cusparsestructA->deviceMat;
786:       MatSeqAIJCUSPARSECopyToGPU(A);
787:     } else {
788:       Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
790:       Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
791:       jaca = (Mat_SeqAIJ*)aij->A->data;
792:       cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
793:       d_mat = spptr->deviceMat;
794:       MatSeqAIJCUSPARSECopyToGPU(aij->A);
795:     }
796:     if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
797:       Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
799:       matrixA = (CsrMatrix*)matstruct->mat;
800:       bi = NULL;
801:       bj = NULL;
802:       ba = NULL;
803:     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
804:   } else {
805:     Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
807:     jaca = (Mat_SeqAIJ*)aij->A->data;
808:     jacb = (Mat_SeqAIJ*)aij->B->data;
809:     Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;

812:     cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
813:     Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
815:     if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
816:       MatSeqAIJCUSPARSECopyToGPU(aij->A);
817:       MatSeqAIJCUSPARSECopyToGPU(aij->B);
818:       Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
819:       Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
822:       matrixA = (CsrMatrix*)matstructA->mat;
823:       matrixB = (CsrMatrix*)matstructB->mat;
824:       if (jacb->compressedrow.use) {
825:         if (!cusparsestructB->rowoffsets_gpu) {
826:           cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
827:           cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
828:         }
829:         bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
830:       } else {
831:         bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
832:       }
833:       bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
834:       ba = thrust::raw_pointer_cast(matrixB->values->data());
835:     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
836:     d_mat = spptr->deviceMat;
837:   }
838:   if (jaca->compressedrow.use) {
839:     if (!cusparsestructA->rowoffsets_gpu) {
840:       cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
841:       cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
842:     }
843:     ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
844:   } else {
845:     ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
846:   }
847:   aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
848:   aa = thrust::raw_pointer_cast(matrixA->values->data());

850:   if (!d_mat) {
851:     PetscSplitCSRDataStructure h_mat;

853:     // create and populate the struct on the host, then copy it to the device
854:     PetscInfo(A,"Create device matrix\n");
855:     PetscNew(&h_mat);
856:     cudaMalloc((void**)&d_mat,sizeof(*d_mat));
857:     if (size > 1) { /* need the colmap array */
858:       Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
859:       PetscInt   *colmap;
860:       PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;


864:       PetscCalloc1(N+1,&colmap);
865:       for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);
866: #if defined(PETSC_USE_64BIT_INDICES)
867:       { // have to make a 64-bit (PetscInt) copy of these 32-bit index arrays
868:         int        *h_bi32, *h_bj32;
869:         PetscInt   *h_bi64, *h_bj64, *d_bi64, *d_bj64;
870:         PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64);
871:         cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost);
872:         for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i];
873:         cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost);
874:         for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i];

876:         cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64));
877:         cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice);
878:         cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64));
879:         cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice);

881:         h_mat->offdiag.i = d_bi64;
882:         h_mat->offdiag.j = d_bj64;
883:         h_mat->allocated_indices = PETSC_TRUE;

885:         PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64);
886:       }
887: #else
888:       h_mat->offdiag.i = (PetscInt*)bi;
889:       h_mat->offdiag.j = (PetscInt*)bj;
890:       h_mat->allocated_indices = PETSC_FALSE;
891: #endif
892:       h_mat->offdiag.a = ba;
893:       h_mat->offdiag.n = A->rmap->n;

895:       cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap));
896:       cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice);
897:       PetscFree(colmap);
898:     }
899:     h_mat->rstart = A->rmap->rstart;
900:     h_mat->rend   = A->rmap->rend;
901:     h_mat->cstart = A->cmap->rstart;
902:     h_mat->cend   = A->cmap->rend;
903:     h_mat->M      = A->cmap->N;
904: #if defined(PETSC_USE_64BIT_INDICES)
905:     {
907:       int        *h_ai32, *h_aj32;
908:       PetscInt   *h_ai64, *h_aj64, *d_ai64, *d_aj64;
909:       PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64);
910:       cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost);
911:       for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i];
912:       cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost);
913:       for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i];

915:       cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64));
916:       cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice);
917:       cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64));
918:       cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice);

920:       h_mat->diag.i = d_ai64;
921:       h_mat->diag.j = d_aj64;
922:       h_mat->allocated_indices = PETSC_TRUE;

924:       PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64);
925:     }
926: #else
927:     h_mat->diag.i = (PetscInt*)ai;
928:     h_mat->diag.j = (PetscInt*)aj;
929:     h_mat->allocated_indices = PETSC_FALSE;
930: #endif
931:     h_mat->diag.a = aa;
932:     h_mat->diag.n = A->rmap->n;
933:     h_mat->rank   = PetscGlobalRank;
934:     // copy pointers and metadata to device
935:     cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);
936:     PetscFree(h_mat);
937:   } else {
938:     PetscInfo(A,"Reusing device matrix\n");
939:   }
940:   *B = d_mat;
941:   A->offloadmask = PETSC_OFFLOAD_GPU;
942:   return 0;
943: }
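/* Sketch of consuming the device struct returned above from user CUDA code (hypothetical kernel; the
   field names are the ones populated in this routine):

     __global__ void CountLocalDiagNonzeros(PetscSplitCSRDataStructure d_mat, PetscInt *nnz)
     {
       if (blockIdx.x == 0 && threadIdx.x == 0) {
         const PetscInt nrows = d_mat->diag.n;            // number of local rows of the diagonal block
         *nnz = d_mat->diag.i[nrows] - d_mat->diag.i[0];  // CSR row offsets give the nonzero count
       }
     }

   launched, after MatCUSPARSEGetDeviceMatWrite(A,&d_mat), as CountLocalDiagNonzeros<<<1,1>>>(d_mat,nnz_device)
   with nnz_device pointing to device memory. */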