Actual source code: mpiaijcusparse.cu
1: #define PETSC_SKIP_SPINLOCK
2: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
4: #include <petscconf.h>
5: #include <../src/mat/impls/aij/mpi/mpiaij.h>
6: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
7: #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
8: #include <thrust/advance.h>
9: #include <thrust/partition.h>
10: #include <thrust/sort.h>
11: #include <thrust/unique.h>
12: #include <petscsf.h>
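/* Thrust functor used with a zip iterator: copies the first tuple element into the
   second. It is used below to gather permuted COO values into the coo_pw work array. */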
14: struct VecCUDAEquals
15: {
16: template <typename Tuple>
17: __host__ __device__
18: void operator()(Tuple t)
19: {
20: thrust::get<1>(t) = thrust::get<0>(t);
21: }
22: };
24: static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
25: {
26: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)mat->data;
27: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
29: if (!cusparseStruct) return 0;
30: if (cusparseStruct->use_extended_coo) {
31: cudaFree(cusparseStruct->Aimap1_d);
32: cudaFree(cusparseStruct->Ajmap1_d);
33: cudaFree(cusparseStruct->Aperm1_d);
34: cudaFree(cusparseStruct->Bimap1_d);
35: cudaFree(cusparseStruct->Bjmap1_d);
36: cudaFree(cusparseStruct->Bperm1_d);
37: cudaFree(cusparseStruct->Aimap2_d);
38: cudaFree(cusparseStruct->Ajmap2_d);
39: cudaFree(cusparseStruct->Aperm2_d);
40: cudaFree(cusparseStruct->Bimap2_d);
41: cudaFree(cusparseStruct->Bjmap2_d);
42: cudaFree(cusparseStruct->Bperm2_d);
43: cudaFree(cusparseStruct->Cperm1_d);
44: cudaFree(cusparseStruct->sendbuf_d);
45: cudaFree(cusparseStruct->recvbuf_d);
46: }
47: cusparseStruct->use_extended_coo = PETSC_FALSE;
48: delete cusparseStruct->coo_p;
49: delete cusparseStruct->coo_pw;
50: cusparseStruct->coo_p = NULL;
51: cusparseStruct->coo_pw = NULL;
52: return 0;
53: }
55: static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
56: {
57: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
58: Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
59: PetscInt n = cusp->coo_nd + cusp->coo_no;
61: if (cusp->coo_p && v) {
62: thrust::device_ptr<const PetscScalar> d_v;
63: THRUSTARRAY *w = NULL;
65: if (isCudaMem(v)) {
66: d_v = thrust::device_pointer_cast(v);
67: } else {
68: w = new THRUSTARRAY(n);
69: w->assign(v,v+n);
70: PetscLogCpuToGpu(n*sizeof(PetscScalar));
71: d_v = w->data();
72: }
74: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
75: cusp->coo_pw->begin()));
76: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
77: cusp->coo_pw->end()));
78: PetscLogGpuTimeBegin();
79: thrust::for_each(zibit,zieit,VecCUDAEquals());
80: PetscLogGpuTimeEnd();
81: delete w;
82: MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,cusp->coo_pw->data().get(),imode);
83: MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);
84: } else {
85: MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,v,imode);
86: MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,v ? v+cusp->coo_nd : NULL,imode);
87: }
88: return 0;
89: }
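/* Thrust helpers for MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic below: IsNotOffDiagT and
   IsOffDiag test whether a column index lies inside the owned column range [cstart,cend)
   (i.e. the diagonal block), and GlobToLoc maps a global index to a local one by
   subtracting the start of the owned range. */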
91: template <typename Tuple>
92: struct IsNotOffDiagT
93: {
94: PetscInt _cstart,_cend;
96: IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
97: __host__ __device__
98: inline bool operator()(Tuple t)
99: {
100: return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
101: }
102: };
104: struct IsOffDiag
105: {
106: PetscInt _cstart,_cend;
108: IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
109: __host__ __device__
110: inline bool operator() (const PetscInt &c)
111: {
112: return c < _cstart || c >= _cend;
113: }
114: };
116: struct GlobToLoc
117: {
118: PetscInt _start;
120: GlobToLoc(PetscInt start) : _start(start) {}
121: __host__ __device__
122: inline PetscInt operator() (const PetscInt &c)
123: {
124: return c - _start;
125: }
126: };
128: static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
129: {
130: Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data;
131: Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
132: PetscInt N,*jj;
133: size_t noff = 0;
134: THRUSTINTARRAY d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
135: THRUSTINTARRAY d_j(n);
136: ISLocalToGlobalMapping l2g;
138: MatDestroy(&b->A);
139: MatDestroy(&b->B);
141: PetscLogCpuToGpu(2.*n*sizeof(PetscInt));
142: d_i.assign(coo_i,coo_i+n);
143: d_j.assign(coo_j,coo_j+n);
144: PetscLogGpuTimeBegin();
145: auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
146: auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
147: if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
148: cusp->coo_p = new THRUSTINTARRAY(n);
149: cusp->coo_pw = new THRUSTARRAY(n);
150: thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
151: auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
152: auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
153: auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
154: firstoffd = mzipp.get_iterator_tuple().get<1>();
155: }
156: cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
157: cusp->coo_no = thrust::distance(firstoffd,d_j.end());
159: /* from global to local */
160: thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
161: thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
162: PetscLogGpuTimeEnd();
164: /* copy offdiag column indices to map on the CPU */
165: PetscMalloc1(cusp->coo_no,&jj); /* jj[] will store compacted col ids of the offdiag part */
166: cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);
167: auto o_j = d_j.begin();
168: PetscLogGpuTimeBegin();
169: thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
170: thrust::sort(thrust::device,o_j,d_j.end());
171: auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
172: PetscLogGpuTimeEnd();
173: noff = thrust::distance(o_j,wit);
174: /* allocate the garray, the columns of B will not be mapped in the multiply setup */
175: PetscMalloc1(noff,&b->garray);
176: cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);
177: PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));
178: ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);
179: ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);
180: ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj);
182: ISLocalToGlobalMappingDestroy(&l2g);
184: MatCreate(PETSC_COMM_SELF,&b->A);
185: MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);
186: MatSetType(b->A,MATSEQAIJCUSPARSE);
187: PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);
188: MatCreate(PETSC_COMM_SELF,&b->B);
189: MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);
190: MatSetType(b->B,MATSEQAIJCUSPARSE);
191: PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);
193: /* GPU memory, cusparse specific call handles it internally */
194: MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());
195: MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);
196: PetscFree(jj);
198: MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);
199: MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);
201: MatBindToCPU(b->A,B->boundtocpu);
202: MatBindToCPU(b->B,B->boundtocpu);
203: MatSetUpMultiply_MPIAIJ(B);
204: return 0;
205: }
207: static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
208: {
209: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data;
210: Mat_MPIAIJCUSPARSE *mpidev;
211: PetscBool coo_basic = PETSC_TRUE;
212: PetscMemType mtype = PETSC_MEMTYPE_DEVICE;
213: PetscInt rstart,rend;
215: PetscFree(mpiaij->garray);
216: VecDestroy(&mpiaij->lvec);
217: #if defined(PETSC_USE_CTABLE)
218: PetscTableDestroy(&mpiaij->colmap);
219: #else
220: PetscFree(mpiaij->colmap);
221: #endif
222: VecScatterDestroy(&mpiaij->Mvctx);
223: mat->assembled = PETSC_FALSE;
224: mat->was_assembled = PETSC_FALSE;
225: MatResetPreallocationCOO_MPIAIJ(mat);
226: MatResetPreallocationCOO_MPIAIJCUSPARSE(mat);
227: if (coo_i) {
228: PetscLayoutGetRange(mat->rmap,&rstart,&rend);
229: PetscGetMemType(coo_i,&mtype);
230: if (PetscMemTypeHost(mtype)) {
231: for (PetscCount k=0; k<coo_n; k++) { /* Are there negative indices or remote entries? */
232: if (coo_i[k]<0 || coo_i[k]<rstart || coo_i[k]>=rend || coo_j[k]<0) {coo_basic = PETSC_FALSE; break;}
233: }
234: }
235: }
236: /* All ranks must agree on the value of coo_basic */
237: MPI_Allreduce(MPI_IN_PLACE,&coo_basic,1,MPIU_BOOL,MPI_LAND,PetscObjectComm((PetscObject)mat));
238: if (coo_basic) {
239: MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j);
240: } else {
241: MatSetPreallocationCOO_MPIAIJ(mat,coo_n,coo_i,coo_j);
242: mat->offloadmask = PETSC_OFFLOAD_CPU;
243: /* creates the GPU memory */
244: MatSeqAIJCUSPARSECopyToGPU(mpiaij->A);
245: MatSeqAIJCUSPARSECopyToGPU(mpiaij->B);
246: mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
247: mpidev->use_extended_coo = PETSC_TRUE;
249: cudaMalloc((void**)&mpidev->Aimap1_d,mpiaij->Annz1*sizeof(PetscCount));
250: cudaMalloc((void**)&mpidev->Ajmap1_d,(mpiaij->Annz1+1)*sizeof(PetscCount));
251: cudaMalloc((void**)&mpidev->Aperm1_d,mpiaij->Atot1*sizeof(PetscCount));
253: cudaMalloc((void**)&mpidev->Bimap1_d,mpiaij->Bnnz1*sizeof(PetscCount));
254: cudaMalloc((void**)&mpidev->Bjmap1_d,(mpiaij->Bnnz1+1)*sizeof(PetscCount));
255: cudaMalloc((void**)&mpidev->Bperm1_d,mpiaij->Btot1*sizeof(PetscCount));
257: cudaMalloc((void**)&mpidev->Aimap2_d,mpiaij->Annz2*sizeof(PetscCount));
258: cudaMalloc((void**)&mpidev->Ajmap2_d,(mpiaij->Annz2+1)*sizeof(PetscCount));
259: cudaMalloc((void**)&mpidev->Aperm2_d,mpiaij->Atot2*sizeof(PetscCount));
261: cudaMalloc((void**)&mpidev->Bimap2_d,mpiaij->Bnnz2*sizeof(PetscCount));
262: cudaMalloc((void**)&mpidev->Bjmap2_d,(mpiaij->Bnnz2+1)*sizeof(PetscCount));
263: cudaMalloc((void**)&mpidev->Bperm2_d,mpiaij->Btot2*sizeof(PetscCount));
265: cudaMalloc((void**)&mpidev->Cperm1_d,mpiaij->sendlen*sizeof(PetscCount));
266: cudaMalloc((void**)&mpidev->sendbuf_d,mpiaij->sendlen*sizeof(PetscScalar));
267: cudaMalloc((void**)&mpidev->recvbuf_d,mpiaij->recvlen*sizeof(PetscScalar));
269: cudaMemcpy(mpidev->Aimap1_d,mpiaij->Aimap1,mpiaij->Annz1*sizeof(PetscCount),cudaMemcpyHostToDevice);
270: cudaMemcpy(mpidev->Ajmap1_d,mpiaij->Ajmap1,(mpiaij->Annz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);
271: cudaMemcpy(mpidev->Aperm1_d,mpiaij->Aperm1,mpiaij->Atot1*sizeof(PetscCount),cudaMemcpyHostToDevice);
273: cudaMemcpy(mpidev->Bimap1_d,mpiaij->Bimap1,mpiaij->Bnnz1*sizeof(PetscCount),cudaMemcpyHostToDevice);
274: cudaMemcpy(mpidev->Bjmap1_d,mpiaij->Bjmap1,(mpiaij->Bnnz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);
275: cudaMemcpy(mpidev->Bperm1_d,mpiaij->Bperm1,mpiaij->Btot1*sizeof(PetscCount),cudaMemcpyHostToDevice);
277: cudaMemcpy(mpidev->Aimap2_d,mpiaij->Aimap2,mpiaij->Annz2*sizeof(PetscCount),cudaMemcpyHostToDevice);
278: cudaMemcpy(mpidev->Ajmap2_d,mpiaij->Ajmap2,(mpiaij->Annz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);
279: cudaMemcpy(mpidev->Aperm2_d,mpiaij->Aperm2,mpiaij->Atot2*sizeof(PetscCount),cudaMemcpyHostToDevice);
281: cudaMemcpy(mpidev->Bimap2_d,mpiaij->Bimap2,mpiaij->Bnnz2*sizeof(PetscCount),cudaMemcpyHostToDevice);
282: cudaMemcpy(mpidev->Bjmap2_d,mpiaij->Bjmap2,(mpiaij->Bnnz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);
283: cudaMemcpy(mpidev->Bperm2_d,mpiaij->Bperm2,mpiaij->Btot2*sizeof(PetscCount),cudaMemcpyHostToDevice);
285: cudaMemcpy(mpidev->Cperm1_d,mpiaij->Cperm1,mpiaij->sendlen*sizeof(PetscCount),cudaMemcpyHostToDevice);
286: }
287: return 0;
288: }
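/* Grid-stride kernel that packs the COO values destined for other ranks into a
   contiguous send buffer: buf[i] = kv[perm[i]]. */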
290: __global__ void MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[])
291: {
292: PetscCount i = blockIdx.x*blockDim.x + threadIdx.x;
293: const PetscCount grid_size = gridDim.x * blockDim.x;
294: for (; i<nnz; i+= grid_size) buf[i] = kv[perm[i]];
295: }
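/* Grid-stride kernel that accumulates (possibly repeated) COO entries into the CSR value
   array: for nonzero i, jmap[i]..jmap[i+1] delimits its contributing COO entries, perm[]
   gives their positions in kv[], and imap[i] is the destination slot in a[]. */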
297: __global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount imap[],const PetscCount jmap[],const PetscCount perm[],PetscScalar a[])
298: {
299: PetscCount i = blockIdx.x*blockDim.x + threadIdx.x;
300: const PetscCount grid_size = gridDim.x * blockDim.x;
301: for (; i<nnz; i += grid_size) {for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) a[imap[i]] += kv[perm[k]];}
302: }
304: static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat,const PetscScalar v[],InsertMode imode)
305: {
306: Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
307: Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
308: Mat A = mpiaij->A,B = mpiaij->B;
309: PetscCount Annz1 = mpiaij->Annz1,Annz2 = mpiaij->Annz2,Bnnz1 = mpiaij->Bnnz1,Bnnz2 = mpiaij->Bnnz2;
310: PetscScalar *Aa,*Ba = NULL;
311: PetscScalar *vsend = mpidev->sendbuf_d,*v2 = mpidev->recvbuf_d;
312: const PetscScalar *v1 = v;
313: const PetscCount *Ajmap1 = mpidev->Ajmap1_d,*Ajmap2 = mpidev->Ajmap2_d,*Aimap1 = mpidev->Aimap1_d,*Aimap2 = mpidev->Aimap2_d;
314: const PetscCount *Bjmap1 = mpidev->Bjmap1_d,*Bjmap2 = mpidev->Bjmap2_d,*Bimap1 = mpidev->Bimap1_d,*Bimap2 = mpidev->Bimap2_d;
315: const PetscCount *Aperm1 = mpidev->Aperm1_d,*Aperm2 = mpidev->Aperm2_d,*Bperm1 = mpidev->Bperm1_d,*Bperm2 = mpidev->Bperm2_d;
316: const PetscCount *Cperm1 = mpidev->Cperm1_d;
317: PetscMemType memtype;
319: if (mpidev->use_extended_coo) {
320: PetscMPIInt size;
322: MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size);
323: PetscGetMemType(v,&memtype);
324: if (PetscMemTypeHost(memtype)) { /* If the user gave v[] on the host, we need to copy it to the device */
325: cudaMalloc((void**)&v1,mpiaij->coo_n*sizeof(PetscScalar));
326: cudaMemcpy((void*)v1,v,mpiaij->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice);
327: }
329: if (imode == INSERT_VALUES) {
330: MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa); /* write matrix values */
331: cudaMemset(Aa,0,((Mat_SeqAIJ*)A->data)->nz*sizeof(PetscScalar));
332: if (size > 1) {
333: MatSeqAIJCUSPARSEGetArrayWrite(B,&Ba);
334: cudaMemset(Ba,0,((Mat_SeqAIJ*)B->data)->nz*sizeof(PetscScalar));
335: }
336: } else {
337: MatSeqAIJCUSPARSEGetArray(A,&Aa); /* read & write matrix values */
338: if (size > 1) MatSeqAIJCUSPARSEGetArray(B,&Ba);
339: }
341: /* Pack entries to be sent to remote */
342: if (mpiaij->sendlen) {
343: MatPackCOOValues<<<(mpiaij->sendlen+255)/256,256>>>(v1,mpiaij->sendlen,Cperm1,vsend);
344: cudaPeekAtLastError();
345: }
347: /* Send remote entries to their owner and overlap the communication with local computation */
348: if (size > 1) PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_CUDA,vsend,PETSC_MEMTYPE_CUDA,v2,MPI_REPLACE);
349: /* Add local entries to A and B */
350: if (Annz1) {
351: MatAddCOOValues<<<(Annz1+255)/256,256>>>(v1,Annz1,Aimap1,Ajmap1,Aperm1,Aa);
352: cudaPeekAtLastError();
353: }
354: if (Bnnz1) {
355: MatAddCOOValues<<<(Bnnz1+255)/256,256>>>(v1,Bnnz1,Bimap1,Bjmap1,Bperm1,Ba);
356: cudaPeekAtLastError();
357: }
358: if (size > 1) PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend,v2,MPI_REPLACE);
360: /* Add received remote entries to A and B */
361: if (Annz2) {
362: MatAddCOOValues<<<(Annz2+255)/256,256>>>(v2,Annz2,Aimap2,Ajmap2,Aperm2,Aa);
363: cudaPeekAtLastError();
364: }
365: if (Bnnz2) {
366: MatAddCOOValues<<<(Bnnz2+255)/256,256>>>(v2,Bnnz2,Bimap2,Bjmap2,Bperm2,Ba);
367: cudaPeekAtLastError();
368: }
370: if (imode == INSERT_VALUES) {
371: MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa);
372: if (size > 1) MatSeqAIJCUSPARSERestoreArrayWrite(B,&Ba);
373: } else {
374: MatSeqAIJCUSPARSERestoreArray(A,&Aa);
375: if (size > 1) MatSeqAIJCUSPARSERestoreArray(B,&Ba);
376: }
377: if (PetscMemTypeHost(memtype)) cudaFree((void*)v1);
378: } else {
379: MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat,v,imode);
380: }
381: mat->offloadmask = PETSC_OFFLOAD_GPU;
382: return 0;
383: }
385: static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
386: {
387: Mat Ad,Ao;
388: const PetscInt *cmap;
390: MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);
391: MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);
392: if (glob) {
393: PetscInt cst, i, dn, on, *gidx;
395: MatGetLocalSize(Ad,NULL,&dn);
396: MatGetLocalSize(Ao,NULL,&on);
397: MatGetOwnershipRangeColumn(A,&cst,NULL);
398: PetscMalloc1(dn+on,&gidx);
399: for (i = 0; i<dn; i++) gidx[i] = cst + i;
400: for (i = 0; i<on; i++) gidx[i+dn] = cmap[i];
401: ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);
402: }
403: return 0;
404: }
406: PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
407: {
408: Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data;
409: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
410: PetscInt i;
412: PetscLayoutSetUp(B->rmap);
413: PetscLayoutSetUp(B->cmap);
414: if (PetscDefined(USE_DEBUG) && d_nnz) {
415: for (i=0; i<B->rmap->n; i++) {
416: PetscCheck(d_nnz[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
417: }
418: }
419: if (PetscDefined(USE_DEBUG) && o_nnz) {
420: for (i=0; i<B->rmap->n; i++) {
421: PetscCheck(o_nnz[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
422: }
423: }
424: #if defined(PETSC_USE_CTABLE)
425: PetscTableDestroy(&b->colmap);
426: #else
427: PetscFree(b->colmap);
428: #endif
429: PetscFree(b->garray);
430: VecDestroy(&b->lvec);
431: VecScatterDestroy(&b->Mvctx);
432: /* Because the B will have been resized we simply destroy it and create a new one each time */
433: MatDestroy(&b->B);
434: if (!b->A) {
435: MatCreate(PETSC_COMM_SELF,&b->A);
436: MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);
437: PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);
438: }
439: if (!b->B) {
440: PetscMPIInt size;
441: MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);
442: MatCreate(PETSC_COMM_SELF,&b->B);
443: MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);
444: PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);
445: }
446: MatSetType(b->A,MATSEQAIJCUSPARSE);
447: MatSetType(b->B,MATSEQAIJCUSPARSE);
448: MatBindToCPU(b->A,B->boundtocpu);
449: MatBindToCPU(b->B,B->boundtocpu);
450: MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);
451: MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);
452: MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);
453: MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);
454: B->preallocated = PETSC_TRUE;
455: return 0;
456: }
458: PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
459: {
460: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
462: VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
463: (*a->A->ops->mult)(a->A,xx,yy);
464: VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
465: (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);
466: return 0;
467: }
469: PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
470: {
471: Mat_MPIAIJ *l = (Mat_MPIAIJ*)A->data;
473: MatZeroEntries(l->A);
474: MatZeroEntries(l->B);
475: return 0;
476: }
478: PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
479: {
480: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
482: VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
483: (*a->A->ops->multadd)(a->A,xx,yy,zz);
484: VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
485: (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);
486: return 0;
487: }
489: PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
490: {
491: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
493: (*a->B->ops->multtranspose)(a->B,xx,a->lvec);
494: (*a->A->ops->multtranspose)(a->A,xx,yy);
495: VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
496: VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
497: return 0;
498: }
500: PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
501: {
502: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
503: Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
505: switch (op) {
506: case MAT_CUSPARSE_MULT_DIAG:
507: cusparseStruct->diagGPUMatFormat = format;
508: break;
509: case MAT_CUSPARSE_MULT_OFFDIAG:
510: cusparseStruct->offdiagGPUMatFormat = format;
511: break;
512: case MAT_CUSPARSE_ALL:
513: cusparseStruct->diagGPUMatFormat = format;
514: cusparseStruct->offdiagGPUMatFormat = format;
515: break;
516: default:
517: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
518: }
519: return 0;
520: }
522: PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
523: {
524: MatCUSPARSEStorageFormat format;
525: PetscBool flg;
526: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
527: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
529: PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");
530: if (A->factortype==MAT_FACTOR_NONE) {
531: PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
532: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg));
533: if (flg) MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);
534: PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
535: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg));
536: if (flg) MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);
537: PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
538: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg));
539: if (flg) MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
540: }
541: PetscOptionsTail();
542: return 0;
543: }
545: PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
546: {
547: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data;
548: Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
549: PetscObjectState onnz = A->nonzerostate;
551: MatAssemblyEnd_MPIAIJ(A,mode);
552: if (mpiaij->lvec) VecSetType(mpiaij->lvec,VECSEQCUDA);
553: if (onnz != A->nonzerostate && cusp->deviceMat) {
554: PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
556: PetscInfo(A,"Destroy device mat since nonzerostate changed\n");
557: PetscNew(&h_mat);
558: cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);
559: cudaFree(h_mat->colmap);
560: if (h_mat->allocated_indices) {
561: cudaFree(h_mat->diag.i);
562: cudaFree(h_mat->diag.j);
563: if (h_mat->offdiag.j) {
564: cudaFree(h_mat->offdiag.i);
565: cudaFree(h_mat->offdiag.j);
566: }
567: }
568: cudaFree(d_mat);
569: PetscFree(h_mat);
570: cusp->deviceMat = NULL;
571: }
572: return 0;
573: }
575: PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
576: {
577: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
578: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
581: if (cusparseStruct->deviceMat) {
582: PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;
584: PetscInfo(A,"Have device matrix\n");
585: PetscNew(&h_mat);
586: cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);
587: cudaFree(h_mat->colmap);
588: if (h_mat->allocated_indices) {
589: cudaFree(h_mat->diag.i);
590: cudaFree(h_mat->diag.j);
591: if (h_mat->offdiag.j) {
592: cudaFree(h_mat->offdiag.i);
593: cudaFree(h_mat->offdiag.j);
594: }
595: }
596: cudaFree(d_mat);
597: PetscFree(h_mat);
598: }
599: /* Free COO */
600: MatResetPreallocationCOO_MPIAIJCUSPARSE(A);
601: delete cusparseStruct;
602: PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);
603: PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);
604: PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);
605: PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);
606: PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);
607: PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);
608: MatDestroy_MPIAIJ(A);
609: return 0;
610: }
612: PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
613: {
614: Mat_MPIAIJ *a;
615: Mat A;
617: PetscDeviceInitialize(PETSC_DEVICE_CUDA);
618: if (reuse == MAT_INITIAL_MATRIX) MatDuplicate(B,MAT_COPY_VALUES,newmat);
619: else if (reuse == MAT_REUSE_MATRIX) MatCopy(B,*newmat,SAME_NONZERO_PATTERN);
620: A = *newmat;
621: A->boundtocpu = PETSC_FALSE;
622: PetscFree(A->defaultvectype);
623: PetscStrallocpy(VECCUDA,&A->defaultvectype);
625: a = (Mat_MPIAIJ*)A->data;
626: if (a->A) MatSetType(a->A,MATSEQAIJCUSPARSE);
627: if (a->B) MatSetType(a->B,MATSEQAIJCUSPARSE);
628: if (a->lvec) VecSetType(a->lvec,VECSEQCUDA);
630: if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
631: a->spptr = new Mat_MPIAIJCUSPARSE;
632: }
634: A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE;
635: A->ops->mult = MatMult_MPIAIJCUSPARSE;
636: A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE;
637: A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE;
638: A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
639: A->ops->destroy = MatDestroy_MPIAIJCUSPARSE;
640: A->ops->zeroentries = MatZeroEntries_MPIAIJCUSPARSE;
641: A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
643: PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);
644: PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);
645: PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);
646: PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);
647: PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);
648: PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);
649: #if defined(PETSC_HAVE_HYPRE)
650: PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);
651: #endif
652: return 0;
653: }
655: PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
656: {
657: PetscDeviceInitialize(PETSC_DEVICE_CUDA);
658: MatCreate_MPIAIJ(A);
659: MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);
660: return 0;
661: }
663: /*@
664: MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
665: (the default parallel PETSc format). This matrix will ultimately be pushed down
666: to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
667: assembly performance the user should preallocate the matrix storage by setting
668: the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these
669: parameters accurately, performance during matrix assembly can be increased by more than a factor of 50.
671: Collective
673: Input Parameters:
674: + comm - MPI communicator
675: . m,n - number of local rows and columns (or PETSC_DECIDE)
676: . M,N - number of global rows and columns (or PETSC_DETERMINE)
677: . d_nz,o_nz - number of nonzeros per row in the diagonal and off-diagonal portions of the local submatrix (same for all local rows)
678: - d_nnz,o_nnz - arrays containing the number of nonzeros per row in the diagonal and off-diagonal portions of the local submatrix
679: (possibly different for each row), or NULL
681: Output Parameter:
682: . A - the matrix
684: It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
685: MatXXXXSetPreallocation() paradigm instead of this routine directly; a short usage sketch of that paradigm follows this routine.
686: [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
688: Notes:
689: If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored
691: The AIJ format (also called the Yale sparse matrix format or
692: compressed row storage) is fully compatible with standard Fortran 77
693: storage. That is, the stored row and column indices can begin at
694: either one (as in Fortran) or zero. See the users' manual for details.
696: Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
697: Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic memory
698: allocation. For large problems you MUST preallocate memory or you
699: will get TERRIBLE performance; see the users' manual chapter on matrices.
701: By default, this format uses inodes (identical nodes) when possible, to
702: improve numerical efficiency of matrix-vector products and solves. We
703: search for consecutive rows with the same nonzero structure, thereby
704: reusing matrix information to achieve increased efficiency.
706: Level: intermediate
708: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
709: @*/
710: PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
711: {
712: PetscMPIInt size;
714: MatCreate(comm,A);
715: MatSetSizes(*A,m,n,M,N);
716: MPI_Comm_size(comm,&size);
717: if (size > 1) {
718: MatSetType(*A,MATMPIAIJCUSPARSE);
719: MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);
720: } else {
721: MatSetType(*A,MATSEQAIJCUSPARSE);
722: MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);
723: }
724: return 0;
725: }
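/* A minimal usage sketch of the paradigm recommended in the manual page above. It is
   illustrative only: error checking is omitted and M, N, d_nz, d_nnz, o_nz and o_nnz are
   placeholders to be supplied by the caller, not values taken from this file.

     Mat A;
     MatCreate(PETSC_COMM_WORLD,&A);
     MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,M,N);
     MatSetType(A,MATAIJCUSPARSE);             // or select it at run time with -mat_type aijcusparse
     MatSetFromOptions(A);
     MatMPIAIJSetPreallocation(A,d_nz,d_nnz,o_nz,o_nnz);
     MatSeqAIJSetPreallocation(A,d_nz,d_nnz);  // calling both is safe; the one that does not match the type is a no-op
     // ... insert entries with MatSetValues(), then MatAssemblyBegin()/MatAssemblyEnd() ...
*/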
727: /*MC
728: MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.
730: A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
731: CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
732: All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.
734: This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
735: and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
736: MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
737: for communicators controlling multiple processes. It is recommended that you call both of
738: the above preallocation routines for simplicity.
740: Options Database Keys:
741: + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
742: . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
743: . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
744: - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
746: Level: beginner
748: .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
749: M*/
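/* Illustrative command line (the executable name is a placeholder): select this type and a
   hybrid off-diagonal storage format at run time for a code that calls MatSetFromOptions(),
   typically together with -vec_type cuda:

     ./app -mat_type mpiaijcusparse -vec_type cuda -mat_cusparse_mult_offdiag_storage_format hyb
*/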
751: /*MC
752: MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.
754: Level: beginner
756: .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
757: M*/
759: // get GPU pointers to stripped down Mat. For both seq and MPI Mat.
760: PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
761: {
762: PetscSplitCSRDataStructure d_mat;
763: PetscMPIInt size;
764: int *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL;
765: PetscScalar *aa = NULL,*ba = NULL;
766: Mat_SeqAIJ *jaca = NULL, *jacb = NULL;
767: Mat_SeqAIJCUSPARSE *cusparsestructA = NULL;
768: CsrMatrix *matrixA = NULL,*matrixB = NULL;
771: if (A->factortype != MAT_FACTOR_NONE) {
772: *B = NULL;
773: return 0;
774: }
775: MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
776: // get jaca
777: if (size == 1) {
778: PetscBool isseqaij;
780: PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);
781: if (isseqaij) {
782: jaca = (Mat_SeqAIJ*)A->data;
784: cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
785: d_mat = cusparsestructA->deviceMat;
786: MatSeqAIJCUSPARSECopyToGPU(A);
787: } else {
788: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
790: Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
791: jaca = (Mat_SeqAIJ*)aij->A->data;
792: cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
793: d_mat = spptr->deviceMat;
794: MatSeqAIJCUSPARSECopyToGPU(aij->A);
795: }
796: if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
797: Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
799: matrixA = (CsrMatrix*)matstruct->mat;
800: bi = NULL;
801: bj = NULL;
802: ba = NULL;
803: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
804: } else {
805: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
807: jaca = (Mat_SeqAIJ*)aij->A->data;
808: jacb = (Mat_SeqAIJ*)aij->B->data;
809: Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
812: cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
813: Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
815: if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
816: MatSeqAIJCUSPARSECopyToGPU(aij->A);
817: MatSeqAIJCUSPARSECopyToGPU(aij->B);
818: Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
819: Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
822: matrixA = (CsrMatrix*)matstructA->mat;
823: matrixB = (CsrMatrix*)matstructB->mat;
824: if (jacb->compressedrow.use) {
825: if (!cusparsestructB->rowoffsets_gpu) {
826: cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
827: cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
828: }
829: bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
830: } else {
831: bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
832: }
833: bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
834: ba = thrust::raw_pointer_cast(matrixB->values->data());
835: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
836: d_mat = spptr->deviceMat;
837: }
838: if (jaca->compressedrow.use) {
839: if (!cusparsestructA->rowoffsets_gpu) {
840: cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
841: cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
842: }
843: ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
844: } else {
845: ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
846: }
847: aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
848: aa = thrust::raw_pointer_cast(matrixA->values->data());
850: if (!d_mat) {
851: PetscSplitCSRDataStructure h_mat;
853: // create and populate struct on host and copy it to device
854: PetscInfo(A,"Create device matrix\n");
855: PetscNew(&h_mat);
856: cudaMalloc((void**)&d_mat,sizeof(*d_mat));
857: if (size > 1) { /* need the colmap array */
858: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
859: PetscInt *colmap;
860: PetscInt ii,n = aij->B->cmap->n,N = A->cmap->N;
864: PetscCalloc1(N+1,&colmap);
865: for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);
866: #if defined(PETSC_USE_64BIT_INDICES)
867: { // have to make a long version of these
868: int *h_bi32, *h_bj32;
869: PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
870: PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64);
871: cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost);
872: for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i];
873: cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost);
874: for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i];
876: cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64));
877: cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice);
878: cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64));
879: cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice);
881: h_mat->offdiag.i = d_bi64;
882: h_mat->offdiag.j = d_bj64;
883: h_mat->allocated_indices = PETSC_TRUE;
885: PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64);
886: }
887: #else
888: h_mat->offdiag.i = (PetscInt*)bi;
889: h_mat->offdiag.j = (PetscInt*)bj;
890: h_mat->allocated_indices = PETSC_FALSE;
891: #endif
892: h_mat->offdiag.a = ba;
893: h_mat->offdiag.n = A->rmap->n;
895: cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap));
896: cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice);
897: PetscFree(colmap);
898: }
899: h_mat->rstart = A->rmap->rstart;
900: h_mat->rend = A->rmap->rend;
901: h_mat->cstart = A->cmap->rstart;
902: h_mat->cend = A->cmap->rend;
903: h_mat->M = A->cmap->N;
904: #if defined(PETSC_USE_64BIT_INDICES)
905: {
907: int *h_ai32, *h_aj32;
908: PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;
909: PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64);
910: cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost);
911: for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i];
912: cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost);
913: for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i];
915: cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64));
916: cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice);
917: cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64));
918: cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice);
920: h_mat->diag.i = d_ai64;
921: h_mat->diag.j = d_aj64;
922: h_mat->allocated_indices = PETSC_TRUE;
924: PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64);
925: }
926: #else
927: h_mat->diag.i = (PetscInt*)ai;
928: h_mat->diag.j = (PetscInt*)aj;
929: h_mat->allocated_indices = PETSC_FALSE;
930: #endif
931: h_mat->diag.a = aa;
932: h_mat->diag.n = A->rmap->n;
933: h_mat->rank = PetscGlobalRank;
934: // copy pointers and metadata to device
935: cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);
936: PetscFree(h_mat);
937: } else {
938: PetscInfo(A,"Reusing device matrix\n");
939: }
940: *B = d_mat;
941: A->offloadmask = PETSC_OFFLOAD_GPU;
942: return 0;
943: }