Actual source code: mpiaijhipsparse.hip.cpp

  1: /* Portions of this code are under:
  2:    Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
  3: */
  4: #define PETSC_SKIP_SPINLOCK

  6: #include <../src/mat/impls/aij/mpi/mpiaij.h>
  7: #include <../src/mat/impls/aij/seq/seqhipsparse/hipsparsematimpl.h>
  8: #include <../src/mat/impls/aij/mpi/mpihipsparse/mpihipsparsematimpl.h>
  9: #include <thrust/advance.h>
 10: #include <thrust/partition.h>
 11: #include <thrust/sort.h>
 12: #include <thrust/unique.h>
 13: #include <petscsf.h>

 15: struct VecHIPEquals {
 16:   template <typename Tuple>
 17:   __host__ __device__ void operator()(Tuple t)
 18:   {
 19:     thrust::get<1>(t) = thrust::get<0>(t);
 20:   }
 21: };

 23: static PetscErrorCode MatResetPreallocationCOO_MPIAIJHIPSPARSE(Mat mat)
 24: {
 25:   auto *aij             = static_cast<Mat_MPIAIJ *>(mat->data);
 26:   auto *hipsparseStruct = static_cast<Mat_MPIAIJHIPSPARSE *>(aij->spptr);

 28:   if (!hipsparseStruct) return 0;
 29:   if (hipsparseStruct->use_extended_coo) {
 30:     hipFree(hipsparseStruct->Ajmap1_d);
 31:     hipFree(hipsparseStruct->Aperm1_d);
 32:     hipFree(hipsparseStruct->Bjmap1_d);
 33:     hipFree(hipsparseStruct->Bperm1_d);
 34:     hipFree(hipsparseStruct->Aimap2_d);
 35:     hipFree(hipsparseStruct->Ajmap2_d);
 36:     hipFree(hipsparseStruct->Aperm2_d);
 37:     hipFree(hipsparseStruct->Bimap2_d);
 38:     hipFree(hipsparseStruct->Bjmap2_d);
 39:     hipFree(hipsparseStruct->Bperm2_d);
 40:     hipFree(hipsparseStruct->Cperm1_d);
 41:     hipFree(hipsparseStruct->sendbuf_d);
 42:     hipFree(hipsparseStruct->recvbuf_d);
 43:   }
 44:   hipsparseStruct->use_extended_coo = PETSC_FALSE;
 45:   delete hipsparseStruct->coo_p;
 46:   delete hipsparseStruct->coo_pw;
 47:   hipsparseStruct->coo_p  = nullptr;
 48:   hipsparseStruct->coo_pw = nullptr;
 49:   return 0;
 50: }

 52: static PetscErrorCode MatSetValuesCOO_MPIAIJHIPSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
 53: {
 54:   Mat_MPIAIJ          *a    = (Mat_MPIAIJ *)A->data;
 55:   Mat_MPIAIJHIPSPARSE *cusp = (Mat_MPIAIJHIPSPARSE *)a->spptr;
 56:   PetscInt             n    = cusp->coo_nd + cusp->coo_no;

 58:   if (cusp->coo_p && v) {
 59:     thrust::device_ptr<const PetscScalar> d_v;
 60:     THRUSTARRAY                          *w = NULL;

 62:     if (isHipMem(v)) {
 63:       d_v = thrust::device_pointer_cast(v);
 64:     } else {
 65:       w = new THRUSTARRAY(n);
 66:       w->assign(v, v + n);
 67:       PetscLogCpuToGpu(n * sizeof(PetscScalar));
 68:       d_v = w->data();
 69:     }

 71:     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->begin()), cusp->coo_pw->begin()));
 72:     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->end()), cusp->coo_pw->end()));
 73:     PetscLogGpuTimeBegin();
 74:     thrust::for_each(zibit, zieit, VecHIPEquals());
 75:     PetscLogGpuTimeEnd();
 76:     delete w;
 77:     MatSetValuesCOO_SeqAIJHIPSPARSE_Basic(a->A, cusp->coo_pw->data().get(), imode);
 78:     MatSetValuesCOO_SeqAIJHIPSPARSE_Basic(a->B, cusp->coo_pw->data().get() + cusp->coo_nd, imode);
 79:   } else {
 80:     MatSetValuesCOO_SeqAIJHIPSPARSE_Basic(a->A, v, imode);
 81:     MatSetValuesCOO_SeqAIJHIPSPARSE_Basic(a->B, v ? v + cusp->coo_nd : nullptr, imode);
 82:   }
 83:   return 0;
 84: }

 86: template <typename Tuple>
 87: struct IsNotOffDiagT {
 88:   PetscInt _cstart, _cend;

 90:   IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
 91:   __host__ __device__ bool operator()(Tuple t) { return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend); }
 92: };

 94: struct IsOffDiag {
 95:   PetscInt _cstart, _cend;

 97:   IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
 98:   __host__ __device__ bool operator()(const PetscInt &c) { return c < _cstart || c >= _cend; }
 99: };

101: struct GlobToLoc {
102:   PetscInt _start;

104:   GlobToLoc(PetscInt start) : _start(start) { }
105:   __host__ __device__ PetscInt operator()(const PetscInt &c) { return c - _start; }
106: };

108: static PetscErrorCode MatSetPreallocationCOO_MPIAIJHIPSPARSE_Basic(Mat B, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
109: {
110:   Mat_MPIAIJ            *b    = (Mat_MPIAIJ *)B->data;
111:   Mat_MPIAIJHIPSPARSE   *cusp = (Mat_MPIAIJHIPSPARSE *)b->spptr;
112:   PetscInt               N, *jj;
113:   size_t                 noff = 0;
114:   THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
115:   THRUSTINTARRAY         d_j(n);
116:   ISLocalToGlobalMapping l2g;

118:   MatDestroy(&b->A);
119:   MatDestroy(&b->B);

121:   PetscLogCpuToGpu(2. * n * sizeof(PetscInt));
122:   d_i.assign(coo_i, coo_i + n);
123:   d_j.assign(coo_j, coo_j + n);
124:   delete cusp->coo_p;
125:   delete cusp->coo_pw;
126:   cusp->coo_p  = NULL;
127:   cusp->coo_pw = NULL;
128:   PetscLogGpuTimeBegin();
129:   auto firstoffd = thrust::find_if(thrust::device, d_j.begin(), d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
130:   auto firstdiag = thrust::find_if_not(thrust::device, firstoffd, d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
131:   if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
132:     cusp->coo_p  = new THRUSTINTARRAY(n);
133:     cusp->coo_pw = new THRUSTARRAY(n);
134:     thrust::sequence(thrust::device, cusp->coo_p->begin(), cusp->coo_p->end(), 0);
135:     auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(), d_j.begin(), cusp->coo_p->begin()));
136:     auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(), d_j.end(), cusp->coo_p->end()));
137:     auto mzipp = thrust::partition(thrust::device, fzipp, ezipp, IsNotOffDiagT<thrust::tuple<PetscInt, PetscInt, PetscInt>>(B->cmap->rstart, B->cmap->rend));
138:     firstoffd  = mzipp.get_iterator_tuple().get<1>();
139:   }
140:   cusp->coo_nd = thrust::distance(d_j.begin(), firstoffd);
141:   cusp->coo_no = thrust::distance(firstoffd, d_j.end());

143:   /* from global to local */
144:   thrust::transform(thrust::device, d_i.begin(), d_i.end(), d_i.begin(), GlobToLoc(B->rmap->rstart));
145:   thrust::transform(thrust::device, d_j.begin(), firstoffd, d_j.begin(), GlobToLoc(B->cmap->rstart));
146:   PetscLogGpuTimeEnd();

148:   /* copy offdiag column indices to map on the CPU */
149:   PetscMalloc1(cusp->coo_no, &jj); /* jj[] will store compacted col ids of the offdiag part */
150:   hipMemcpy(jj, d_j.data().get() + cusp->coo_nd, cusp->coo_no * sizeof(PetscInt), hipMemcpyDeviceToHost);
151:   auto o_j = d_j.begin();
152:   PetscLogGpuTimeBegin();
153:   thrust::advance(o_j, cusp->coo_nd); /* sort and unique offdiag col ids */
154:   thrust::sort(thrust::device, o_j, d_j.end());
155:   auto wit = thrust::unique(thrust::device, o_j, d_j.end()); /* return end iter of the unique range */
156:   PetscLogGpuTimeEnd();
157:   noff = thrust::distance(o_j, wit);
158:   PetscMalloc1(noff, &b->garray);
159:   hipMemcpy(b->garray, d_j.data().get() + cusp->coo_nd, noff * sizeof(PetscInt), hipMemcpyDeviceToHost);
160:   PetscLogGpuToCpu((noff + cusp->coo_no) * sizeof(PetscInt));
161:   ISLocalToGlobalMappingCreate(PETSC_COMM_SELF, 1, noff, b->garray, PETSC_COPY_VALUES, &l2g);
162:   ISLocalToGlobalMappingSetType(l2g, ISLOCALTOGLOBALMAPPINGHASH);
163:   ISGlobalToLocalMappingApply(l2g, IS_GTOLM_DROP, cusp->coo_no, jj, &N, jj);
165:   ISLocalToGlobalMappingDestroy(&l2g);
166:   MatCreate(PETSC_COMM_SELF, &b->A);
167:   MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n);
168:   MatSetType(b->A, MATSEQAIJHIPSPARSE);
169:   PetscLogObjectParent((PetscObject)B, (PetscObject)b->A);
170:   MatCreate(PETSC_COMM_SELF, &b->B);
171:   MatSetSizes(b->B, B->rmap->n, noff, B->rmap->n, noff);
172:   MatSetType(b->B, MATSEQAIJHIPSPARSE);
173:   PetscLogObjectParent((PetscObject)B, (PetscObject)b->B);

175:   /* GPU memory, hipsparse specific call handles it internally */
176:   MatSetPreallocationCOO_SeqAIJHIPSPARSE_Basic(b->A, cusp->coo_nd, d_i.data().get(), d_j.data().get());
177:   MatSetPreallocationCOO_SeqAIJHIPSPARSE_Basic(b->B, cusp->coo_no, d_i.data().get() + cusp->coo_nd, jj);
178:   PetscFree(jj);

180:   MatHIPSPARSESetFormat(b->A, MAT_HIPSPARSE_MULT, cusp->diagGPUMatFormat);
181:   MatHIPSPARSESetFormat(b->B, MAT_HIPSPARSE_MULT, cusp->offdiagGPUMatFormat);
182:   MatBindToCPU(b->A, B->boundtocpu);
183:   MatBindToCPU(b->B, B->boundtocpu);
184:   MatSetUpMultiply_MPIAIJ(B);
185:   return 0;
186: }

188: static PetscErrorCode MatSetPreallocationCOO_MPIAIJHIPSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
189: {
190:   Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
191:   Mat_MPIAIJHIPSPARSE *mpidev;
192:   PetscBool            coo_basic = PETSC_TRUE;
193:   PetscMemType         mtype     = PETSC_MEMTYPE_DEVICE;
194:   PetscInt             rstart, rend;

196:   PetscFree(mpiaij->garray);
197:   VecDestroy(&mpiaij->lvec);
198: #if defined(PETSC_USE_CTABLE)
199:   PetscTableDestroy(&mpiaij->colmap);
200: #else
201:   PetscFree(mpiaij->colmap);
202: #endif
203:   VecScatterDestroy(&mpiaij->Mvctx);
204:   mat->assembled     = PETSC_FALSE;
205:   mat->was_assembled = PETSC_FALSE;
206:   MatResetPreallocationCOO_MPIAIJ(mat);
207:   MatResetPreallocationCOO_MPIAIJHIPSPARSE(mat);
208:   if (coo_i) {
209:     PetscLayoutGetRange(mat->rmap, &rstart, &rend);
210:     PetscGetMemType(coo_i, &mtype);
211:     if (PetscMemTypeHost(mtype)) {
212:       for (PetscCount k = 0; k < coo_n; k++) { /* Are there negative indices or remote entries? */
213:         if (coo_i[k] < 0 || coo_i[k] < rstart || coo_i[k] >= rend || coo_j[k] < 0) {
214:           coo_basic = PETSC_FALSE;
215:           break;
216:         }
217:       }
218:     }
219:   }
220:   /* All ranks must agree on the value of coo_basic */
221:   MPI_Allreduce(MPI_IN_PLACE, &coo_basic, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat));
222:   if (coo_basic) {
223:     MatSetPreallocationCOO_MPIAIJHIPSPARSE_Basic(mat, coo_n, coo_i, coo_j);
224:   } else {
225:     MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j);
226:     mat->offloadmask = PETSC_OFFLOAD_CPU;
227:     /* creates the GPU memory */
228:     MatSeqAIJHIPSPARSECopyToGPU(mpiaij->A);
229:     MatSeqAIJHIPSPARSECopyToGPU(mpiaij->B);
230:     mpidev                   = static_cast<Mat_MPIAIJHIPSPARSE *>(mpiaij->spptr);
231:     mpidev->use_extended_coo = PETSC_TRUE;

233:     hipMalloc((void **)&mpidev->Ajmap1_d, (mpiaij->Annz + 1) * sizeof(PetscCount));
234:     hipMalloc((void **)&mpidev->Aperm1_d, mpiaij->Atot1 * sizeof(PetscCount));

236:     hipMalloc((void **)&mpidev->Bjmap1_d, (mpiaij->Bnnz + 1) * sizeof(PetscCount));
237:     hipMalloc((void **)&mpidev->Bperm1_d, mpiaij->Btot1 * sizeof(PetscCount));

239:     hipMalloc((void **)&mpidev->Aimap2_d, mpiaij->Annz2 * sizeof(PetscCount));
240:     hipMalloc((void **)&mpidev->Ajmap2_d, (mpiaij->Annz2 + 1) * sizeof(PetscCount));
241:     hipMalloc((void **)&mpidev->Aperm2_d, mpiaij->Atot2 * sizeof(PetscCount));

243:     hipMalloc((void **)&mpidev->Bimap2_d, mpiaij->Bnnz2 * sizeof(PetscCount));
244:     hipMalloc((void **)&mpidev->Bjmap2_d, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount));
245:     hipMalloc((void **)&mpidev->Bperm2_d, mpiaij->Btot2 * sizeof(PetscCount));

247:     hipMalloc((void **)&mpidev->Cperm1_d, mpiaij->sendlen * sizeof(PetscCount));
248:     hipMalloc((void **)&mpidev->sendbuf_d, mpiaij->sendlen * sizeof(PetscScalar));
249:     hipMalloc((void **)&mpidev->recvbuf_d, mpiaij->recvlen * sizeof(PetscScalar));

251:     hipMemcpy(mpidev->Ajmap1_d, mpiaij->Ajmap1, (mpiaij->Annz + 1) * sizeof(PetscCount), hipMemcpyHostToDevice);
252:     hipMemcpy(mpidev->Aperm1_d, mpiaij->Aperm1, mpiaij->Atot1 * sizeof(PetscCount), hipMemcpyHostToDevice);

254:     hipMemcpy(mpidev->Bjmap1_d, mpiaij->Bjmap1, (mpiaij->Bnnz + 1) * sizeof(PetscCount), hipMemcpyHostToDevice);
255:     hipMemcpy(mpidev->Bperm1_d, mpiaij->Bperm1, mpiaij->Btot1 * sizeof(PetscCount), hipMemcpyHostToDevice);

257:     hipMemcpy(mpidev->Aimap2_d, mpiaij->Aimap2, mpiaij->Annz2 * sizeof(PetscCount), hipMemcpyHostToDevice);
258:     hipMemcpy(mpidev->Ajmap2_d, mpiaij->Ajmap2, (mpiaij->Annz2 + 1) * sizeof(PetscCount), hipMemcpyHostToDevice);
259:     hipMemcpy(mpidev->Aperm2_d, mpiaij->Aperm2, mpiaij->Atot2 * sizeof(PetscCount), hipMemcpyHostToDevice);

261:     hipMemcpy(mpidev->Bimap2_d, mpiaij->Bimap2, mpiaij->Bnnz2 * sizeof(PetscCount), hipMemcpyHostToDevice);
262:     hipMemcpy(mpidev->Bjmap2_d, mpiaij->Bjmap2, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount), hipMemcpyHostToDevice);
263:     hipMemcpy(mpidev->Bperm2_d, mpiaij->Bperm2, mpiaij->Btot2 * sizeof(PetscCount), hipMemcpyHostToDevice);

265:     hipMemcpy(mpidev->Cperm1_d, mpiaij->Cperm1, mpiaij->sendlen * sizeof(PetscCount), hipMemcpyHostToDevice);
266:   }
267:   return 0;
268: }
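
/* The three kernels below implement the extended (device-side) COO value insertion:
   - MatPackCOOValues gathers the COO values destined for other ranks into a send buffer through
     the permutation perm[] (Cperm1 below).
   - MatAddLocalCOOValues accumulates locally owned COO values: for the i-th stored nonzero of the
     diagonal block, the entries kv[Aperm1[k]] with Ajmap1[i] <= k < Ajmap1[i+1] are summed into
     Aa[i]; Bjmap1[]/Bperm1[] play the same role for the off-diagonal block.
   - MatAddRemoteCOOValues adds values received from other ranks: Aimap2[i] (resp. Bimap2[i]) is the
     target nonzero, while Ajmap2[]/Aperm2[] (resp. Bjmap2[]/Bperm2[]) index into the receive buffer. */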

270: __global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
271: {
272:   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
273:   const PetscCount grid_size = gridDim.x * blockDim.x;
274:   for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
275: }

277: __global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
278: {
279:   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
280:   const PetscCount grid_size = gridDim.x * blockDim.x;
281:   for (; i < Annz + Bnnz; i += grid_size) {
282:     PetscScalar sum = 0.0;
283:     if (i < Annz) {
284:       for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
285:       Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
286:     } else {
287:       i -= Annz;
288:       for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
289:       Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
290:     }
291:   }
292: }

294: __global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
295: {
296:   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
297:   const PetscCount grid_size = gridDim.x * blockDim.x;
298:   for (; i < Annz2 + Bnnz2; i += grid_size) {
299:     if (i < Annz2) {
300:       for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
301:     } else {
302:       i -= Annz2;
303:       for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
304:     }
305:   }
306: }

308: static PetscErrorCode MatSetValuesCOO_MPIAIJHIPSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
309: {
310:   Mat_MPIAIJ          *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
311:   Mat_MPIAIJHIPSPARSE *mpidev = static_cast<Mat_MPIAIJHIPSPARSE *>(mpiaij->spptr);
312:   Mat                  A = mpiaij->A, B = mpiaij->B;
313:   PetscCount           Annz = mpiaij->Annz, Annz2 = mpiaij->Annz2, Bnnz = mpiaij->Bnnz, Bnnz2 = mpiaij->Bnnz2;
314:   PetscScalar         *Aa, *Ba = NULL;
315:   PetscScalar         *vsend = mpidev->sendbuf_d, *v2 = mpidev->recvbuf_d;
316:   const PetscScalar   *v1     = v;
317:   const PetscCount    *Ajmap1 = mpidev->Ajmap1_d, *Ajmap2 = mpidev->Ajmap2_d, *Aimap2 = mpidev->Aimap2_d;
318:   const PetscCount    *Bjmap1 = mpidev->Bjmap1_d, *Bjmap2 = mpidev->Bjmap2_d, *Bimap2 = mpidev->Bimap2_d;
319:   const PetscCount    *Aperm1 = mpidev->Aperm1_d, *Aperm2 = mpidev->Aperm2_d, *Bperm1 = mpidev->Bperm1_d, *Bperm2 = mpidev->Bperm2_d;
320:   const PetscCount    *Cperm1 = mpidev->Cperm1_d;
321:   PetscMemType         memtype;

323:   if (mpidev->use_extended_coo) {
324:     PetscMPIInt size;

326:     MPI_Comm_size(PetscObjectComm((PetscObject)mat), &size);
327:     PetscGetMemType(v, &memtype);
328:     if (PetscMemTypeHost(memtype)) { /* If the user gave v[] in host memory, we need to copy it to the device */
329:       hipMalloc((void **)&v1, mpiaij->coo_n * sizeof(PetscScalar));
330:       hipMemcpy((void *)v1, v, mpiaij->coo_n * sizeof(PetscScalar), hipMemcpyHostToDevice);
331:     }

333:     if (imode == INSERT_VALUES) {
334:       MatSeqAIJHIPSPARSEGetArrayWrite(A, &Aa); /* write matrix values */
335:       MatSeqAIJHIPSPARSEGetArrayWrite(B, &Ba);
336:     } else {
337:       MatSeqAIJHIPSPARSEGetArray(A, &Aa); /* read & write matrix values */
338:       MatSeqAIJHIPSPARSEGetArray(B, &Ba);
339:     }

341:     /* Pack entries to be sent to remote */
342:     if (mpiaij->sendlen) {
343:       hipLaunchKernelGGL(HIP_KERNEL_NAME(MatPackCOOValues), dim3((mpiaij->sendlen + 255) / 256), dim3(256), 0, PetscDefaultHipStream, v1, mpiaij->sendlen, Cperm1, vsend);
344:       hipPeekAtLastError();
345:     }

347:     /* Send remote entries to their owner and overlap the communication with local computation */
348:     PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_HIP, vsend, PETSC_MEMTYPE_HIP, v2, MPI_REPLACE);
349:     /* Add local entries to A and B */
350:     if (Annz + Bnnz > 0) {
351:       hipLaunchKernelGGL(HIP_KERNEL_NAME(MatAddLocalCOOValues), dim3((Annz + Bnnz + 255) / 256), dim3(256), 0, PetscDefaultHipStream, v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
352:       hipPeekAtLastError();
353:     }
354:     PetscSFReduceEnd(mpiaij->coo_sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE);

356:     /* Add received remote entries to A and B */
357:     if (Annz2 + Bnnz2 > 0) {
358:       hipLaunchKernelGGL(HIP_KERNEL_NAME(MatAddRemoteCOOValues), dim3((Annz2 + Bnnz2 + 255) / 256), dim3(256), 0, PetscDefaultHipStream, v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
359:       hipPeekAtLastError();
360:     }

362:     if (imode == INSERT_VALUES) {
363:       MatSeqAIJHIPSPARSERestoreArrayWrite(A, &Aa);
364:       MatSeqAIJHIPSPARSERestoreArrayWrite(B, &Ba);
365:     } else {
366:       MatSeqAIJHIPSPARSERestoreArray(A, &Aa);
367:       MatSeqAIJHIPSPARSERestoreArray(B, &Ba);
368:     }
369:     if (PetscMemTypeHost(memtype)) hipFree((void *)v1);
370:   } else {
371:     MatSetValuesCOO_MPIAIJHIPSPARSE_Basic(mat, v, imode);
372:   }
373:   mat->offloadmask = PETSC_OFFLOAD_GPU;
374:   return 0;
375: }
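
A minimal sketch of how user code typically drives the COO assembly path implemented above; the
communicator, matrix sizes, and the three sample entries are illustrative assumptions, not taken
from this file, and the value array may reside in host or HIP device memory:

  Mat         C;
  PetscInt    coo_i[] = {0, 0, 1};        /* global row indices contributed by this rank */
  PetscInt    coo_j[] = {0, 2, 1};        /* matching global column indices */
  PetscScalar coo_v[] = {1.0, 2.0, 3.0};  /* values, in host or device memory */

  MatCreate(PETSC_COMM_WORLD, &C);
  MatSetSizes(C, PETSC_DECIDE, PETSC_DECIDE, 10, 10);
  MatSetType(C, MATMPIAIJHIPSPARSE);
  MatSetPreallocationCOO(C, 3, coo_i, coo_j); /* dispatches to MatSetPreallocationCOO_MPIAIJHIPSPARSE() */
  MatSetValuesCOO(C, coo_v, ADD_VALUES);      /* dispatches to MatSetValuesCOO_MPIAIJHIPSPARSE() */
  MatDestroy(&C);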

377: static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJHIPSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
378: {
379:   Mat             Ad, Ao;
380:   const PetscInt *cmap;

382:   MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap);
383:   MatSeqAIJHIPSPARSEMergeMats(Ad, Ao, scall, A_loc);
384:   if (glob) {
385:     PetscInt cst, i, dn, on, *gidx;

387:     MatGetLocalSize(Ad, NULL, &dn);
388:     MatGetLocalSize(Ao, NULL, &on);
389:     MatGetOwnershipRangeColumn(A, &cst, NULL);
390:     PetscMalloc1(dn + on, &gidx);
391:     for (i = 0; i < dn; i++) gidx[i] = cst + i;
392:     for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
393:     ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob);
394:   }
395:   return 0;
396: }

398: PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJHIPSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
399: {
400:   Mat_MPIAIJ          *b               = (Mat_MPIAIJ *)B->data;
401:   Mat_MPIAIJHIPSPARSE *hipsparseStruct = (Mat_MPIAIJHIPSPARSE *)b->spptr;
402:   PetscInt             i;

404:   PetscLayoutSetUp(B->rmap);
405:   PetscLayoutSetUp(B->cmap);
406:   if (PetscDefined(USE_DEBUG) && d_nnz) {
407:     for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
408:   }
409:   if (PetscDefined(USE_DEBUG) && o_nnz) {
410:     for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
411:   }
412: #if defined(PETSC_USE_CTABLE)
413:   PetscTableDestroy(&b->colmap);
414: #else
415:   PetscFree(b->colmap);
416: #endif
417:   PetscFree(b->garray);
418:   VecDestroy(&b->lvec);
419:   VecScatterDestroy(&b->Mvctx);
420:   /* Because B will have been resized we simply destroy it and create a new one each time */
421:   MatDestroy(&b->B);
422:   if (!b->A) {
423:     MatCreate(PETSC_COMM_SELF, &b->A);
424:     MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n);
425:     PetscLogObjectParent((PetscObject)B, (PetscObject)b->A);
426:   }
427:   if (!b->B) {
428:     PetscMPIInt size;
429:     MPI_Comm_size(PetscObjectComm((PetscObject)B), &size);
430:     MatCreate(PETSC_COMM_SELF, &b->B);
431:     MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0);
432:     PetscLogObjectParent((PetscObject)B, (PetscObject)b->B);
433:   }
434:   MatSetType(b->A, MATSEQAIJHIPSPARSE);
435:   MatSetType(b->B, MATSEQAIJHIPSPARSE);
436:   MatBindToCPU(b->A, B->boundtocpu);
437:   MatBindToCPU(b->B, B->boundtocpu);
438:   MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz);
439:   MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz);
440:   MatHIPSPARSESetFormat(b->A, MAT_HIPSPARSE_MULT, hipsparseStruct->diagGPUMatFormat);
441:   MatHIPSPARSESetFormat(b->B, MAT_HIPSPARSE_MULT, hipsparseStruct->offdiagGPUMatFormat);
442:   B->preallocated = PETSC_TRUE;
443:   return 0;
444: }
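
/* Overlap communication with computation: the halo exchange into a->lvec is started, the
   diagonal-block product is computed while the scatter is in flight, and the off-diagonal block
   then adds its contribution from the ghosted vector. */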

446: PetscErrorCode MatMult_MPIAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
447: {
448:   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

450:   VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD);
451:   (*a->A->ops->mult)(a->A, xx, yy);
452:   VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD);
453:   (*a->B->ops->multadd)(a->B, a->lvec, yy, yy);
454:   return 0;
455: }

457: PetscErrorCode MatZeroEntries_MPIAIJHIPSPARSE(Mat A)
458: {
459:   Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;

461:   MatZeroEntries(l->A);
462:   MatZeroEntries(l->B);
463:   return 0;
464: }

466: PetscErrorCode MatMultAdd_MPIAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
467: {
468:   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

470:   VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD);
471:   (*a->A->ops->multadd)(a->A, xx, yy, zz);
472:   VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD);
473:   (*a->B->ops->multadd)(a->B, a->lvec, zz, zz);
474:   return 0;
475: }

477: PetscErrorCode MatMultTranspose_MPIAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
478: {
479:   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

481:   (*a->B->ops->multtranspose)(a->B, xx, a->lvec);
482:   (*a->A->ops->multtranspose)(a->A, xx, yy);
483:   VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE);
484:   VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE);
485:   return 0;
486: }

488: PetscErrorCode MatHIPSPARSESetFormat_MPIAIJHIPSPARSE(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
489: {
490:   Mat_MPIAIJ          *a               = (Mat_MPIAIJ *)A->data;
491:   Mat_MPIAIJHIPSPARSE *hipsparseStruct = (Mat_MPIAIJHIPSPARSE *)a->spptr;

493:   switch (op) {
494:   case MAT_HIPSPARSE_MULT_DIAG:
495:     hipsparseStruct->diagGPUMatFormat = format;
496:     break;
497:   case MAT_HIPSPARSE_MULT_OFFDIAG:
498:     hipsparseStruct->offdiagGPUMatFormat = format;
499:     break;
500:   case MAT_HIPSPARSE_ALL:
501:     hipsparseStruct->diagGPUMatFormat    = format;
502:     hipsparseStruct->offdiagGPUMatFormat = format;
503:     break;
504:   default:
505:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatHIPSPARSEFormatOperation. Only MAT_HIPSPARSE_MULT_DIAG, MAT_HIPSPARSE_MULT_OFFDIAG, and MAT_HIPSPARSE_ALL are currently supported.", op);
506:   }
507:   return 0;
508: }

510: PetscErrorCode MatSetFromOptions_MPIAIJHIPSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
511: {
512:   MatHIPSPARSEStorageFormat format;
513:   PetscBool                 flg;
514:   Mat_MPIAIJ               *a               = (Mat_MPIAIJ *)A->data;
515:   Mat_MPIAIJHIPSPARSE      *hipsparseStruct = (Mat_MPIAIJHIPSPARSE *)a->spptr;

517:   PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJHIPSPARSE options");
518:   if (A->factortype == MAT_FACTOR_NONE) {
519:     PetscOptionsEnum("-mat_hipsparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg);
520:     if (flg) MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_MULT_DIAG, format);
521:     PetscOptionsEnum("-mat_hipsparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks of (mpi)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg);
522:     if (flg) MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_MULT_OFFDIAG, format);
523:     PetscOptionsEnum("-mat_hipsparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks of (mpi)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg);
524:     if (flg) MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_ALL, format);
525:   }
526:   PetscOptionsHeadEnd();
527:   return 0;
528: }

530: PetscErrorCode MatAssemblyEnd_MPIAIJHIPSPARSE(Mat A, MatAssemblyType mode)
531: {
532:   Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)A->data;
533:   Mat_MPIAIJHIPSPARSE *cusp   = (Mat_MPIAIJHIPSPARSE *)mpiaij->spptr;
534:   PetscObjectState     onnz   = A->nonzerostate;

536:   MatAssemblyEnd_MPIAIJ(A, mode);
537:   if (mpiaij->lvec) VecSetType(mpiaij->lvec, VECSEQHIP);
538:   if (onnz != A->nonzerostate && cusp->deviceMat) {
539:     PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;

541:     PetscInfo(A, "Destroy device mat since nonzerostate changed\n");
542:     PetscNew(&h_mat);
543:     hipMemcpy(h_mat, d_mat, sizeof(*d_mat), hipMemcpyDeviceToHost);
544:     hipFree(h_mat->colmap);
545:     if (h_mat->allocated_indices) {
546:       hipFree(h_mat->diag.i);
547:       hipFree(h_mat->diag.j);
548:       if (h_mat->offdiag.j) {
549:         hipFree(h_mat->offdiag.i);
550:         hipFree(h_mat->offdiag.j);
551:       }
552:     }
553:     hipFree(d_mat);
554:     PetscFree(h_mat);
555:     cusp->deviceMat = NULL;
556:   }
557:   return 0;
558: }

560: PetscErrorCode MatDestroy_MPIAIJHIPSPARSE(Mat A)
561: {
562:   Mat_MPIAIJ          *aij             = (Mat_MPIAIJ *)A->data;
563:   Mat_MPIAIJHIPSPARSE *hipsparseStruct = (Mat_MPIAIJHIPSPARSE *)aij->spptr;

566:   if (hipsparseStruct->deviceMat) {
567:     PetscSplitCSRDataStructure d_mat = hipsparseStruct->deviceMat, h_mat;

569:     PetscInfo(A, "Have device matrix\n");
570:     PetscNew(&h_mat);
571:     hipMemcpy(h_mat, d_mat, sizeof(*d_mat), hipMemcpyDeviceToHost);
572:     hipFree(h_mat->colmap);
573:     if (h_mat->allocated_indices) {
574:       hipFree(h_mat->diag.i);
575:       hipFree(h_mat->diag.j);
576:       if (h_mat->offdiag.j) {
577:         hipFree(h_mat->offdiag.i);
578:         hipFree(h_mat->offdiag.j);
579:       }
580:     }
581:     hipFree(d_mat);
582:     PetscFree(h_mat);
583:   }
584:   /* Free COO */
585:   MatResetPreallocationCOO_MPIAIJHIPSPARSE(A);
586:   delete hipsparseStruct;
587:   PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL);
588:   PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL);
589:   PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL);
590:   PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL);
591:   PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetFormat_C", NULL);
592:   PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijhipsparse_hypre_C", NULL);
593:   MatDestroy_MPIAIJ(A);
594:   return 0;
595: }

597: PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJHIPSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat *newmat)
598: {
599:   Mat_MPIAIJ *a;
600:   Mat         A;

602:   PetscDeviceInitialize(PETSC_DEVICE_HIP);
603:   if (reuse == MAT_INITIAL_MATRIX) MatDuplicate(B, MAT_COPY_VALUES, newmat);
604:   else if (reuse == MAT_REUSE_MATRIX) MatCopy(B, *newmat, SAME_NONZERO_PATTERN);
605:   A             = *newmat;
606:   A->boundtocpu = PETSC_FALSE;
607:   PetscFree(A->defaultvectype);
608:   PetscStrallocpy(VECHIP, &A->defaultvectype);

610:   a = (Mat_MPIAIJ *)A->data;
611:   if (a->A) MatSetType(a->A, MATSEQAIJHIPSPARSE);
612:   if (a->B) MatSetType(a->B, MATSEQAIJHIPSPARSE);
613:   if (a->lvec) VecSetType(a->lvec, VECSEQHIP);

615:   if (reuse != MAT_REUSE_MATRIX && !a->spptr) a->spptr = new Mat_MPIAIJHIPSPARSE;

617:   A->ops->assemblyend           = MatAssemblyEnd_MPIAIJHIPSPARSE;
618:   A->ops->mult                  = MatMult_MPIAIJHIPSPARSE;
619:   A->ops->multadd               = MatMultAdd_MPIAIJHIPSPARSE;
620:   A->ops->multtranspose         = MatMultTranspose_MPIAIJHIPSPARSE;
621:   A->ops->setfromoptions        = MatSetFromOptions_MPIAIJHIPSPARSE;
622:   A->ops->destroy               = MatDestroy_MPIAIJHIPSPARSE;
623:   A->ops->zeroentries           = MatZeroEntries_MPIAIJHIPSPARSE;
624:   A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

626:   PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJHIPSPARSE);
627:   PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJHIPSPARSE);
628:   PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJHIPSPARSE);
629:   PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetFormat_C", MatHIPSPARSESetFormat_MPIAIJHIPSPARSE);
630:   PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJHIPSPARSE);
631:   PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJHIPSPARSE);
632: #if defined(PETSC_HAVE_HYPRE)
633:   PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijhipsparse_hypre_C", MatConvert_AIJ_HYPRE);
634: #endif
635:   return 0;
636: }

638: PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJHIPSPARSE(Mat A)
639: {
640:   PetscDeviceInitialize(PETSC_DEVICE_HIP);
641:   MatCreate_MPIAIJ(A);
642:   MatConvert_MPIAIJ_MPIAIJHIPSPARSE(A, MATMPIAIJHIPSPARSE, MAT_INPLACE_MATRIX, &A);
643:   return 0;
644: }

646: /*@
647:    MatCreateAIJHIPSPARSE - Creates a sparse matrix in AIJ (compressed row) format
648:    (the default parallel PETSc format).  This matrix will ultimately be pushed down
649:    to AMD GPUs and use the HIPSPARSE library for calculations. For good matrix
650:    assembly performance the user should preallocate the matrix storage by setting
651:    the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By setting these parameters accurately,
652:    performance during matrix assembly can be increased by more than a factor of 50.

654:    Collective

656:    Input Parameters:
657: +  comm - MPI communicator
658: .  m - number of local rows (or PETSC_DECIDE)
659: .  n - number of local columns (or PETSC_DECIDE)
660: .  M,N - global numbers of rows and columns (or PETSC_DETERMINE)
661: .  d_nz,o_nz - number of nonzeros per row in the diagonal and off-diagonal blocks (same for all local rows)
662: -  d_nnz,o_nnz - arrays containing the number of nonzeros per row in the diagonal and off-diagonal blocks (possibly different for each row), or NULL

664:    Output Parameter:
665: .  A - the matrix

667:    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
668:    MatXXXXSetPreallocation() paradigm instead of this routine directly.
669:    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

671:    Notes:
672:    If d_nnz and o_nnz are given then d_nz and o_nz are ignored

674:    The AIJ format (also called the Yale sparse matrix format or
675:    compressed row storage), is fully compatible with standard Fortran 77
676:    storage.  That is, the stored row and column indices can begin at
677:    either one (as in Fortran) or zero.  See the users' manual for details.

679:    Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
680:    Set d_nz=PETSC_DEFAULT, o_nz=PETSC_DEFAULT and d_nnz=NULL, o_nnz=NULL for PETSc to control dynamic memory
681:    allocation.  For large problems you MUST preallocate memory or you
682:    will get TERRIBLE performance, see the users' manual chapter on matrices.

684:    By default, this format uses inodes (identical nodes) when possible, to
685:    improve numerical efficiency of matrix-vector products and solves. We
686:    search for consecutive rows with the same nonzero structure, thereby
687:    reusing matrix information to achieve increased efficiency.

689:    Level: intermediate

691: .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATMPIAIJHIPSPARSE`, `MATAIJHIPSPARSE`
692: @*/
693: PetscErrorCode MatCreateAIJHIPSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
694: {
695:   PetscMPIInt size;

697:   MatCreate(comm, A);
698:   MatSetSizes(*A, m, n, M, N);
699:   MPI_Comm_size(comm, &size);
700:   if (size > 1) {
701:     MatSetType(*A, MATMPIAIJHIPSPARSE);
702:     MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz);
703:   } else {
704:     MatSetType(*A, MATSEQAIJHIPSPARSE);
705:     MatSeqAIJSetPreallocation(*A, d_nz, d_nnz);
706:   }
707:   return 0;
708: }
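
A minimal usage sketch for the routine above; the communicator, global sizes, and per-row nonzero
estimates are illustrative assumptions:

  Mat A;

  MatCreateAIJHIPSPARSE(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, 100, 100, 5, NULL, 2, NULL, &A);
  /* fill with MatSetValues() and MatAssemblyBegin()/MatAssemblyEnd() as usual; SpMV then runs on the GPU */
  MatDestroy(&A);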

710: /*MC
711:    MATAIJHIPSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJHIPSPARSE.

713:    A matrix type whose data resides on AMD GPUs. These matrices can be in
714:    CSR, ELL, or Hybrid format. All matrix calculations are performed on AMD GPUs using the HIPSPARSE library.

716:    This matrix type is identical to MATSEQAIJHIPSPARSE when constructed with a single process communicator,
717:    and MATMPIAIJHIPSPARSE otherwise.  As a result, for single process communicators,
718:    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
719:    for communicators controlling multiple processes.  It is recommended that you call both of
720:    the above preallocation routines for simplicity.

722:    Options Database Keys:
723: +  -mat_type mpiaijhipsparse - sets the matrix type to "mpiaijhipsparse" during a call to MatSetFromOptions()
724: .  -mat_hipsparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
725: .  -mat_hipsparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
726: -  -mat_hipsparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

728:   Level: beginner

730:  .seealso: `MatCreateAIJHIPSPARSE()`, `MATSEQAIJHIPSPARSE`, `MATMPIAIJHIPSPARSE`, `MatCreateSeqAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
731: M*/
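
A sketch of the recommended MatCreate()/MatSetType()/MatSetFromOptions() paradigm for this type;
the sizes and per-row nonzero estimates are illustrative assumptions, and the options mentioned are
the database keys documented above:

  Mat B;

  MatCreate(PETSC_COMM_WORLD, &B);
  MatSetSizes(B, PETSC_DECIDE, PETSC_DECIDE, 100, 100);
  MatSetType(B, MATAIJHIPSPARSE);                 /* or select it at runtime with -mat_type mpiaijhipsparse */
  MatSetFromOptions(B);                           /* picks up e.g. -mat_hipsparse_storage_format ell */
  MatSeqAIJSetPreallocation(B, 5, NULL);          /* call both preallocation routines, as recommended above */
  MatMPIAIJSetPreallocation(B, 5, NULL, 2, NULL);
  MatHIPSPARSESetFormat(B, MAT_HIPSPARSE_ALL, MAT_HIPSPARSE_CSR); /* programmatic equivalent of -mat_hipsparse_storage_format csr */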

733: /*MC
734:    MATMPIAIJHIPSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJHIPSPARSE.

736:   Level: beginner

738:  .seealso: `MATAIJHIPSPARSE`, `MATSEQAIJHIPSPARSE`
739: M*/

741: // get GPU pointers to a stripped-down Mat; works for both Seq and MPI Mat.
742: PetscErrorCode MatHIPSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
743: {
744:   PetscSplitCSRDataStructure d_mat;
745:   PetscMPIInt                size;
746:   int                       *ai = NULL, *bi = NULL, *aj = NULL, *bj = NULL;
747:   PetscScalar               *aa = NULL, *ba = NULL;
748:   Mat_SeqAIJ                *jaca = NULL, *jacb = NULL;
749:   Mat_SeqAIJHIPSPARSE       *hipsparsestructA = NULL;
750:   CsrMatrix                 *matrixA = NULL, *matrixB = NULL;

753:   if (A->factortype != MAT_FACTOR_NONE) {
754:     *B = NULL;
755:     return 0;
756:   }
757:   MPI_Comm_size(PetscObjectComm((PetscObject)A), &size);
758:   // get jaca
759:   if (size == 1) {
760:     PetscBool isseqaij;

762:     PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &isseqaij);
763:     if (isseqaij) {
764:       jaca = (Mat_SeqAIJ *)A->data;
766:       hipsparsestructA = (Mat_SeqAIJHIPSPARSE *)A->spptr;
767:       d_mat            = hipsparsestructA->deviceMat;
768:       MatSeqAIJHIPSPARSECopyToGPU(A);
769:     } else {
770:       Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
772:       Mat_MPIAIJHIPSPARSE *spptr = (Mat_MPIAIJHIPSPARSE *)aij->spptr;
773:       jaca                       = (Mat_SeqAIJ *)aij->A->data;
774:       hipsparsestructA           = (Mat_SeqAIJHIPSPARSE *)aij->A->spptr;
775:       d_mat                      = spptr->deviceMat;
776:       MatSeqAIJHIPSPARSECopyToGPU(aij->A);
777:     }
778:     if (hipsparsestructA->format == MAT_HIPSPARSE_CSR) {
779:       Mat_SeqAIJHIPSPARSEMultStruct *matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestructA->mat;
781:       matrixA = (CsrMatrix *)matstruct->mat;
782:       bi      = NULL;
783:       bj      = NULL;
784:       ba      = NULL;
785:     } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat needs MAT_HIPSPARSE_CSR");
786:   } else {
787:     Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
789:     jaca                       = (Mat_SeqAIJ *)aij->A->data;
790:     jacb                       = (Mat_SeqAIJ *)aij->B->data;
791:     Mat_MPIAIJHIPSPARSE *spptr = (Mat_MPIAIJHIPSPARSE *)aij->spptr;

794:     hipsparsestructA                      = (Mat_SeqAIJHIPSPARSE *)aij->A->spptr;
795:     Mat_SeqAIJHIPSPARSE *hipsparsestructB = (Mat_SeqAIJHIPSPARSE *)aij->B->spptr;
797:     if (hipsparsestructB->format == MAT_HIPSPARSE_CSR) {
798:       MatSeqAIJHIPSPARSECopyToGPU(aij->A);
799:       MatSeqAIJHIPSPARSECopyToGPU(aij->B);
800:       Mat_SeqAIJHIPSPARSEMultStruct *matstructA = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestructA->mat;
801:       Mat_SeqAIJHIPSPARSEMultStruct *matstructB = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestructB->mat;
804:       matrixA = (CsrMatrix *)matstructA->mat;
805:       matrixB = (CsrMatrix *)matstructB->mat;
806:       if (jacb->compressedrow.use) {
807:         if (!hipsparsestructB->rowoffsets_gpu) {
808:           hipsparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
809:           hipsparsestructB->rowoffsets_gpu->assign(jacb->i, jacb->i + A->rmap->n + 1);
810:         }
811:         bi = thrust::raw_pointer_cast(hipsparsestructB->rowoffsets_gpu->data());
812:       } else {
813:         bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
814:       }
815:       bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
816:       ba = thrust::raw_pointer_cast(matrixB->values->data());
817:     } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat B needs MAT_HIPSPARSE_CSR");
818:     d_mat = spptr->deviceMat;
819:   }
820:   if (jaca->compressedrow.use) {
821:     if (!hipsparsestructA->rowoffsets_gpu) {
822:       hipsparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
823:       hipsparsestructA->rowoffsets_gpu->assign(jaca->i, jaca->i + A->rmap->n + 1);
824:     }
825:     ai = thrust::raw_pointer_cast(hipsparsestructA->rowoffsets_gpu->data());
826:   } else {
827:     ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
828:   }
829:   aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
830:   aa = thrust::raw_pointer_cast(matrixA->values->data());

832:   if (!d_mat) {
833:     PetscSplitCSRDataStructure h_mat;

835:     // create and populate the struct on the host, then copy it to the device
836:     PetscInfo(A, "Create device matrix\n");
837:     PetscNew(&h_mat);
838:     hipMalloc((void **)&d_mat, sizeof(*d_mat));
839:     if (size > 1) { /* need the colmap array */
840:       Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
841:       PetscInt   *colmap;
842:       PetscInt    ii, n = aij->B->cmap->n, N = A->cmap->N;


846:       PetscCalloc1(N + 1, &colmap);
847:       for (ii = 0; ii < n; ii++) colmap[aij->garray[ii]] = ii + 1;
848: #if defined(PETSC_USE_64BIT_INDICES)
849:       { // have to make a 64-bit (PetscInt) version of these 32-bit indices
850:         int      *h_bi32, *h_bj32;
851:         PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
852:         PetscCalloc4(A->rmap->n + 1, &h_bi32, jacb->nz, &h_bj32, A->rmap->n + 1, &h_bi64, jacb->nz, &h_bj64);
853:         hipMemcpy(h_bi32, bi, (A->rmap->n + 1) * sizeof(*h_bi32), hipMemcpyDeviceToHost);
854:         for (int i = 0; i < A->rmap->n + 1; i++) h_bi64[i] = h_bi32[i];
855:         hipMemcpy(h_bj32, bj, jacb->nz * sizeof(*h_bj32), hipMemcpyDeviceToHost);
856:         for (int i = 0; i < jacb->nz; i++) h_bj64[i] = h_bj32[i];

858:         hipMalloc((void **)&d_bi64, (A->rmap->n + 1) * sizeof(*d_bi64));
859:         hipMemcpy(d_bi64, h_bi64, (A->rmap->n + 1) * sizeof(*d_bi64), hipMemcpyHostToDevice);
860:         hipMalloc((void **)&d_bj64, jacb->nz * sizeof(*d_bj64));
861:         hipMemcpy(d_bj64, h_bj64, jacb->nz * sizeof(*d_bj64), hipMemcpyHostToDevice);

863:         h_mat->offdiag.i         = d_bi64;
864:         h_mat->offdiag.j         = d_bj64;
865:         h_mat->allocated_indices = PETSC_TRUE;

867:         PetscFree4(h_bi32, h_bj32, h_bi64, h_bj64);
868:       }
869: #else
870:       h_mat->offdiag.i         = (PetscInt *)bi;
871:       h_mat->offdiag.j         = (PetscInt *)bj;
872:       h_mat->allocated_indices = PETSC_FALSE;
873: #endif
874:       h_mat->offdiag.a = ba;
875:       h_mat->offdiag.n = A->rmap->n;

877:       hipMalloc((void **)&h_mat->colmap, (N + 1) * sizeof(*h_mat->colmap));
878:       hipMemcpy(h_mat->colmap, colmap, (N + 1) * sizeof(*h_mat->colmap), hipMemcpyHostToDevice);
879:       PetscFree(colmap);
880:     }
881:     h_mat->rstart = A->rmap->rstart;
882:     h_mat->rend   = A->rmap->rend;
883:     h_mat->cstart = A->cmap->rstart;
884:     h_mat->cend   = A->cmap->rend;
885:     h_mat->M      = A->cmap->N;
886: #if defined(PETSC_USE_64BIT_INDICES)
887:     {
888:       int      *h_ai32, *h_aj32;
889:       PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;
890:       PetscCalloc4(A->rmap->n + 1, &h_ai32, jaca->nz, &h_aj32, A->rmap->n + 1, &h_ai64, jaca->nz, &h_aj64);
891:       hipMemcpy(h_ai32, ai, (A->rmap->n + 1) * sizeof(*h_ai32), hipMemcpyDeviceToHost);
892:       for (int i = 0; i < A->rmap->n + 1; i++) h_ai64[i] = h_ai32[i];
893:       hipMemcpy(h_aj32, aj, jaca->nz * sizeof(*h_aj32), hipMemcpyDeviceToHost);
894:       for (int i = 0; i < jaca->nz; i++) h_aj64[i] = h_aj32[i];

896:       hipMalloc((void **)&d_ai64, (A->rmap->n + 1) * sizeof(*d_ai64));
897:       hipMemcpy(d_ai64, h_ai64, (A->rmap->n + 1) * sizeof(*d_ai64), hipMemcpyHostToDevice);
898:       hipMalloc((void **)&d_aj64, jaca->nz * sizeof(*d_aj64));
899:       hipMemcpy(d_aj64, h_aj64, jaca->nz * sizeof(*d_aj64), hipMemcpyHostToDevice);

901:       h_mat->diag.i            = d_ai64;
902:       h_mat->diag.j            = d_aj64;
903:       h_mat->allocated_indices = PETSC_TRUE;

905:       PetscFree4(h_ai32, h_aj32, h_ai64, h_aj64);
906:     }
907: #else
908:     h_mat->diag.i            = (PetscInt *)ai;
909:     h_mat->diag.j            = (PetscInt *)aj;
910:     h_mat->allocated_indices = PETSC_FALSE;
911: #endif
912:     h_mat->diag.a = aa;
913:     h_mat->diag.n = A->rmap->n;
914:     h_mat->rank   = PetscGlobalRank;
915:     // copy pointers and metadata to device
916:     hipMemcpy(d_mat, h_mat, sizeof(*d_mat), hipMemcpyHostToDevice);
917:     PetscFree(h_mat);
918:   } else {
919:     PetscInfo(A, "Reusing device matrix\n");
920:   }
921:   *B             = d_mat;
922:   A->offloadmask = PETSC_OFFLOAD_GPU;
923:   return 0;
924: }