Actual source code: mpisellcuda.cu
#include <petscconf.h>
#include <petscdevice.h>
#include <../src/mat/impls/sell/mpi/mpisell.h>
/* Preallocate the diagonal (A) and off-diagonal (B) blocks as sequential SELL CUDA matrices */
static PetscErrorCode MatMPISELLSetPreallocation_MPISELLCUDA(Mat B, PetscInt d_rlenmax, const PetscInt d_rlen[], PetscInt o_rlenmax, const PetscInt o_rlen[])
{
  Mat_MPISELL *b = (Mat_MPISELL *)B->data;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));

  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQSELLCUDA matrices. */
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
    PetscCall(MatBindToCPU(b->A, B->boundtocpu));
    PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
    PetscCall(MatSetType(b->A, MATSEQSELLCUDA));
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
    PetscCall(MatBindToCPU(b->B, B->boundtocpu));
    PetscCall(MatSetSizes(b->B, B->rmap->n, B->cmap->N, B->rmap->n, B->cmap->N));
    PetscCall(MatSetType(b->B, MATSEQSELLCUDA));
  }
  PetscCall(MatSeqSELLSetPreallocation(b->A, d_rlenmax, d_rlen));
  PetscCall(MatSeqSELLSetPreallocation(b->B, o_rlenmax, o_rlen));
  B->preallocated  = PETSC_TRUE;
  B->was_assembled = PETSC_FALSE;
  B->assembled     = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
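/*
  Illustrative sketch (not part of the original source): typical user-side
  preallocation of a MATMPISELLCUDA matrix. The local sizes and row-length
  arrays below are hypothetical.

    Mat      A;
    PetscInt d_rlen[3] = {3, 3, 2}; // nonzeros per local row, diagonal block
    PetscInt o_rlen[3] = {1, 0, 1}; // nonzeros per local row, off-diagonal block

    PetscCall(MatCreate(PETSC_COMM_WORLD, &A));
    PetscCall(MatSetSizes(A, 3, 3, PETSC_DETERMINE, PETSC_DETERMINE));
    PetscCall(MatSetType(A, MATMPISELLCUDA));
    PetscCall(MatMPISELLSetPreallocation(A, PETSC_DEFAULT, d_rlen, PETSC_DEFAULT, o_rlen));
*/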
static PetscErrorCode MatMult_MPISELLCUDA(Mat A, Vec xx, Vec yy)
{
  Mat_MPISELL *a = (Mat_MPISELL *)A->data;
  PetscInt     nt;

  PetscFunctionBegin;
  PetscCall(VecGetLocalSize(xx, &nt));
  PetscCheck(nt == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")", A->cmap->n, nt);
  /* Overlap the scatter of ghost values with the diagonal-block product */
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->mult)(a->A, xx, yy));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
  PetscFunctionReturn(PETSC_SUCCESS);
}
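/*
  Illustrative sketch (assumed usage, not from the original source): once the
  matrix is assembled, vectors obtained with MatCreateVecs() inherit the
  matrix's default vector type (VECCUDA here), so MatMult() runs on the GPU.

    Vec x, y;
    PetscCall(MatCreateVecs(A, &x, &y));
    PetscCall(VecSet(x, 1.0));
    PetscCall(MatMult(A, x, y));
*/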
static PetscErrorCode MatZeroEntries_MPISELLCUDA(Mat A)
{
  Mat_MPISELL *l = (Mat_MPISELL *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(l->A));
  PetscCall(MatZeroEntries(l->B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
static PetscErrorCode MatMultAdd_MPISELLCUDA(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_MPISELL *a = (Mat_MPISELL *)A->data;
  PetscInt     nt;

  PetscFunctionBegin;
  PetscCall(VecGetLocalSize(xx, &nt));
  PetscCheck(nt == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")", A->cmap->n, nt);
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
  PetscFunctionReturn(PETSC_SUCCESS);
}
static PetscErrorCode MatMultTranspose_MPISELLCUDA(Mat A, Vec xx, Vec yy)
{
  Mat_MPISELL *a = (Mat_MPISELL *)A->data;
  PetscInt     nt;

  PetscFunctionBegin;
  PetscCall(VecGetLocalSize(xx, &nt));
  PetscCheck(nt == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")", A->rmap->n, nt);
  PetscUseTypeMethod(a->B, multtranspose, xx, a->lvec);
  PetscUseTypeMethod(a->A, multtranspose, xx, yy);
  PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
static PetscErrorCode MatSetFromOptions_MPISELLCUDA(Mat, PetscOptionItems *)
{
  return PETSC_SUCCESS;
}
static PetscErrorCode MatAssemblyEnd_MPISELLCUDA(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPISELL(A, mode));
  /* On the first final assembly, move the local scatter vector to the GPU */
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) PetscCall(VecSetType(((Mat_MPISELL *)A->data)->lvec, VECSEQCUDA));
  PetscFunctionReturn(PETSC_SUCCESS);
}
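/*
  Sketch of the assembly sequence that reaches MatAssemblyEnd_MPISELLCUDA
  (row, col, and the value are hypothetical); on the first MAT_FINAL_ASSEMBLY
  the local scatter vector lvec is retyped to VECSEQCUDA.

    PetscCall(MatSetValue(A, row, col, 1.0, INSERT_VALUES));
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
*/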
static PetscErrorCode MatDestroy_MPISELLCUDA(Mat A)
{
  PetscFunctionBegin;
  PetscCall(MatDestroy_MPISELL(A));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPISELLSetPreallocation_C", NULL));
  PetscFunctionReturn(PETSC_SUCCESS);
}
PETSC_INTERN PetscErrorCode MatConvert_MPISELL_MPISELLCUDA(Mat B, MatType, MatReuse reuse, Mat *newmat)
{
  Mat_MPISELL *a;
  Mat          A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
  A             = *newmat;
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));

  /* Retype the existing sub-blocks and the local scatter vector for the GPU */
  a = (Mat_MPISELL *)A->data;
  if (a->A) PetscCall(MatSetType(a->A, MATSEQSELLCUDA));
  if (a->B) PetscCall(MatSetType(a->B, MATSEQSELLCUDA));
  if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));

  A->ops->assemblyend    = MatAssemblyEnd_MPISELLCUDA;
  A->ops->mult           = MatMult_MPISELLCUDA;
  A->ops->multadd        = MatMultAdd_MPISELLCUDA;
  A->ops->multtranspose  = MatMultTranspose_MPISELLCUDA;
  A->ops->setfromoptions = MatSetFromOptions_MPISELLCUDA;
  A->ops->destroy        = MatDestroy_MPISELLCUDA;
  A->ops->zeroentries    = MatZeroEntries_MPISELLCUDA;

  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPISELLCUDA));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPISELLSetPreallocation_C", MatMPISELLSetPreallocation_MPISELLCUDA));
  PetscFunctionReturn(PETSC_SUCCESS);
}
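/*
  Illustrative sketch (hypothetical matrix names): converting an existing host
  MPISELL matrix to the CUDA type, either into a new matrix or in place.

    Mat Agpu;
    PetscCall(MatConvert(Ahost, MATMPISELLCUDA, MAT_INITIAL_MATRIX, &Agpu));
    PetscCall(MatConvert(Ahost, MATMPISELLCUDA, MAT_INPLACE_MATRIX, &Ahost));
*/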
PETSC_EXTERN PetscErrorCode MatCreate_MPISELLCUDA(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  PetscCall(MatCreate_MPISELL(A));
  PetscCall(MatConvert_MPISELL_MPISELLCUDA(A, MATMPISELLCUDA, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@
  MatCreateSELLCUDA - Creates a sparse matrix in SELL format.
  This matrix will ultimately be pushed down to NVIDIA GPUs.

  Collective

  Input Parameters:
+ comm  - MPI communicator
. m     - number of local rows (or `PETSC_DECIDE` to have it calculated if `M` is given)
          This value should be the same as the local size used in creating the
          y vector for the matrix-vector product y = Ax.
. n     - number of local columns (or `PETSC_DECIDE` to have it calculated if `N` is given)
          This value should be the same as the local size used in creating the
          x vector for the matrix-vector product y = Ax. For square matrices `n` is almost always `m`.
. M     - number of global rows (or `PETSC_DETERMINE` to have it calculated if `m` is given)
. N     - number of global columns (or `PETSC_DETERMINE` to have it calculated if `n` is given)
. d_nz  - number of nonzeros per row in the DIAGONAL portion of the local submatrix
          (same value is used for all local rows)
. d_nnz - array containing the number of nonzeros in the various rows of the
          DIAGONAL portion of the local submatrix (possibly different for each row)
          or `NULL`, if `d_nz` is used to specify the nonzero structure.
          The size of this array is equal to the number of local rows, i.e. `m`.
          For matrices you plan to factor you must leave room for the diagonal entry and
          put in the entry even if it is zero.
. o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of the local
          submatrix (same value is used for all local rows)
- o_nnz - array containing the number of nonzeros in the various rows of the
          OFF-DIAGONAL portion of the local submatrix (possibly different for
          each row) or `NULL`, if `o_nz` is used to specify the nonzero
          structure. The size of this array is equal to the number
          of local rows, i.e. `m`.

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  If `d_nnz` (`o_nnz`) is given then `d_nz` (`o_nz`) is ignored.

  Specify the preallocated storage with either `nz` or `nnz` (not both).
  Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
  allocation.

.seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MATMPISELLCUDA`, `MATSELLCUDA`
@*/
PetscErrorCode MatCreateSELLCUDA(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
{
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, M, N));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  if (size > 1) {
    PetscCall(MatSetType(*A, MATMPISELLCUDA));
    PetscCall(MatMPISELLSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
  } else {
    PetscCall(MatSetType(*A, MATSEQSELLCUDA));
    PetscCall(MatSeqSELLSetPreallocation(*A, d_nz, d_nnz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
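/*
  Illustrative sketch: direct creation with MatCreateSELLCUDA(). The sizes are
  hypothetical: 4 local rows and columns, at most 5 nonzeros per row in the
  diagonal block and 2 in the off-diagonal block.

    Mat A;
    PetscCall(MatCreateSELLCUDA(PETSC_COMM_WORLD, 4, 4, PETSC_DETERMINE, PETSC_DETERMINE, 5, NULL, 2, NULL, &A));
*/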
/*MC
  MATSELLCUDA - "sellcuda" = "mpisellcuda" - A matrix type to be used for sparse matrices.

  Sliced ELLPACK matrix type whose data resides on NVIDIA GPUs.

  This matrix type is identical to `MATSEQSELLCUDA` when constructed with a single process communicator,
  and `MATMPISELLCUDA` otherwise. As a result, for single process communicators,
  `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
  for communicators controlling multiple processes. It is recommended that you call both of
  the above preallocation routines for simplicity.

  Options Database Key:
. -mat_type mpisellcuda - sets the matrix type to `MATMPISELLCUDA` during a call to `MatSetFromOptions()`

  Level: beginner

.seealso: `MatCreateSELLCUDA()`, `MATSEQSELLCUDA`, `MatCreateSeqSELLCUDA()`
M*/
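/*
  Illustrative sketch: selecting this type from the options database instead of
  hard-coding it; run the program with -mat_type sellcuda (or mpisellcuda).
  The global size 100 is hypothetical.

    Mat A;
    PetscCall(MatCreate(PETSC_COMM_WORLD, &A));
    PetscCall(MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, 100, 100));
    PetscCall(MatSetFromOptions(A));
*/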