Actual source code: mpisellcuda.cu
#include <petscconf.h>
#include <petscdevice.h>
#include <../src/mat/impls/sell/mpi/mpisell.h>
/* Preallocate the diagonal (A) and off-diagonal (B) blocks as sequential SELL CUDA matrices */
static PetscErrorCode MatMPISELLSetPreallocation_MPISELLCUDA(Mat B, PetscInt d_rlenmax, const PetscInt d_rlen[], PetscInt o_rlenmax, const PetscInt o_rlen[])
{
  Mat_MPISELL *b = (Mat_MPISELL *)B->data;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));

  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQSELLCUDA matrices. */
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
    PetscCall(MatBindToCPU(b->A, B->boundtocpu));
    PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
    PetscCall(MatSetType(b->A, MATSEQSELLCUDA));
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
    PetscCall(MatBindToCPU(b->B, B->boundtocpu));
    PetscCall(MatSetSizes(b->B, B->rmap->n, B->cmap->N, B->rmap->n, B->cmap->N));
    PetscCall(MatSetType(b->B, MATSEQSELLCUDA));
  }
  PetscCall(MatSeqSELLSetPreallocation(b->A, d_rlenmax, d_rlen));
  PetscCall(MatSeqSELLSetPreallocation(b->B, o_rlenmax, o_rlen));
  B->preallocated  = PETSC_TRUE;
  B->was_assembled = PETSC_FALSE;
  B->assembled     = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
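/*
  Illustrative sketch (not part of the original source): typical user-side
  preallocation of a MATMPISELLCUDA matrix. The local sizes and row-length
  arrays below are hypothetical.

    Mat      A;
    PetscInt d_rlen[3] = {3, 3, 2}; // nonzeros per local row, diagonal block
    PetscInt o_rlen[3] = {1, 0, 1}; // nonzeros per local row, off-diagonal block

    PetscCall(MatCreate(PETSC_COMM_WORLD, &A));
    PetscCall(MatSetSizes(A, 3, 3, PETSC_DETERMINE, PETSC_DETERMINE));
    PetscCall(MatSetType(A, MATMPISELLCUDA));
    PetscCall(MatMPISELLSetPreallocation(A, PETSC_DEFAULT, d_rlen, PETSC_DEFAULT, o_rlen));
*/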
static PetscErrorCode MatMult_MPISELLCUDA(Mat A, Vec xx, Vec yy)
{
  Mat_MPISELL *a = (Mat_MPISELL *)A->data;
  PetscInt     nt;

  PetscFunctionBegin;
  PetscCall(VecGetLocalSize(xx, &nt));
  PetscCheck(nt == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")", A->cmap->n, nt);
  /* Overlap the scatter of ghost values with the diagonal-block product */
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->mult)(a->A, xx, yy));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
  PetscFunctionReturn(PETSC_SUCCESS);
}
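/*
  Illustrative sketch (assumed usage, not from the original source): once the
  matrix is assembled, vectors obtained with MatCreateVecs() inherit the
  matrix's default vector type (VECCUDA here), so MatMult() runs on the GPU.

    Vec x, y;
    PetscCall(MatCreateVecs(A, &x, &y));
    PetscCall(VecSet(x, 1.0));
    PetscCall(MatMult(A, x, y));
*/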
static PetscErrorCode MatZeroEntries_MPISELLCUDA(Mat A)
{
  Mat_MPISELL *l = (Mat_MPISELL *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(l->A));
  PetscCall(MatZeroEntries(l->B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
static PetscErrorCode MatMultAdd_MPISELLCUDA(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_MPISELL *a = (Mat_MPISELL *)A->data;
  PetscInt     nt;

  PetscFunctionBegin;
  PetscCall(VecGetLocalSize(xx, &nt));
  PetscCheck(nt == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")", A->cmap->n, nt);
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
  PetscFunctionReturn(PETSC_SUCCESS);
}
static PetscErrorCode MatMultTranspose_MPISELLCUDA(Mat A, Vec xx, Vec yy)
{
  Mat_MPISELL *a = (Mat_MPISELL *)A->data;
  PetscInt     nt;

  PetscFunctionBegin;
  PetscCall(VecGetLocalSize(xx, &nt));
  PetscCheck(nt == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")", A->rmap->n, nt);
  PetscUseTypeMethod(a->B, multtranspose, xx, a->lvec);
  PetscUseTypeMethod(a->A, multtranspose, xx, yy);
  PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
static PetscErrorCode MatSetFromOptions_MPISELLCUDA(Mat, PetscOptionItems *)
{
  return PETSC_SUCCESS;
}
static PetscErrorCode MatAssemblyEnd_MPISELLCUDA(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPISELL(A, mode));
  /* On the first final assembly, move the local scatter vector to the GPU */
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) PetscCall(VecSetType(((Mat_MPISELL *)A->data)->lvec, VECSEQCUDA));
  PetscFunctionReturn(PETSC_SUCCESS);
}
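/*
  Sketch of the assembly sequence that reaches MatAssemblyEnd_MPISELLCUDA
  (row, col, and the value are hypothetical); on the first MAT_FINAL_ASSEMBLY
  the local scatter vector lvec is retyped to VECSEQCUDA.

    PetscCall(MatSetValue(A, row, col, 1.0, INSERT_VALUES));
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
*/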
static PetscErrorCode MatDestroy_MPISELLCUDA(Mat A)
{
  PetscFunctionBegin;
  PetscCall(MatDestroy_MPISELL(A));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPISELLSetPreallocation_C", NULL));
  PetscFunctionReturn(PETSC_SUCCESS);
}
PETSC_INTERN PetscErrorCode MatConvert_MPISELL_MPISELLCUDA(Mat B, MatType, MatReuse reuse, Mat *newmat)
{
  Mat_MPISELL *a;
  Mat          A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
  A             = *newmat;
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));

  /* Retype the existing sub-blocks and the local scatter vector for the GPU */
  a = (Mat_MPISELL *)A->data;
  if (a->A) PetscCall(MatSetType(a->A, MATSEQSELLCUDA));
  if (a->B) PetscCall(MatSetType(a->B, MATSEQSELLCUDA));
  if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));

  A->ops->assemblyend    = MatAssemblyEnd_MPISELLCUDA;
  A->ops->mult           = MatMult_MPISELLCUDA;
  A->ops->multadd        = MatMultAdd_MPISELLCUDA;
  A->ops->multtranspose  = MatMultTranspose_MPISELLCUDA;
  A->ops->setfromoptions = MatSetFromOptions_MPISELLCUDA;
  A->ops->destroy        = MatDestroy_MPISELLCUDA;
  A->ops->zeroentries    = MatZeroEntries_MPISELLCUDA;

  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPISELLCUDA));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPISELLSetPreallocation_C", MatMPISELLSetPreallocation_MPISELLCUDA));
  PetscFunctionReturn(PETSC_SUCCESS);
}
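/*
  Illustrative sketch (hypothetical matrix names): converting an existing host
  MPISELL matrix to the CUDA type, either into a new matrix or in place.

    Mat Agpu;
    PetscCall(MatConvert(Ahost, MATMPISELLCUDA, MAT_INITIAL_MATRIX, &Agpu));
    PetscCall(MatConvert(Ahost, MATMPISELLCUDA, MAT_INPLACE_MATRIX, &Ahost));
*/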
PETSC_EXTERN PetscErrorCode MatCreate_MPISELLCUDA(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  PetscCall(MatCreate_MPISELL(A));
  PetscCall(MatConvert_MPISELL_MPISELLCUDA(A, MATMPISELLCUDA, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@
  MatCreateSELLCUDA - Creates a sparse matrix in SELL format.
  This matrix will ultimately be pushed down to NVIDIA GPUs.

  Collective

  Input Parameters:
+ comm  - MPI communicator
. m     - number of local rows (or `PETSC_DECIDE` to have it calculated if `M` is given)
          This value should be the same as the local size used in creating the
          y vector for the matrix-vector product y = Ax.
. n     - number of local columns (or `PETSC_DECIDE` to have it calculated if `N` is given)
          This value should be the same as the local size used in creating the
          x vector for the matrix-vector product y = Ax. For square matrices `n` is almost always `m`.
. M     - number of global rows (or `PETSC_DETERMINE` to have it calculated if `m` is given)
. N     - number of global columns (or `PETSC_DETERMINE` to have it calculated if `n` is given)
. d_nz  - number of nonzeros per row in the DIAGONAL portion of the local submatrix
          (same value is used for all local rows)
. d_nnz - array containing the number of nonzeros in the various rows of the
          DIAGONAL portion of the local submatrix (possibly different for each row)
          or `NULL`, if `d_nz` is used to specify the nonzero structure.
          The size of this array is equal to the number of local rows, i.e. `m`.
          For matrices you plan to factor you must leave room for the diagonal entry and
          put in the entry even if it is zero.
. o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of the local
          submatrix (same value is used for all local rows)
- o_nnz - array containing the number of nonzeros in the various rows of the
          OFF-DIAGONAL portion of the local submatrix (possibly different for
          each row) or `NULL`, if `o_nz` is used to specify the nonzero
          structure. The size of this array is equal to the number
          of local rows, i.e. `m`.

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  If `d_nnz` (`o_nnz`) is given then `d_nz` (`o_nz`) is ignored.

  Specify the preallocated storage with either `nz` or `nnz` (not both).
  Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
  allocation.

.seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MATMPISELLCUDA`, `MATSELLCUDA`
@*/
PetscErrorCode MatCreateSELLCUDA(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
{
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, M, N));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  if (size > 1) {
    PetscCall(MatSetType(*A, MATMPISELLCUDA));
    PetscCall(MatMPISELLSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
  } else {
    PetscCall(MatSetType(*A, MATSEQSELLCUDA));
    PetscCall(MatSeqSELLSetPreallocation(*A, d_nz, d_nnz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
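/*
  Illustrative sketch: direct creation with MatCreateSELLCUDA(). The sizes are
  hypothetical: 4 local rows and columns, at most 5 nonzeros per row in the
  diagonal block and 2 in the off-diagonal block.

    Mat A;
    PetscCall(MatCreateSELLCUDA(PETSC_COMM_WORLD, 4, 4, PETSC_DETERMINE, PETSC_DETERMINE, 5, NULL, 2, NULL, &A));
*/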
/*MC
  MATSELLCUDA - "sellcuda" = "mpisellcuda" - A matrix type to be used for sparse matrices.

  Sliced ELLPACK matrix type whose data resides on NVIDIA GPUs.

  This matrix type is identical to `MATSEQSELLCUDA` when constructed with a single process communicator,
  and `MATMPISELLCUDA` otherwise. As a result, for single process communicators,
  `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
  for communicators controlling multiple processes. It is recommended that you call both of
  the above preallocation routines for simplicity.

  Options Database Key:
. -mat_type mpisellcuda - sets the matrix type to `MATMPISELLCUDA` during a call to `MatSetFromOptions()`

  Level: beginner

.seealso: `MatCreateSELLCUDA()`, `MATSEQSELLCUDA`, `MatCreateSeqSELLCUDA()`
M*/
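/*
  Illustrative sketch: selecting this type from the options database instead of
  hard-coding it; run the program with -mat_type sellcuda (or mpisellcuda).
  The global size 100 is hypothetical.

    Mat A;
    PetscCall(MatCreate(PETSC_COMM_WORLD, &A));
    PetscCall(MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, 100, 100));
    PetscCall(MatSetFromOptions(A));
*/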