Actual source code: sell.c
1: /*
2: Defines the basic matrix operations for the SELL matrix storage format.
3: */
4: #include <../src/mat/impls/sell/seq/sell.h>
5: #include <petscblaslapack.h>
6: #include <petsc/private/kernels/blocktranspose.h>
8: static PetscBool cited = PETSC_FALSE;
9: static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
10: " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
11: " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
12: " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
13: " year = 2018\n"
14: "}\n";
16: #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
18: #include <immintrin.h>
20: #if !defined(_MM_SCALE_8)
21: #define _MM_SCALE_8 8
22: #endif
24: #if defined(__AVX512F__)
25: /* these do not work
26: vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
27: vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
28: */
29: #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
30: /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
31: vec_idx = _mm256_loadu_si256((__m256i const *)acolidx); \
32: vec_vals = _mm512_loadu_pd(aval); \
33: vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
34: vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y)
35: #elif defined(__AVX2__) && defined(__FMA__)
36: #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
37: vec_vals = _mm256_loadu_pd(aval); \
38: vec_idx = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
39: vec_x = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
40: vec_y = _mm256_fmadd_pd(vec_x, vec_vals, vec_y)
41: #endif
42: #endif /* PETSC_HAVE_IMMINTRIN_H */
44: /*@C
45: MatSeqSELLSetPreallocation - For good matrix assembly performance
46: the user should preallocate the matrix storage by setting the parameter `nz`
47: (or the array `nnz`).
49: Collective
51: Input Parameters:
52: + B - The `MATSEQSELL` matrix
53: . rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided
54: - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
56: Level: intermediate
58: Notes:
59: Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
60: Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
61: allocation.
63: You can call `MatGetInfo()` to get information on how effective the preallocation was;
64: for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
65: You can also run with the option `-info` and look for messages with the string
66: malloc in them to see if additional memory allocation was needed.
68: Developer Notes:
69: Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix
70: entries or columns indices.
72: The maximum number of nonzeos in any row should be as accurate as possible.
73: If it is underestimated, you will get bad performance due to reallocation
74: (`MatSeqXSELLReallocateSELL()`).
76: .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
77: @*/
78: PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[])
79: {
80: PetscFunctionBegin;
83: PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
84: PetscFunctionReturn(PETSC_SUCCESS);
85: }
87: PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[])
88: {
89: Mat_SeqSELL *b;
90: PetscInt i, j, totalslices;
91: #if defined(PETSC_HAVE_CUDA)
92: PetscInt rlenmax = 0;
93: #endif
94: PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;
96: PetscFunctionBegin;
97: if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
98: if (maxallocrow == MAT_SKIP_ALLOCATION) {
99: skipallocation = PETSC_TRUE;
100: maxallocrow = 0;
101: }
103: PetscCall(PetscLayoutSetUp(B->rmap));
104: PetscCall(PetscLayoutSetUp(B->cmap));
106: /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
107: if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
108: PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow);
109: if (rlen) {
110: for (i = 0; i < B->rmap->n; i++) {
111: PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]);
112: PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n);
113: }
114: }
116: B->preallocated = PETSC_TRUE;
118: b = (Mat_SeqSELL *)B->data;
120: if (!b->sliceheight) { /* not set yet */
121: #if defined(PETSC_HAVE_CUDA)
122: b->sliceheight = 16;
123: #else
124: b->sliceheight = 8;
125: #endif
126: }
127: totalslices = PetscCeilInt(B->rmap->n, b->sliceheight);
128: b->totalslices = totalslices;
129: if (!skipallocation) {
130: if (B->rmap->n % b->sliceheight) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of the slice height (value %" PetscInt_FMT ")\n", B->rmap->n));
132: if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
133: PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx));
134: }
135: if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
136: if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
137: else if (maxallocrow < 0) maxallocrow = 1;
138: #if defined(PETSC_HAVE_CUDA)
139: rlenmax = maxallocrow;
140: /* Pad the slice to DEVICE_MEM_ALIGN */
141: while (b->sliceheight * maxallocrow % DEVICE_MEM_ALIGN) maxallocrow++;
142: #endif
143: for (i = 0; i <= totalslices; i++) b->sliidx[i] = b->sliceheight * i * maxallocrow;
144: } else {
145: #if defined(PETSC_HAVE_CUDA)
146: PetscInt mul = DEVICE_MEM_ALIGN / b->sliceheight;
147: #endif
148: maxallocrow = 0;
149: b->sliidx[0] = 0;
150: for (i = 1; i < totalslices; i++) {
151: b->sliidx[i] = 0;
152: for (j = 0; j < b->sliceheight; j++) { b->sliidx[i] = PetscMax(b->sliidx[i], rlen[b->sliceheight * (i - 1) + j]); }
153: #if defined(PETSC_HAVE_CUDA)
154: rlenmax = PetscMax(b->sliidx[i], rlenmax);
155: /* Pad the slice to DEVICE_MEM_ALIGN */
156: b->sliidx[i] = ((b->sliidx[i] - 1) / mul + 1) * mul;
157: #endif
158: maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
159: PetscCall(PetscIntSumError(b->sliidx[i - 1], b->sliceheight * b->sliidx[i], &b->sliidx[i]));
160: }
161: /* last slice */
162: b->sliidx[totalslices] = 0;
163: for (j = b->sliceheight * (totalslices - 1); j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
164: #if defined(PETSC_HAVE_CUDA)
165: rlenmax = PetscMax(b->sliidx[i], rlenmax);
166: b->sliidx[totalslices] = ((b->sliidx[totalslices] - 1) / mul + 1) * mul;
167: #endif
168: maxallocrow = PetscMax(b->sliidx[totalslices], maxallocrow);
169: b->sliidx[totalslices] = b->sliidx[totalslices - 1] + b->sliceheight * b->sliidx[totalslices];
170: }
172: /* allocate space for val, colidx, rlen */
173: /* FIXME: should B's old memory be unlogged? */
174: PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx));
175: /* FIXME: assuming an element of the bit array takes 8 bits */
176: PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx));
177: /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
178: PetscCall(PetscCalloc1(b->sliceheight * totalslices, &b->rlen));
180: b->singlemalloc = PETSC_TRUE;
181: b->free_val = PETSC_TRUE;
182: b->free_colidx = PETSC_TRUE;
183: } else {
184: b->free_val = PETSC_FALSE;
185: b->free_colidx = PETSC_FALSE;
186: }
188: b->nz = 0;
189: b->maxallocrow = maxallocrow;
190: #if defined(PETSC_HAVE_CUDA)
191: b->rlenmax = rlenmax;
192: #else
193: b->rlenmax = maxallocrow;
194: #endif
195: b->maxallocmat = b->sliidx[totalslices];
196: B->info.nz_unneeded = (double)b->maxallocmat;
197: if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
198: PetscFunctionReturn(PETSC_SUCCESS);
199: }
201: static PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
202: {
203: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
204: PetscInt shift;
206: PetscFunctionBegin;
207: PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
208: if (nz) *nz = a->rlen[row];
209: shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight);
210: if (!a->getrowcols) { PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); }
211: if (idx) {
212: PetscInt j;
213: for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + a->sliceheight * j];
214: *idx = a->getrowcols;
215: }
216: if (v) {
217: PetscInt j;
218: for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + a->sliceheight * j];
219: *v = a->getrowvals;
220: }
221: PetscFunctionReturn(PETSC_SUCCESS);
222: }
224: static PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
225: {
226: PetscFunctionBegin;
227: PetscFunctionReturn(PETSC_SUCCESS);
228: }
230: PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
231: {
232: Mat B;
233: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
234: PetscInt i;
236: PetscFunctionBegin;
237: if (reuse == MAT_REUSE_MATRIX) {
238: B = *newmat;
239: PetscCall(MatZeroEntries(B));
240: } else {
241: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
242: PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
243: PetscCall(MatSetType(B, MATSEQAIJ));
244: PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen));
245: }
247: for (i = 0; i < A->rmap->n; i++) {
248: PetscInt nz = 0, *cols = NULL;
249: PetscScalar *vals = NULL;
251: PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals));
252: PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES));
253: PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals));
254: }
256: PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
257: PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
258: B->rmap->bs = A->rmap->bs;
260: if (reuse == MAT_INPLACE_MATRIX) {
261: PetscCall(MatHeaderReplace(A, &B));
262: } else {
263: *newmat = B;
264: }
265: PetscFunctionReturn(PETSC_SUCCESS);
266: }
268: #include <../src/mat/impls/aij/seq/aij.h>
270: PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
271: {
272: Mat B;
273: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
274: PetscInt *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
275: const PetscInt *cols;
276: const PetscScalar *vals;
278: PetscFunctionBegin;
280: if (reuse == MAT_REUSE_MATRIX) {
281: B = *newmat;
282: } else {
283: if (PetscDefined(USE_DEBUG) || !a->ilen) {
284: PetscCall(PetscMalloc1(m, &rowlengths));
285: for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i];
286: }
287: if (PetscDefined(USE_DEBUG) && a->ilen) {
288: PetscBool eq;
289: PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq));
290: PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect");
291: PetscCall(PetscFree(rowlengths));
292: rowlengths = a->ilen;
293: } else if (a->ilen) rowlengths = a->ilen;
294: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
295: PetscCall(MatSetSizes(B, m, n, m, n));
296: PetscCall(MatSetType(B, MATSEQSELL));
297: PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths));
298: if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
299: }
301: for (row = 0; row < m; row++) {
302: PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
303: PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES));
304: PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
305: }
306: PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
307: PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
308: B->rmap->bs = A->rmap->bs;
310: if (reuse == MAT_INPLACE_MATRIX) {
311: PetscCall(MatHeaderReplace(A, &B));
312: } else {
313: *newmat = B;
314: }
315: PetscFunctionReturn(PETSC_SUCCESS);
316: }
318: PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy)
319: {
320: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
321: PetscScalar *y;
322: const PetscScalar *x;
323: const MatScalar *aval = a->val;
324: PetscInt totalslices = a->totalslices;
325: const PetscInt *acolidx = a->colidx;
326: PetscInt i, j;
327: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
328: __m512d vec_x, vec_y, vec_vals;
329: __m256i vec_idx;
330: __mmask8 mask;
331: __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
332: __m256i vec_idx2, vec_idx3, vec_idx4;
333: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
334: __m128i vec_idx;
335: __m256d vec_x, vec_y, vec_y2, vec_vals;
336: MatScalar yval;
337: PetscInt r, rows_left, row, nnz_in_row;
338: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
339: __m128d vec_x_tmp;
340: __m256d vec_x, vec_y, vec_y2, vec_vals;
341: MatScalar yval;
342: PetscInt r, rows_left, row, nnz_in_row;
343: #else
344: PetscInt k, sliceheight = a->sliceheight;
345: PetscScalar *sum;
346: #endif
348: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
349: #pragma disjoint(*x, *y, *aval)
350: #endif
352: PetscFunctionBegin;
353: PetscCall(VecGetArrayRead(xx, &x));
354: PetscCall(VecGetArray(yy, &y));
355: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
356: PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
357: for (i = 0; i < totalslices; i++) { /* loop over slices */
358: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
359: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
361: vec_y = _mm512_setzero_pd();
362: vec_y2 = _mm512_setzero_pd();
363: vec_y3 = _mm512_setzero_pd();
364: vec_y4 = _mm512_setzero_pd();
366: j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
367: switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
368: case 3:
369: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
370: acolidx += 8;
371: aval += 8;
372: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
373: acolidx += 8;
374: aval += 8;
375: AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
376: acolidx += 8;
377: aval += 8;
378: j += 3;
379: break;
380: case 2:
381: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
382: acolidx += 8;
383: aval += 8;
384: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
385: acolidx += 8;
386: aval += 8;
387: j += 2;
388: break;
389: case 1:
390: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
391: acolidx += 8;
392: aval += 8;
393: j += 1;
394: break;
395: }
396: #pragma novector
397: for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
398: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
399: acolidx += 8;
400: aval += 8;
401: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
402: acolidx += 8;
403: aval += 8;
404: AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
405: acolidx += 8;
406: aval += 8;
407: AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
408: acolidx += 8;
409: aval += 8;
410: }
412: vec_y = _mm512_add_pd(vec_y, vec_y2);
413: vec_y = _mm512_add_pd(vec_y, vec_y3);
414: vec_y = _mm512_add_pd(vec_y, vec_y4);
415: if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
416: mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
417: _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
418: } else {
419: _mm512_storeu_pd(&y[8 * i], vec_y);
420: }
421: }
422: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
423: PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
424: for (i = 0; i < totalslices; i++) { /* loop over full slices */
425: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
426: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
428: /* last slice may have padding rows. Don't use vectorization. */
429: if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
430: rows_left = A->rmap->n - 8 * i;
431: for (r = 0; r < rows_left; ++r) {
432: yval = (MatScalar)0;
433: row = 8 * i + r;
434: nnz_in_row = a->rlen[row];
435: for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
436: y[row] = yval;
437: }
438: break;
439: }
441: vec_y = _mm256_setzero_pd();
442: vec_y2 = _mm256_setzero_pd();
444: /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
445: #pragma novector
446: #pragma unroll(2)
447: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
448: AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
449: aval += 4;
450: acolidx += 4;
451: AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
452: aval += 4;
453: acolidx += 4;
454: }
456: _mm256_storeu_pd(y + i * 8, vec_y);
457: _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
458: }
459: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
460: PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
461: for (i = 0; i < totalslices; i++) { /* loop over full slices */
462: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
463: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
465: vec_y = _mm256_setzero_pd();
466: vec_y2 = _mm256_setzero_pd();
468: /* last slice may have padding rows. Don't use vectorization. */
469: if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
470: rows_left = A->rmap->n - 8 * i;
471: for (r = 0; r < rows_left; ++r) {
472: yval = (MatScalar)0;
473: row = 8 * i + r;
474: nnz_in_row = a->rlen[row];
475: for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
476: y[row] = yval;
477: }
478: break;
479: }
481: /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
482: #pragma novector
483: #pragma unroll(2)
484: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
485: vec_vals = _mm256_loadu_pd(aval);
486: vec_x_tmp = _mm_setzero_pd();
487: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
488: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
489: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
490: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
491: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
492: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
493: vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
494: aval += 4;
496: vec_vals = _mm256_loadu_pd(aval);
497: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
498: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
499: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
500: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
501: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
502: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
503: vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
504: aval += 4;
505: }
507: _mm256_storeu_pd(y + i * 8, vec_y);
508: _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
509: }
510: #else
511: PetscCall(PetscMalloc1(sliceheight, &sum));
512: for (i = 0; i < totalslices; i++) { /* loop over slices */
513: for (j = 0; j < sliceheight; j++) {
514: sum[j] = 0.0;
515: for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
516: }
517: if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { /* if last slice has padding rows */
518: for (j = 0; j < (A->rmap->n % sliceheight); j++) y[sliceheight * i + j] = sum[j];
519: } else {
520: for (j = 0; j < sliceheight; j++) y[sliceheight * i + j] = sum[j];
521: }
522: }
523: PetscCall(PetscFree(sum));
524: #endif
526: PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */
527: PetscCall(VecRestoreArrayRead(xx, &x));
528: PetscCall(VecRestoreArray(yy, &y));
529: PetscFunctionReturn(PETSC_SUCCESS);
530: }
532: #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
533: PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz)
534: {
535: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
536: PetscScalar *y, *z;
537: const PetscScalar *x;
538: const MatScalar *aval = a->val;
539: PetscInt totalslices = a->totalslices;
540: const PetscInt *acolidx = a->colidx;
541: PetscInt i, j;
542: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
543: __m512d vec_x, vec_y, vec_vals;
544: __m256i vec_idx;
545: __mmask8 mask = 0;
546: __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
547: __m256i vec_idx2, vec_idx3, vec_idx4;
548: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
549: __m128d vec_x_tmp;
550: __m256d vec_x, vec_y, vec_y2, vec_vals;
551: MatScalar yval;
552: PetscInt r, row, nnz_in_row;
553: #else
554: PetscInt k, sliceheight = a->sliceheight;
555: PetscScalar *sum;
556: #endif
558: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
559: #pragma disjoint(*x, *y, *aval)
560: #endif
562: PetscFunctionBegin;
563: if (!a->nz) {
564: PetscCall(VecCopy(yy, zz));
565: PetscFunctionReturn(PETSC_SUCCESS);
566: }
567: PetscCall(VecGetArrayRead(xx, &x));
568: PetscCall(VecGetArrayPair(yy, zz, &y, &z));
569: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
570: PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
571: for (i = 0; i < totalslices; i++) { /* loop over slices */
572: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
573: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
575: if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
576: mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
577: vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
578: } else {
579: vec_y = _mm512_loadu_pd(&y[8 * i]);
580: }
581: vec_y2 = _mm512_setzero_pd();
582: vec_y3 = _mm512_setzero_pd();
583: vec_y4 = _mm512_setzero_pd();
585: j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
586: switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
587: case 3:
588: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
589: acolidx += 8;
590: aval += 8;
591: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
592: acolidx += 8;
593: aval += 8;
594: AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
595: acolidx += 8;
596: aval += 8;
597: j += 3;
598: break;
599: case 2:
600: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
601: acolidx += 8;
602: aval += 8;
603: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
604: acolidx += 8;
605: aval += 8;
606: j += 2;
607: break;
608: case 1:
609: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
610: acolidx += 8;
611: aval += 8;
612: j += 1;
613: break;
614: }
615: #pragma novector
616: for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
617: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
618: acolidx += 8;
619: aval += 8;
620: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
621: acolidx += 8;
622: aval += 8;
623: AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
624: acolidx += 8;
625: aval += 8;
626: AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
627: acolidx += 8;
628: aval += 8;
629: }
631: vec_y = _mm512_add_pd(vec_y, vec_y2);
632: vec_y = _mm512_add_pd(vec_y, vec_y3);
633: vec_y = _mm512_add_pd(vec_y, vec_y4);
634: if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
635: _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
636: } else {
637: _mm512_storeu_pd(&z[8 * i], vec_y);
638: }
639: }
640: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
641: PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
642: for (i = 0; i < totalslices; i++) { /* loop over full slices */
643: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
644: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
646: /* last slice may have padding rows. Don't use vectorization. */
647: if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
648: for (r = 0; r < (A->rmap->n & 0x07); ++r) {
649: row = 8 * i + r;
650: yval = (MatScalar)0.0;
651: nnz_in_row = a->rlen[row];
652: for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
653: z[row] = y[row] + yval;
654: }
655: break;
656: }
658: vec_y = _mm256_loadu_pd(y + 8 * i);
659: vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);
661: /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
662: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
663: vec_vals = _mm256_loadu_pd(aval);
664: vec_x_tmp = _mm_setzero_pd();
665: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
666: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
667: vec_x = _mm256_setzero_pd();
668: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
669: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
670: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
671: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
672: vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
673: aval += 4;
675: vec_vals = _mm256_loadu_pd(aval);
676: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
677: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
678: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
679: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
680: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
681: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
682: vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
683: aval += 4;
684: }
686: _mm256_storeu_pd(z + i * 8, vec_y);
687: _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
688: }
689: #else
690: PetscCall(PetscMalloc1(sliceheight, &sum));
691: for (i = 0; i < totalslices; i++) { /* loop over slices */
692: for (j = 0; j < sliceheight; j++) {
693: sum[j] = 0.0;
694: for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
695: }
696: if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
697: for (j = 0; j < (A->rmap->n % sliceheight); j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
698: } else {
699: for (j = 0; j < sliceheight; j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
700: }
701: }
702: PetscCall(PetscFree(sum));
703: #endif
705: PetscCall(PetscLogFlops(2.0 * a->nz));
706: PetscCall(VecRestoreArrayRead(xx, &x));
707: PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
708: PetscFunctionReturn(PETSC_SUCCESS);
709: }
711: PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy)
712: {
713: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
714: PetscScalar *y;
715: const PetscScalar *x;
716: const MatScalar *aval = a->val;
717: const PetscInt *acolidx = a->colidx;
718: PetscInt i, j, r, row, nnz_in_row, totalslices = a->totalslices, sliceheight = a->sliceheight;
720: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
721: #pragma disjoint(*x, *y, *aval)
722: #endif
724: PetscFunctionBegin;
725: if (A->symmetric == PETSC_BOOL3_TRUE) {
726: PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy));
727: PetscFunctionReturn(PETSC_SUCCESS);
728: }
729: if (zz != yy) PetscCall(VecCopy(zz, yy));
731: if (a->nz) {
732: PetscCall(VecGetArrayRead(xx, &x));
733: PetscCall(VecGetArray(yy, &y));
734: for (i = 0; i < a->totalslices; i++) { /* loop over slices */
735: if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
736: for (r = 0; r < (A->rmap->n % sliceheight); ++r) {
737: row = sliceheight * i + r;
738: nnz_in_row = a->rlen[row];
739: for (j = 0; j < nnz_in_row; ++j) y[acolidx[sliceheight * j + r]] += aval[sliceheight * j + r] * x[row];
740: }
741: break;
742: }
743: for (r = 0; r < sliceheight; ++r)
744: for (j = a->sliidx[i] + r; j < a->sliidx[i + 1]; j += sliceheight) y[acolidx[j]] += aval[j] * x[sliceheight * i + r];
745: }
746: PetscCall(PetscLogFlops(2.0 * a->nz));
747: PetscCall(VecRestoreArrayRead(xx, &x));
748: PetscCall(VecRestoreArray(yy, &y));
749: }
750: PetscFunctionReturn(PETSC_SUCCESS);
751: }
753: PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy)
754: {
755: PetscFunctionBegin;
756: if (A->symmetric == PETSC_BOOL3_TRUE) {
757: PetscCall(MatMult_SeqSELL(A, xx, yy));
758: } else {
759: PetscCall(VecSet(yy, 0.0));
760: PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy));
761: }
762: PetscFunctionReturn(PETSC_SUCCESS);
763: }
765: /*
766: Checks for missing diagonals
767: */
768: PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d)
769: {
770: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
771: PetscInt *diag, i;
773: PetscFunctionBegin;
774: *missing = PETSC_FALSE;
775: if (A->rmap->n > 0 && !(a->colidx)) {
776: *missing = PETSC_TRUE;
777: if (d) *d = 0;
778: PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n"));
779: } else {
780: diag = a->diag;
781: for (i = 0; i < A->rmap->n; i++) {
782: if (diag[i] == -1) {
783: *missing = PETSC_TRUE;
784: if (d) *d = i;
785: PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i));
786: break;
787: }
788: }
789: }
790: PetscFunctionReturn(PETSC_SUCCESS);
791: }
793: PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
794: {
795: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
796: PetscInt i, j, m = A->rmap->n, shift;
798: PetscFunctionBegin;
799: if (!a->diag) {
800: PetscCall(PetscMalloc1(m, &a->diag));
801: a->free_diag = PETSC_TRUE;
802: }
803: for (i = 0; i < m; i++) { /* loop over rows */
804: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
805: a->diag[i] = -1;
806: for (j = 0; j < a->rlen[i]; j++) {
807: if (a->colidx[shift + a->sliceheight * j] == i) {
808: a->diag[i] = shift + a->sliceheight * j;
809: break;
810: }
811: }
812: }
813: PetscFunctionReturn(PETSC_SUCCESS);
814: }
816: /*
817: Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
818: */
819: PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift)
820: {
821: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
822: PetscInt i, *diag, m = A->rmap->n;
823: MatScalar *val = a->val;
824: PetscScalar *idiag, *mdiag;
826: PetscFunctionBegin;
827: if (a->idiagvalid) PetscFunctionReturn(PETSC_SUCCESS);
828: PetscCall(MatMarkDiagonal_SeqSELL(A));
829: diag = a->diag;
830: if (!a->idiag) {
831: PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work));
832: val = a->val;
833: }
834: mdiag = a->mdiag;
835: idiag = a->idiag;
837: if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
838: for (i = 0; i < m; i++) {
839: mdiag[i] = val[diag[i]];
840: if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
841: PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i);
842: PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i));
843: A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
844: A->factorerror_zeropivot_value = 0.0;
845: A->factorerror_zeropivot_row = i;
846: }
847: idiag[i] = 1.0 / val[diag[i]];
848: }
849: PetscCall(PetscLogFlops(m));
850: } else {
851: for (i = 0; i < m; i++) {
852: mdiag[i] = val[diag[i]];
853: idiag[i] = omega / (fshift + val[diag[i]]);
854: }
855: PetscCall(PetscLogFlops(2.0 * m));
856: }
857: a->idiagvalid = PETSC_TRUE;
858: PetscFunctionReturn(PETSC_SUCCESS);
859: }
861: PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
862: {
863: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
865: PetscFunctionBegin;
866: PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
867: PetscCall(MatSeqSELLInvalidateDiagonal(A));
868: PetscFunctionReturn(PETSC_SUCCESS);
869: }
871: PetscErrorCode MatDestroy_SeqSELL(Mat A)
872: {
873: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
875: PetscFunctionBegin;
876: PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz));
877: PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx));
878: PetscCall(ISDestroy(&a->row));
879: PetscCall(ISDestroy(&a->col));
880: PetscCall(PetscFree(a->diag));
881: PetscCall(PetscFree(a->rlen));
882: PetscCall(PetscFree(a->sliidx));
883: PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work));
884: PetscCall(PetscFree(a->solve_work));
885: PetscCall(ISDestroy(&a->icol));
886: PetscCall(PetscFree(a->saved_values));
887: PetscCall(PetscFree2(a->getrowcols, a->getrowvals));
888: PetscCall(PetscFree(A->data));
889: #if defined(PETSC_HAVE_CUDA)
890: PetscCall(PetscFree(a->chunk_slice_map));
891: #endif
893: PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
894: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
895: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
896: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL));
897: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL));
898: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL));
899: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL));
900: #if defined(PETSC_HAVE_CUDA)
901: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellcuda_C", NULL));
902: #endif
903: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetFillRatio_C", NULL));
904: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetMaxSliceWidth_C", NULL));
905: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetAvgSliceWidth_C", NULL));
906: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetVarSliceSize_C", NULL));
907: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetSliceHeight_C", NULL));
908: PetscFunctionReturn(PETSC_SUCCESS);
909: }
911: PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg)
912: {
913: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
915: PetscFunctionBegin;
916: switch (op) {
917: case MAT_ROW_ORIENTED:
918: a->roworiented = flg;
919: break;
920: case MAT_KEEP_NONZERO_PATTERN:
921: a->keepnonzeropattern = flg;
922: break;
923: case MAT_NEW_NONZERO_LOCATIONS:
924: a->nonew = (flg ? 0 : 1);
925: break;
926: case MAT_NEW_NONZERO_LOCATION_ERR:
927: a->nonew = (flg ? -1 : 0);
928: break;
929: case MAT_NEW_NONZERO_ALLOCATION_ERR:
930: a->nonew = (flg ? -2 : 0);
931: break;
932: case MAT_UNUSED_NONZERO_LOCATION_ERR:
933: a->nounused = (flg ? -1 : 0);
934: break;
935: case MAT_FORCE_DIAGONAL_ENTRIES:
936: case MAT_IGNORE_OFF_PROC_ENTRIES:
937: case MAT_USE_HASH_TABLE:
938: case MAT_SORTED_FULL:
939: PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op]));
940: break;
941: case MAT_SPD:
942: case MAT_SYMMETRIC:
943: case MAT_STRUCTURALLY_SYMMETRIC:
944: case MAT_HERMITIAN:
945: case MAT_SYMMETRY_ETERNAL:
946: case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
947: case MAT_SPD_ETERNAL:
948: /* These options are handled directly by MatSetOption() */
949: break;
950: default:
951: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
952: }
953: PetscFunctionReturn(PETSC_SUCCESS);
954: }
956: PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v)
957: {
958: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
959: PetscInt i, j, n, shift;
960: PetscScalar *x, zero = 0.0;
962: PetscFunctionBegin;
963: PetscCall(VecGetLocalSize(v, &n));
964: PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
966: if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
967: PetscInt *diag = a->diag;
968: PetscCall(VecGetArray(v, &x));
969: for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]];
970: PetscCall(VecRestoreArray(v, &x));
971: PetscFunctionReturn(PETSC_SUCCESS);
972: }
974: PetscCall(VecSet(v, zero));
975: PetscCall(VecGetArray(v, &x));
976: for (i = 0; i < n; i++) { /* loop over rows */
977: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
978: x[i] = 0;
979: for (j = 0; j < a->rlen[i]; j++) {
980: if (a->colidx[shift + a->sliceheight * j] == i) {
981: x[i] = a->val[shift + a->sliceheight * j];
982: break;
983: }
984: }
985: }
986: PetscCall(VecRestoreArray(v, &x));
987: PetscFunctionReturn(PETSC_SUCCESS);
988: }
990: PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr)
991: {
992: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
993: const PetscScalar *l, *r;
994: PetscInt i, j, m, n, row;
996: PetscFunctionBegin;
997: if (ll) {
998: /* The local size is used so that VecMPI can be passed to this routine
999: by MatDiagonalScale_MPISELL */
1000: PetscCall(VecGetLocalSize(ll, &m));
1001: PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1002: PetscCall(VecGetArrayRead(ll, &l));
1003: for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1004: if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
1005: for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
1006: if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= l[a->sliceheight * i + row];
1007: }
1008: } else {
1009: for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { a->val[j] *= l[a->sliceheight * i + row]; }
1010: }
1011: }
1012: PetscCall(VecRestoreArrayRead(ll, &l));
1013: PetscCall(PetscLogFlops(a->nz));
1014: }
1015: if (rr) {
1016: PetscCall(VecGetLocalSize(rr, &n));
1017: PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1018: PetscCall(VecGetArrayRead(rr, &r));
1019: for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1020: if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
1021: for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) % a->sliceheight)) {
1022: if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= r[a->colidx[j]];
1023: }
1024: } else {
1025: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]];
1026: }
1027: }
1028: PetscCall(VecRestoreArrayRead(rr, &r));
1029: PetscCall(PetscLogFlops(a->nz));
1030: }
1031: PetscCall(MatSeqSELLInvalidateDiagonal(A));
1032: #if defined(PETSC_HAVE_CUDA)
1033: if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
1034: #endif
1035: PetscFunctionReturn(PETSC_SUCCESS);
1036: }
1038: PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
1039: {
1040: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1041: PetscInt *cp, i, k, low, high, t, row, col, l;
1042: PetscInt shift;
1043: MatScalar *vp;
1045: PetscFunctionBegin;
1046: for (k = 0; k < m; k++) { /* loop over requested rows */
1047: row = im[k];
1048: if (row < 0) continue;
1049: PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1050: shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); /* starting index of the row */
1051: cp = a->colidx + shift; /* pointer to the row */
1052: vp = a->val + shift; /* pointer to the row */
1053: for (l = 0; l < n; l++) { /* loop over requested columns */
1054: col = in[l];
1055: if (col < 0) continue;
1056: PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1057: high = a->rlen[row];
1058: low = 0; /* assume unsorted */
1059: while (high - low > 5) {
1060: t = (low + high) / 2;
1061: if (*(cp + a->sliceheight * t) > col) high = t;
1062: else low = t;
1063: }
1064: for (i = low; i < high; i++) {
1065: if (*(cp + a->sliceheight * i) > col) break;
1066: if (*(cp + a->sliceheight * i) == col) {
1067: *v++ = *(vp + a->sliceheight * i);
1068: goto finished;
1069: }
1070: }
1071: *v++ = 0.0;
1072: finished:;
1073: }
1074: }
1075: PetscFunctionReturn(PETSC_SUCCESS);
1076: }
1078: static PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer)
1079: {
1080: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1081: PetscInt i, j, m = A->rmap->n, shift;
1082: const char *name;
1083: PetscViewerFormat format;
1085: PetscFunctionBegin;
1086: PetscCall(PetscViewerGetFormat(viewer, &format));
1087: if (format == PETSC_VIEWER_ASCII_MATLAB) {
1088: PetscInt nofinalvalue = 0;
1089: /*
1090: if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1091: nofinalvalue = 1;
1092: }
1093: */
1094: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1095: PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n));
1096: PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz));
1097: #if defined(PETSC_USE_COMPLEX)
1098: PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue));
1099: #else
1100: PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue));
1101: #endif
1102: PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n"));
1104: for (i = 0; i < m; i++) {
1105: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1106: for (j = 0; j < a->rlen[i]; j++) {
1107: #if defined(PETSC_USE_COMPLEX)
1108: PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1109: #else
1110: PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)a->val[shift + a->sliceheight * j]));
1111: #endif
1112: }
1113: }
1114: /*
1115: if (nofinalvalue) {
1116: #if defined(PETSC_USE_COMPLEX)
1117: PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1118: #else
1119: PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0));
1120: #endif
1121: }
1122: */
1123: PetscCall(PetscObjectGetName((PetscObject)A, &name));
1124: PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name));
1125: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1126: } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1127: PetscFunctionReturn(PETSC_SUCCESS);
1128: } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1129: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1130: for (i = 0; i < m; i++) {
1131: PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1132: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1133: for (j = 0; j < a->rlen[i]; j++) {
1134: #if defined(PETSC_USE_COMPLEX)
1135: if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1136: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1137: } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1138: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1139: } else if (PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1140: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1141: }
1142: #else
1143: if (a->val[shift + a->sliceheight * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1144: #endif
1145: }
1146: PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1147: }
1148: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1149: } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1150: PetscInt cnt = 0, jcnt;
1151: PetscScalar value;
1152: #if defined(PETSC_USE_COMPLEX)
1153: PetscBool realonly = PETSC_TRUE;
1154: for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1155: if (PetscImaginaryPart(a->val[i]) != 0.0) {
1156: realonly = PETSC_FALSE;
1157: break;
1158: }
1159: }
1160: #endif
1162: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1163: for (i = 0; i < m; i++) {
1164: jcnt = 0;
1165: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1166: for (j = 0; j < A->cmap->n; j++) {
1167: if (jcnt < a->rlen[i] && j == a->colidx[shift + a->sliceheight * j]) {
1168: value = a->val[cnt++];
1169: jcnt++;
1170: } else {
1171: value = 0.0;
1172: }
1173: #if defined(PETSC_USE_COMPLEX)
1174: if (realonly) {
1175: PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value)));
1176: } else {
1177: PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value)));
1178: }
1179: #else
1180: PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value));
1181: #endif
1182: }
1183: PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1184: }
1185: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1186: } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1187: PetscInt fshift = 1;
1188: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1189: #if defined(PETSC_USE_COMPLEX)
1190: PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n"));
1191: #else
1192: PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n"));
1193: #endif
1194: PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1195: for (i = 0; i < m; i++) {
1196: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1197: for (j = 0; j < a->rlen[i]; j++) {
1198: #if defined(PETSC_USE_COMPLEX)
1199: PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1200: #else
1201: PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)a->val[shift + a->sliceheight * j]));
1202: #endif
1203: }
1204: }
1205: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1206: } else if (format == PETSC_VIEWER_NATIVE) {
1207: for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1208: PetscInt row;
1209: PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1]));
1210: for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
1211: #if defined(PETSC_USE_COMPLEX)
1212: if (PetscImaginaryPart(a->val[j]) > 0.0) {
1213: PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1214: } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1215: PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j])));
1216: } else {
1217: PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j])));
1218: }
1219: #else
1220: PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)a->val[j]));
1221: #endif
1222: }
1223: }
1224: } else {
1225: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1226: if (A->factortype) {
1227: for (i = 0; i < m; i++) {
1228: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1229: PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1230: /* L part */
1231: for (j = shift; j < a->diag[i]; j += a->sliceheight) {
1232: #if defined(PETSC_USE_COMPLEX)
1233: if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0) {
1234: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1235: } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0) {
1236: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1237: } else {
1238: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1239: }
1240: #else
1241: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1242: #endif
1243: }
1244: /* diagonal */
1245: j = a->diag[i];
1246: #if defined(PETSC_USE_COMPLEX)
1247: if (PetscImaginaryPart(a->val[j]) > 0.0) {
1248: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j])));
1249: } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1250: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j]))));
1251: } else {
1252: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j])));
1253: }
1254: #else
1255: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j])));
1256: #endif
1258: /* U part */
1259: for (j = a->diag[i] + 1; j < shift + a->sliceheight * a->rlen[i]; j += a->sliceheight) {
1260: #if defined(PETSC_USE_COMPLEX)
1261: if (PetscImaginaryPart(a->val[j]) > 0.0) {
1262: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1263: } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1264: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1265: } else {
1266: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1267: }
1268: #else
1269: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1270: #endif
1271: }
1272: PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1273: }
1274: } else {
1275: for (i = 0; i < m; i++) {
1276: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1277: PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1278: for (j = 0; j < a->rlen[i]; j++) {
1279: #if defined(PETSC_USE_COMPLEX)
1280: if (PetscImaginaryPart(a->val[j]) > 0.0) {
1281: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1282: } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1283: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1284: } else {
1285: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1286: }
1287: #else
1288: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1289: #endif
1290: }
1291: PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1292: }
1293: }
1294: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1295: }
1296: PetscCall(PetscViewerFlush(viewer));
1297: PetscFunctionReturn(PETSC_SUCCESS);
1298: }
1300: #include <petscdraw.h>
1301: static PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa)
1302: {
1303: Mat A = (Mat)Aa;
1304: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1305: PetscInt i, j, m = A->rmap->n, shift;
1306: int color;
1307: PetscReal xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1308: PetscViewer viewer;
1309: PetscViewerFormat format;
1311: PetscFunctionBegin;
1312: PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer));
1313: PetscCall(PetscViewerGetFormat(viewer, &format));
1314: PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr));
1316: /* loop over matrix elements drawing boxes */
1318: if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1319: PetscDrawCollectiveBegin(draw);
1320: /* Blue for negative, Cyan for zero and Red for positive */
1321: color = PETSC_DRAW_BLUE;
1322: for (i = 0; i < m; i++) {
1323: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1324: y_l = m - i - 1.0;
1325: y_r = y_l + 1.0;
1326: for (j = 0; j < a->rlen[i]; j++) {
1327: x_l = a->colidx[shift + a->sliceheight * j];
1328: x_r = x_l + 1.0;
1329: if (PetscRealPart(a->val[shift + a->sliceheight * j]) >= 0.) continue;
1330: PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1331: }
1332: }
1333: color = PETSC_DRAW_CYAN;
1334: for (i = 0; i < m; i++) {
1335: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1336: y_l = m - i - 1.0;
1337: y_r = y_l + 1.0;
1338: for (j = 0; j < a->rlen[i]; j++) {
1339: x_l = a->colidx[shift + a->sliceheight * j];
1340: x_r = x_l + 1.0;
1341: if (a->val[shift + a->sliceheight * j] != 0.) continue;
1342: PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1343: }
1344: }
1345: color = PETSC_DRAW_RED;
1346: for (i = 0; i < m; i++) {
1347: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1348: y_l = m - i - 1.0;
1349: y_r = y_l + 1.0;
1350: for (j = 0; j < a->rlen[i]; j++) {
1351: x_l = a->colidx[shift + a->sliceheight * j];
1352: x_r = x_l + 1.0;
1353: if (PetscRealPart(a->val[shift + a->sliceheight * j]) <= 0.) continue;
1354: PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1355: }
1356: }
1357: PetscDrawCollectiveEnd(draw);
1358: } else {
1359: /* use contour shading to indicate magnitude of values */
1360: /* first determine max of all nonzero values */
1361: PetscReal minv = 0.0, maxv = 0.0;
1362: PetscInt count = 0;
1363: PetscDraw popup;
1364: for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1365: if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1366: }
1367: if (minv >= maxv) maxv = minv + PETSC_SMALL;
1368: PetscCall(PetscDrawGetPopup(draw, &popup));
1369: PetscCall(PetscDrawScalePopup(popup, minv, maxv));
1371: PetscDrawCollectiveBegin(draw);
1372: for (i = 0; i < m; i++) {
1373: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1374: y_l = m - i - 1.0;
1375: y_r = y_l + 1.0;
1376: for (j = 0; j < a->rlen[i]; j++) {
1377: x_l = a->colidx[shift + a->sliceheight * j];
1378: x_r = x_l + 1.0;
1379: color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv);
1380: PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1381: count++;
1382: }
1383: }
1384: PetscDrawCollectiveEnd(draw);
1385: }
1386: PetscFunctionReturn(PETSC_SUCCESS);
1387: }
1389: #include <petscdraw.h>
1390: static PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer)
1391: {
1392: PetscDraw draw;
1393: PetscReal xr, yr, xl, yl, h, w;
1394: PetscBool isnull;
1396: PetscFunctionBegin;
1397: PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1398: PetscCall(PetscDrawIsNull(draw, &isnull));
1399: if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
1401: xr = A->cmap->n;
1402: yr = A->rmap->n;
1403: h = yr / 10.0;
1404: w = xr / 10.0;
1405: xr += w;
1406: yr += h;
1407: xl = -w;
1408: yl = -h;
1409: PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr));
1410: PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer));
1411: PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A));
1412: PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL));
1413: PetscCall(PetscDrawSave(draw));
1414: PetscFunctionReturn(PETSC_SUCCESS);
1415: }
1417: PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer)
1418: {
1419: PetscBool iascii, isbinary, isdraw;
1421: PetscFunctionBegin;
1422: PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1423: PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1424: PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1425: if (iascii) {
1426: PetscCall(MatView_SeqSELL_ASCII(A, viewer));
1427: } else if (isbinary) {
1428: /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1429: } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer));
1430: PetscFunctionReturn(PETSC_SUCCESS);
1431: }
1433: PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode)
1434: {
1435: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1436: PetscInt i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k;
1437: MatScalar *vp;
1438: #if defined(PETSC_HAVE_CUDA)
1439: PetscInt totalchunks = 0;
1440: #endif
1442: PetscFunctionBegin;
1443: if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
1444: /* To do: compress out the unused elements */
1445: PetscCall(MatMarkDiagonal_SeqSELL(A));
1446: PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz));
1447: PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs));
1448: PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax));
1449: a->nonzerorowcnt = 0;
1450: /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1451: for (i = 0; i < a->totalslices; ++i) {
1452: shift = a->sliidx[i]; /* starting index of the slice */
1453: cp = a->colidx + shift; /* pointer to the column indices of the slice */
1454: vp = a->val + shift; /* pointer to the nonzero values of the slice */
1455: for (row_in_slice = 0; row_in_slice < a->sliceheight; ++row_in_slice) { /* loop over rows in the slice */
1456: row = a->sliceheight * i + row_in_slice;
1457: nrow = a->rlen[row]; /* number of nonzeros in row */
1458: /*
1459: Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1460: But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1461: */
1462: lastcol = 0;
1463: if (nrow > 0) { /* nonempty row */
1464: a->nonzerorowcnt++;
1465: lastcol = cp[a->sliceheight * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */
1466: } else if (!row_in_slice) { /* first row of the correct slice is empty */
1467: for (j = 1; j < a->sliceheight; j++) {
1468: if (a->rlen[a->sliceheight * i + j]) {
1469: lastcol = cp[j];
1470: break;
1471: }
1472: }
1473: } else {
1474: if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */
1475: }
1477: for (k = nrow; k < (a->sliidx[i + 1] - shift) / a->sliceheight; ++k) {
1478: cp[a->sliceheight * k + row_in_slice] = lastcol;
1479: vp[a->sliceheight * k + row_in_slice] = (MatScalar)0;
1480: }
1481: }
1482: }
1484: A->info.mallocs += a->reallocs;
1485: a->reallocs = 0;
1487: PetscCall(MatSeqSELLInvalidateDiagonal(A));
1488: #if defined(PETSC_HAVE_CUDA)
1489: if (!a->chunksize && a->totalslices) {
1490: a->chunksize = 64;
1491: while (a->chunksize < 1024 && 2 * a->chunksize <= a->sliidx[a->totalslices] / a->totalslices) a->chunksize *= 2;
1492: totalchunks = 1 + (a->sliidx[a->totalslices] - 1) / a->chunksize;
1493: }
1494: if (totalchunks != a->totalchunks) {
1495: PetscCall(PetscFree(a->chunk_slice_map));
1496: PetscCall(PetscMalloc1(totalchunks, &a->chunk_slice_map));
1497: a->totalchunks = totalchunks;
1498: }
1499: j = 0;
1500: for (i = 0; i < totalchunks; i++) {
1501: while (a->sliidx[j + 1] <= i * a->chunksize && j < a->totalslices) j++;
1502: a->chunk_slice_map[i] = j;
1503: }
1504: #endif
1505: PetscFunctionReturn(PETSC_SUCCESS);
1506: }
1508: PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info)
1509: {
1510: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1512: PetscFunctionBegin;
1513: info->block_size = 1.0;
1514: info->nz_allocated = a->maxallocmat;
1515: info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */
1516: info->nz_unneeded = (a->maxallocmat - a->sliidx[a->totalslices]);
1517: info->assemblies = A->num_ass;
1518: info->mallocs = A->info.mallocs;
1519: info->memory = 0; /* REVIEW ME */
1520: if (A->factortype) {
1521: info->fill_ratio_given = A->info.fill_ratio_given;
1522: info->fill_ratio_needed = A->info.fill_ratio_needed;
1523: info->factor_mallocs = A->info.factor_mallocs;
1524: } else {
1525: info->fill_ratio_given = 0;
1526: info->fill_ratio_needed = 0;
1527: info->factor_mallocs = 0;
1528: }
1529: PetscFunctionReturn(PETSC_SUCCESS);
1530: }
1532: PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
1533: {
1534: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1535: PetscInt shift, i, k, l, low, high, t, ii, row, col, nrow;
1536: PetscInt *cp, nonew = a->nonew, lastcol = -1;
1537: MatScalar *vp, value;
1538: #if defined(PETSC_HAVE_CUDA)
1539: PetscBool inserted = PETSC_FALSE;
1540: PetscInt mul = DEVICE_MEM_ALIGN / a->sliceheight;
1541: #endif
1543: PetscFunctionBegin;
1544: for (k = 0; k < m; k++) { /* loop over added rows */
1545: row = im[k];
1546: if (row < 0) continue;
1547: PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1548: shift = a->sliidx[row / a->sliceheight] + row % a->sliceheight; /* starting index of the row */
1549: cp = a->colidx + shift; /* pointer to the row */
1550: vp = a->val + shift; /* pointer to the row */
1551: nrow = a->rlen[row];
1552: low = 0;
1553: high = nrow;
1555: for (l = 0; l < n; l++) { /* loop over added columns */
1556: col = in[l];
1557: if (col < 0) continue;
1558: PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1559: if (a->roworiented) {
1560: value = v[l + k * n];
1561: } else {
1562: value = v[k + l * m];
1563: }
1564: if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1566: /* search in this row for the specified column, i indicates the column to be set */
1567: if (col <= lastcol) low = 0;
1568: else high = nrow;
1569: lastcol = col;
1570: while (high - low > 5) {
1571: t = (low + high) / 2;
1572: if (*(cp + a->sliceheight * t) > col) high = t;
1573: else low = t;
1574: }
1575: for (i = low; i < high; i++) {
1576: if (*(cp + a->sliceheight * i) > col) break;
1577: if (*(cp + a->sliceheight * i) == col) {
1578: if (is == ADD_VALUES) *(vp + a->sliceheight * i) += value;
1579: else *(vp + a->sliceheight * i) = value;
1580: #if defined(PETSC_HAVE_CUDA)
1581: inserted = PETSC_TRUE;
1582: #endif
1583: low = i + 1;
1584: goto noinsert;
1585: }
1586: }
1587: if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1588: if (nonew == 1) goto noinsert;
1589: PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1590: #if defined(PETSC_HAVE_CUDA)
1591: MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, mul);
1592: #else
1593: /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1594: MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, 1);
1595: #endif
1596: /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1597: for (ii = nrow - 1; ii >= i; ii--) {
1598: *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii);
1599: *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii);
1600: }
1601: a->rlen[row]++;
1602: *(cp + a->sliceheight * i) = col;
1603: *(vp + a->sliceheight * i) = value;
1604: a->nz++;
1605: A->nonzerostate++;
1606: #if defined(PETSC_HAVE_CUDA)
1607: inserted = PETSC_TRUE;
1608: #endif
1609: low = i + 1;
1610: high++;
1611: nrow++;
1612: noinsert:;
1613: }
1614: a->rlen[row] = nrow;
1615: }
1616: #if defined(PETSC_HAVE_CUDA)
1617: if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && inserted) A->offloadmask = PETSC_OFFLOAD_CPU;
1618: #endif
1619: PetscFunctionReturn(PETSC_SUCCESS);
1620: }
1622: PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str)
1623: {
1624: PetscFunctionBegin;
1625: /* If the two matrices have the same copy implementation, use fast copy. */
1626: if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1627: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1628: Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;
1630: PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different");
1631: PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]));
1632: } else {
1633: PetscCall(MatCopy_Basic(A, B, str));
1634: }
1635: PetscFunctionReturn(PETSC_SUCCESS);
1636: }
1638: PetscErrorCode MatSetUp_SeqSELL(Mat A)
1639: {
1640: PetscFunctionBegin;
1641: PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL));
1642: PetscFunctionReturn(PETSC_SUCCESS);
1643: }
1645: PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[])
1646: {
1647: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1649: PetscFunctionBegin;
1650: *array = a->val;
1651: PetscFunctionReturn(PETSC_SUCCESS);
1652: }
1654: PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[])
1655: {
1656: PetscFunctionBegin;
1657: PetscFunctionReturn(PETSC_SUCCESS);
1658: }
1660: PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha)
1661: {
1662: Mat_SeqSELL *a = (Mat_SeqSELL *)inA->data;
1663: MatScalar *aval = a->val;
1664: PetscScalar oalpha = alpha;
1665: PetscBLASInt one = 1, size;
1667: PetscFunctionBegin;
1668: PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size));
1669: PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
1670: PetscCall(PetscLogFlops(a->nz));
1671: PetscCall(MatSeqSELLInvalidateDiagonal(inA));
1672: #if defined(PETSC_HAVE_CUDA)
1673: if (inA->offloadmask != PETSC_OFFLOAD_UNALLOCATED) inA->offloadmask = PETSC_OFFLOAD_CPU;
1674: #endif
1675: PetscFunctionReturn(PETSC_SUCCESS);
1676: }
1678: PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a)
1679: {
1680: Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;
1682: PetscFunctionBegin;
1683: if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL));
1684: PetscCall(MatShift_Basic(Y, a));
1685: PetscFunctionReturn(PETSC_SUCCESS);
1686: }
1688: PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1689: {
1690: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1691: PetscScalar *x, sum, *t;
1692: const MatScalar *idiag = NULL, *mdiag;
1693: const PetscScalar *b, *xb;
1694: PetscInt n, m = A->rmap->n, i, j, shift;
1695: const PetscInt *diag;
1697: PetscFunctionBegin;
1698: its = its * lits;
1700: if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1701: if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift));
1702: a->fshift = fshift;
1703: a->omega = omega;
1705: diag = a->diag;
1706: t = a->ssor_work;
1707: idiag = a->idiag;
1708: mdiag = a->mdiag;
1710: PetscCall(VecGetArray(xx, &x));
1711: PetscCall(VecGetArrayRead(bb, &b));
1712: /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1713: PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented");
1714: PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented");
1715: PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
1717: if (flag & SOR_ZERO_INITIAL_GUESS) {
1718: if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1719: for (i = 0; i < m; i++) {
1720: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1721: sum = b[i];
1722: n = (diag[i] - shift) / a->sliceheight;
1723: for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1724: t[i] = sum;
1725: x[i] = sum * idiag[i];
1726: }
1727: xb = t;
1728: PetscCall(PetscLogFlops(a->nz));
1729: } else xb = b;
1730: if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1731: for (i = m - 1; i >= 0; i--) {
1732: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1733: sum = xb[i];
1734: n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1735: for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1736: if (xb == b) {
1737: x[i] = sum * idiag[i];
1738: } else {
1739: x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1740: }
1741: }
1742: PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1743: }
1744: its--;
1745: }
1746: while (its--) {
1747: if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1748: for (i = 0; i < m; i++) {
1749: /* lower */
1750: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1751: sum = b[i];
1752: n = (diag[i] - shift) / a->sliceheight;
1753: for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1754: t[i] = sum; /* save application of the lower-triangular part */
1755: /* upper */
1756: n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1757: for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1758: x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1759: }
1760: xb = t;
1761: PetscCall(PetscLogFlops(2.0 * a->nz));
1762: } else xb = b;
1763: if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1764: for (i = m - 1; i >= 0; i--) {
1765: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1766: sum = xb[i];
1767: if (xb == b) {
1768: /* whole matrix (no checkpointing available) */
1769: n = a->rlen[i];
1770: for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1771: x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
1772: } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1773: n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1774: for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1775: x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1776: }
1777: }
1778: if (xb == b) {
1779: PetscCall(PetscLogFlops(2.0 * a->nz));
1780: } else {
1781: PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1782: }
1783: }
1784: }
1785: PetscCall(VecRestoreArray(xx, &x));
1786: PetscCall(VecRestoreArrayRead(bb, &b));
1787: PetscFunctionReturn(PETSC_SUCCESS);
1788: }
1790: static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1791: MatGetRow_SeqSELL,
1792: MatRestoreRow_SeqSELL,
1793: MatMult_SeqSELL,
1794: /* 4*/ MatMultAdd_SeqSELL,
1795: MatMultTranspose_SeqSELL,
1796: MatMultTransposeAdd_SeqSELL,
1797: NULL,
1798: NULL,
1799: NULL,
1800: /* 10*/ NULL,
1801: NULL,
1802: NULL,
1803: MatSOR_SeqSELL,
1804: NULL,
1805: /* 15*/ MatGetInfo_SeqSELL,
1806: MatEqual_SeqSELL,
1807: MatGetDiagonal_SeqSELL,
1808: MatDiagonalScale_SeqSELL,
1809: NULL,
1810: /* 20*/ NULL,
1811: MatAssemblyEnd_SeqSELL,
1812: MatSetOption_SeqSELL,
1813: MatZeroEntries_SeqSELL,
1814: /* 24*/ NULL,
1815: NULL,
1816: NULL,
1817: NULL,
1818: NULL,
1819: /* 29*/ MatSetUp_SeqSELL,
1820: NULL,
1821: NULL,
1822: NULL,
1823: NULL,
1824: /* 34*/ MatDuplicate_SeqSELL,
1825: NULL,
1826: NULL,
1827: NULL,
1828: NULL,
1829: /* 39*/ NULL,
1830: NULL,
1831: NULL,
1832: MatGetValues_SeqSELL,
1833: MatCopy_SeqSELL,
1834: /* 44*/ NULL,
1835: MatScale_SeqSELL,
1836: MatShift_SeqSELL,
1837: NULL,
1838: NULL,
1839: /* 49*/ NULL,
1840: NULL,
1841: NULL,
1842: NULL,
1843: NULL,
1844: /* 54*/ MatFDColoringCreate_SeqXAIJ,
1845: NULL,
1846: NULL,
1847: NULL,
1848: NULL,
1849: /* 59*/ NULL,
1850: MatDestroy_SeqSELL,
1851: MatView_SeqSELL,
1852: NULL,
1853: NULL,
1854: /* 64*/ NULL,
1855: NULL,
1856: NULL,
1857: NULL,
1858: NULL,
1859: /* 69*/ NULL,
1860: NULL,
1861: NULL,
1862: NULL,
1863: NULL,
1864: /* 74*/ NULL,
1865: MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1866: NULL,
1867: NULL,
1868: NULL,
1869: /* 79*/ NULL,
1870: NULL,
1871: NULL,
1872: NULL,
1873: NULL,
1874: /* 84*/ NULL,
1875: NULL,
1876: NULL,
1877: NULL,
1878: NULL,
1879: /* 89*/ NULL,
1880: NULL,
1881: NULL,
1882: NULL,
1883: NULL,
1884: /* 94*/ NULL,
1885: NULL,
1886: NULL,
1887: NULL,
1888: NULL,
1889: /* 99*/ NULL,
1890: NULL,
1891: NULL,
1892: MatConjugate_SeqSELL,
1893: NULL,
1894: /*104*/ NULL,
1895: NULL,
1896: NULL,
1897: NULL,
1898: NULL,
1899: /*109*/ NULL,
1900: NULL,
1901: NULL,
1902: NULL,
1903: MatMissingDiagonal_SeqSELL,
1904: /*114*/ NULL,
1905: NULL,
1906: NULL,
1907: NULL,
1908: NULL,
1909: /*119*/ NULL,
1910: NULL,
1911: NULL,
1912: NULL,
1913: NULL,
1914: /*124*/ NULL,
1915: NULL,
1916: NULL,
1917: NULL,
1918: NULL,
1919: /*129*/ NULL,
1920: NULL,
1921: NULL,
1922: NULL,
1923: NULL,
1924: /*134*/ NULL,
1925: NULL,
1926: NULL,
1927: NULL,
1928: NULL,
1929: /*139*/ NULL,
1930: NULL,
1931: NULL,
1932: MatFDColoringSetUp_SeqXAIJ,
1933: NULL,
1934: /*144*/ NULL,
1935: NULL,
1936: NULL,
1937: NULL,
1938: NULL,
1939: NULL,
1940: /*150*/ NULL,
1941: NULL};
1943: static PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1944: {
1945: Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1947: PetscFunctionBegin;
1948: PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1950: /* allocate space for values if not already there */
1951: if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values));
1953: /* copy values over */
1954: PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]));
1955: PetscFunctionReturn(PETSC_SUCCESS);
1956: }
1958: static PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1959: {
1960: Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1962: PetscFunctionBegin;
1963: PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1964: PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first");
1965: PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices]));
1966: PetscFunctionReturn(PETSC_SUCCESS);
1967: }
1969: static PetscErrorCode MatSeqSELLGetFillRatio_SeqSELL(Mat mat, PetscReal *ratio)
1970: {
1971: Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1973: PetscFunctionBegin;
1974: if (a->totalslices && a->sliidx[a->totalslices]) {
1975: *ratio = (PetscReal)(a->sliidx[a->totalslices] - a->nz) / a->sliidx[a->totalslices];
1976: } else {
1977: *ratio = 0.0;
1978: }
1979: PetscFunctionReturn(PETSC_SUCCESS);
1980: }
1982: static PetscErrorCode MatSeqSELLGetMaxSliceWidth_SeqSELL(Mat mat, PetscInt *slicewidth)
1983: {
1984: Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1985: PetscInt i, current_slicewidth;
1987: PetscFunctionBegin;
1988: *slicewidth = 0;
1989: for (i = 0; i < a->totalslices; i++) {
1990: current_slicewidth = (a->sliidx[i + 1] - a->sliidx[i]) / a->sliceheight;
1991: if (current_slicewidth > *slicewidth) *slicewidth = current_slicewidth;
1992: }
1993: PetscFunctionReturn(PETSC_SUCCESS);
1994: }
1996: static PetscErrorCode MatSeqSELLGetAvgSliceWidth_SeqSELL(Mat mat, PetscReal *slicewidth)
1997: {
1998: Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
2000: PetscFunctionBegin;
2001: *slicewidth = 0;
2002: if (a->totalslices) { *slicewidth = (PetscReal)a->sliidx[a->totalslices] / a->sliceheight / a->totalslices; }
2003: PetscFunctionReturn(PETSC_SUCCESS);
2004: }
2006: static PetscErrorCode MatSeqSELLGetVarSliceSize_SeqSELL(Mat mat, PetscReal *variance)
2007: {
2008: Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
2009: PetscReal mean;
2010: PetscInt i, totalslices = a->totalslices, *sliidx = a->sliidx;
2012: PetscFunctionBegin;
2013: *variance = 0;
2014: if (totalslices) {
2015: mean = (PetscReal)sliidx[totalslices] / totalslices;
2016: for (i = 1; i <= totalslices; i++) { *variance += ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) * ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) / totalslices; }
2017: }
2018: PetscFunctionReturn(PETSC_SUCCESS);
2019: }
2021: static PetscErrorCode MatSeqSELLSetSliceHeight_SeqSELL(Mat A, PetscInt sliceheight)
2022: {
2023: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2025: PetscFunctionBegin;
2026: if (A->preallocated) PetscFunctionReturn(PETSC_SUCCESS);
2027: PetscCheck(a->sliceheight <= 0 || a->sliceheight == sliceheight, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot change slice height %" PetscInt_FMT " to %" PetscInt_FMT, a->sliceheight, sliceheight);
2028: a->sliceheight = sliceheight;
2029: #if defined(PETSC_HAVE_CUDA)
2030: PetscCheck(DEVICE_MEM_ALIGN % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "DEVICE_MEM_ALIGN is not divisible by the slice height %" PetscInt_FMT, sliceheight);
2031: #endif
2032: PetscFunctionReturn(PETSC_SUCCESS);
2033: }
2035: /*@C
2036: MatSeqSELLGetFillRatio - returns a ratio that indicates the irregularity of the matrix.
2038: Not Collective
2040: Input Parameter:
2041: . A - a MATSEQSELL matrix
2043: Output Parameter:
2044: . ratio - ratio of number of padded zeros to number of allocated elements
2046: Level: intermediate
2048: .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()`
2049: @*/
2050: PetscErrorCode MatSeqSELLGetFillRatio(Mat A, PetscReal *ratio)
2051: {
2052: PetscFunctionBegin;
2053: PetscUseMethod(A, "MatSeqSELLGetFillRatio_C", (Mat, PetscReal *), (A, ratio));
2054: PetscFunctionReturn(PETSC_SUCCESS);
2055: }
2057: /*@C
2058: MatSeqSELLGetMaxSliceWidth - returns the maximum slice width.
2060: Not Collective
2062: Input Parameter:
2063: . A - a MATSEQSELL matrix
2065: Output Parameter:
2066: . slicewidth - maximum slice width
2068: Level: intermediate
2070: .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()`
2071: @*/
2072: PetscErrorCode MatSeqSELLGetMaxSliceWidth(Mat A, PetscInt *slicewidth)
2073: {
2074: PetscFunctionBegin;
2075: PetscUseMethod(A, "MatSeqSELLGetMaxSliceWidth_C", (Mat, PetscInt *), (A, slicewidth));
2076: PetscFunctionReturn(PETSC_SUCCESS);
2077: }
2079: /*@C
2080: MatSeqSELLGetAvgSliceWidth - returns the average slice width.
2082: Not Collective
2084: Input Parameter:
2085: . A - a MATSEQSELL matrix
2087: Output Parameter:
2088: . slicewidth - average slice width
2090: Level: intermediate
2092: .seealso: `MATSEQSELL`, `MatSeqSELLGetMaxSliceWidth()`
2093: @*/
2094: PetscErrorCode MatSeqSELLGetAvgSliceWidth(Mat A, PetscReal *slicewidth)
2095: {
2096: PetscFunctionBegin;
2097: PetscUseMethod(A, "MatSeqSELLGetAvgSliceWidth_C", (Mat, PetscReal *), (A, slicewidth));
2098: PetscFunctionReturn(PETSC_SUCCESS);
2099: }
2101: /*@C
2102: MatSeqSELLSetSliceHeight - sets the slice height.
2104: Not Collective
2106: Input Parameters:
2107: + A - a MATSEQSELL matrix
2108: - sliceheight - slice height
2110: Notes:
2111: You cannot change the slice height once it have been set.
2113: The slice height must be set before MatSetUp() or MatXXXSetPreallocation() is called.
2115: Level: intermediate
2117: .seealso: `MATSEQSELL`, `MatSeqSELLGetVarSliceSize()`
2118: @*/
2119: PetscErrorCode MatSeqSELLSetSliceHeight(Mat A, PetscInt sliceheight)
2120: {
2121: PetscFunctionBegin;
2122: PetscUseMethod(A, "MatSeqSELLSetSliceHeight_C", (Mat, PetscInt), (A, sliceheight));
2123: PetscFunctionReturn(PETSC_SUCCESS);
2124: }
2126: /*@C
2127: MatSeqSELLGetVarSliceSize - returns the variance of the slice size.
2129: Not Collective
2131: Input Parameter:
2132: . A - a MATSEQSELL matrix
2134: Output Parameter:
2135: . variance - variance of the slice size
2137: Level: intermediate
2139: .seealso: `MATSEQSELL`, `MatSeqSELLSetSliceHeight()`
2140: @*/
2141: PetscErrorCode MatSeqSELLGetVarSliceSize(Mat A, PetscReal *variance)
2142: {
2143: PetscFunctionBegin;
2144: PetscUseMethod(A, "MatSeqSELLGetVarSliceSize_C", (Mat, PetscReal *), (A, variance));
2145: PetscFunctionReturn(PETSC_SUCCESS);
2146: }
2148: #if defined(PETSC_HAVE_CUDA)
2149: PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
2150: #endif
2152: PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
2153: {
2154: Mat_SeqSELL *b;
2155: PetscMPIInt size;
2157: PetscFunctionBegin;
2158: PetscCall(PetscCitationsRegister(citation, &cited));
2159: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
2160: PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1");
2162: PetscCall(PetscNew(&b));
2164: B->data = (void *)b;
2165: B->ops[0] = MatOps_Values;
2167: b->row = NULL;
2168: b->col = NULL;
2169: b->icol = NULL;
2170: b->reallocs = 0;
2171: b->ignorezeroentries = PETSC_FALSE;
2172: b->roworiented = PETSC_TRUE;
2173: b->nonew = 0;
2174: b->diag = NULL;
2175: b->solve_work = NULL;
2176: B->spptr = NULL;
2177: b->saved_values = NULL;
2178: b->idiag = NULL;
2179: b->mdiag = NULL;
2180: b->ssor_work = NULL;
2181: b->omega = 1.0;
2182: b->fshift = 0.0;
2183: b->idiagvalid = PETSC_FALSE;
2184: b->keepnonzeropattern = PETSC_FALSE;
2185: b->sliceheight = 0;
2187: PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL));
2188: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL));
2189: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL));
2190: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL));
2191: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL));
2192: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL));
2193: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ));
2194: #if defined(PETSC_HAVE_CUDA)
2195: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellcuda_C", MatConvert_SeqSELL_SeqSELLCUDA));
2196: #endif
2197: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetFillRatio_C", MatSeqSELLGetFillRatio_SeqSELL));
2198: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetMaxSliceWidth_C", MatSeqSELLGetMaxSliceWidth_SeqSELL));
2199: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetAvgSliceWidth_C", MatSeqSELLGetAvgSliceWidth_SeqSELL));
2200: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetVarSliceSize_C", MatSeqSELLGetVarSliceSize_SeqSELL));
2201: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetSliceHeight_C", MatSeqSELLSetSliceHeight_SeqSELL));
2203: PetscObjectOptionsBegin((PetscObject)B);
2204: {
2205: PetscInt newsh = -1;
2206: PetscBool flg;
2207: #if defined(PETSC_HAVE_CUDA)
2208: PetscInt chunksize = 0;
2209: #endif
2211: PetscCall(PetscOptionsInt("-mat_sell_slice_height", "Set the slice height used to store SELL matrix", "MatSELLSetSliceHeight", newsh, &newsh, &flg));
2212: if (flg) { PetscCall(MatSeqSELLSetSliceHeight(B, newsh)); }
2213: #if defined(PETSC_HAVE_CUDA)
2214: PetscCall(PetscOptionsInt("-mat_sell_chunk_size", "Set the chunksize for load-balanced CUDA kernels. Choices include 64,128,256,512,1024", NULL, chunksize, &chunksize, &flg));
2215: if (flg) {
2216: PetscCheck(chunksize >= 64 && chunksize <= 1024 && chunksize % 64 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "chunksize must be a number in {64,128,256,512,1024}: value %" PetscInt_FMT, chunksize);
2217: b->chunksize = chunksize;
2218: }
2219: #endif
2220: }
2221: PetscOptionsEnd();
2222: PetscFunctionReturn(PETSC_SUCCESS);
2223: }
2225: /*
2226: Given a matrix generated with MatGetFactor() duplicates all the information in A into B
2227: */
2228: static PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
2229: {
2230: Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
2231: PetscInt i, m = A->rmap->n;
2232: PetscInt totalslices = a->totalslices;
2234: PetscFunctionBegin;
2235: C->factortype = A->factortype;
2236: c->row = NULL;
2237: c->col = NULL;
2238: c->icol = NULL;
2239: c->reallocs = 0;
2240: C->assembled = PETSC_TRUE;
2242: PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
2243: PetscCall(PetscLayoutReference(A->cmap, &C->cmap));
2245: PetscCall(PetscMalloc1(a->sliceheight * totalslices, &c->rlen));
2246: PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx));
2248: for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
2249: for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];
2251: /* allocate the matrix space */
2252: if (mallocmatspace) {
2253: PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx));
2255: c->singlemalloc = PETSC_TRUE;
2257: if (m > 0) {
2258: PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat));
2259: if (cpvalues == MAT_COPY_VALUES) {
2260: PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat));
2261: } else {
2262: PetscCall(PetscArrayzero(c->val, a->maxallocmat));
2263: }
2264: }
2265: }
2267: c->ignorezeroentries = a->ignorezeroentries;
2268: c->roworiented = a->roworiented;
2269: c->nonew = a->nonew;
2270: if (a->diag) {
2271: PetscCall(PetscMalloc1(m, &c->diag));
2272: for (i = 0; i < m; i++) c->diag[i] = a->diag[i];
2273: } else c->diag = NULL;
2275: c->solve_work = NULL;
2276: c->saved_values = NULL;
2277: c->idiag = NULL;
2278: c->ssor_work = NULL;
2279: c->keepnonzeropattern = a->keepnonzeropattern;
2280: c->free_val = PETSC_TRUE;
2281: c->free_colidx = PETSC_TRUE;
2283: c->maxallocmat = a->maxallocmat;
2284: c->maxallocrow = a->maxallocrow;
2285: c->rlenmax = a->rlenmax;
2286: c->nz = a->nz;
2287: C->preallocated = PETSC_TRUE;
2289: c->nonzerorowcnt = a->nonzerorowcnt;
2290: C->nonzerostate = A->nonzerostate;
2292: PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
2293: PetscFunctionReturn(PETSC_SUCCESS);
2294: }
2296: PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B)
2297: {
2298: PetscFunctionBegin;
2299: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2300: PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
2301: if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2302: PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
2303: PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE));
2304: PetscFunctionReturn(PETSC_SUCCESS);
2305: }
2307: /*MC
2308: MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2309: based on the sliced Ellpack format, {cite}`zhangellpack2018`
2311: Options Database Key:
2312: . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()`
2314: Level: beginner
2316: .seealso: `Mat`, `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2317: M*/
2319: /*MC
2320: MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices, {cite}`zhangellpack2018`
2322: This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator,
2323: and `MATMPISELL` otherwise. As a result, for single process communicators,
2324: `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
2325: for communicators controlling multiple processes. It is recommended that you call both of
2326: the above preallocation routines for simplicity.
2328: Options Database Key:
2329: . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2331: Level: beginner
2333: Notes:
2334: This format is only supported for real scalars, double precision, and 32-bit indices (the defaults).
2336: It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2337: non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2339: Developer Notes:
2340: On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2342: The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2343: .vb
2344: (2 0 3 4)
2345: Consider the matrix A = (5 0 6 0)
2346: (0 0 7 8)
2347: (0 0 9 9)
2349: symbolically the Ellpack format can be written as
2351: (2 3 4 |) (0 2 3 |)
2352: v = (5 6 0 |) colidx = (0 2 2 |)
2353: -------- ---------
2354: (7 8 |) (2 3 |)
2355: (9 9 |) (2 3 |)
2357: The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2358: Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2359: zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2361: The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3)
2363: .ve
2365: See `MatMult_SeqSELL()` for how this format is used with the SIMD operations to achieve high performance.
2367: .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2368: M*/
2370: /*@C
2371: MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format.
2373: Collective
2375: Input Parameters:
2376: + comm - MPI communicator, set to `PETSC_COMM_SELF`
2377: . m - number of rows
2378: . n - number of columns
2379: . rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided
2380: - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL
2382: Output Parameter:
2383: . A - the matrix
2385: Level: intermediate
2387: Notes:
2388: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2389: MatXXXXSetPreallocation() paradigm instead of this routine directly.
2390: [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`]
2392: Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
2393: Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
2394: allocation.
2396: .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATMPISELL`
2397: @*/
2398: PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A)
2399: {
2400: PetscFunctionBegin;
2401: PetscCall(MatCreate(comm, A));
2402: PetscCall(MatSetSizes(*A, m, n, m, n));
2403: PetscCall(MatSetType(*A, MATSEQSELL));
2404: PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen));
2405: PetscFunctionReturn(PETSC_SUCCESS);
2406: }
2408: PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg)
2409: {
2410: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
2411: PetscInt totalslices = a->totalslices;
2413: PetscFunctionBegin;
2414: /* If the matrix dimensions are not equal,or no of nonzeros */
2415: if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2416: *flg = PETSC_FALSE;
2417: PetscFunctionReturn(PETSC_SUCCESS);
2418: }
2419: /* if the a->colidx are the same */
2420: PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg));
2421: if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
2422: /* if a->val are the same */
2423: PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg));
2424: PetscFunctionReturn(PETSC_SUCCESS);
2425: }
2427: PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2428: {
2429: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2431: PetscFunctionBegin;
2432: a->idiagvalid = PETSC_FALSE;
2433: PetscFunctionReturn(PETSC_SUCCESS);
2434: }
2436: PetscErrorCode MatConjugate_SeqSELL(Mat A)
2437: {
2438: #if defined(PETSC_USE_COMPLEX)
2439: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2440: PetscInt i;
2441: PetscScalar *val = a->val;
2443: PetscFunctionBegin;
2444: for (i = 0; i < a->sliidx[a->totalslices]; i++) { val[i] = PetscConj(val[i]); }
2445: #if defined(PETSC_HAVE_CUDA)
2446: if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
2447: #endif
2448: #else
2449: PetscFunctionBegin;
2450: #endif
2451: PetscFunctionReturn(PETSC_SUCCESS);
2452: }