Actual source code: sell.c
2: /*
3: Defines the basic matrix operations for the SELL matrix storage format.
4: */
5: #include <../src/mat/impls/sell/seq/sell.h>
6: #include <petscblaslapack.h>
7: #include <petsc/private/kernels/blocktranspose.h>
9: static PetscBool cited = PETSC_FALSE;
10: static const char citation[] =
11: "@inproceedings{ZhangELLPACK2018,\n"
12: " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
13: " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
14: " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
15: " year = 2018\n"
16: "}\n";
18: #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
20: #include <immintrin.h>
22: #if !defined(_MM_SCALE_8)
23: #define _MM_SCALE_8 8
24: #endif
26: #if defined(__AVX512F__)
27: /* these do not work
28: vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
29: vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
30: */
31: #define AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
32: /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
33: vec_idx = _mm256_loadu_si256((__m256i const*)acolidx); \
34: vec_vals = _mm512_loadu_pd(aval); \
35: vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); \
36: vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y)
37: #elif defined(__AVX2__) && defined(__FMA__)
38: #define AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
39: vec_vals = _mm256_loadu_pd(aval); \
40: vec_idx = _mm_loadu_si128((__m128i const*)acolidx); /* SSE2 */ \
41: vec_x = _mm256_i32gather_pd(x,vec_idx,_MM_SCALE_8); \
42: vec_y = _mm256_fmadd_pd(vec_x,vec_vals,vec_y)
43: #endif
44: #endif /* PETSC_HAVE_IMMINTRIN_H */
46: /*@C
47: MatSeqSELLSetPreallocation - For good matrix assembly performance
48: the user should preallocate the matrix storage by setting the parameter nz
49: (or the array nnz). By setting these parameters accurately, performance
50: during matrix assembly can be increased significantly.
52: Collective
54: Input Parameters:
55: + B - The matrix
56: . nz - number of nonzeros per row (same for all rows)
57: - nnz - array containing the number of nonzeros in the various rows
58: (possibly different for each row) or NULL
60: Notes:
61: If nnz is given then nz is ignored.
63: Specify the preallocated storage with either nz or nnz (not both).
64: Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
65: allocation. For large problems you MUST preallocate memory or you
66: will get TERRIBLE performance, see the users' manual chapter on matrices.
68: You can call MatGetInfo() to get information on how effective the preallocation was;
69: for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
70: You can also run with the option -info and look for messages with the string
71: malloc in them to see if additional memory allocation was needed.
73: Developers: Use nz of MAT_SKIP_ALLOCATION to not allocate any space for the matrix
74: entries or columns indices.
76: The maximum number of nonzeos in any row should be as accurate as possible.
77: If it is underestimated, you will get bad performance due to reallocation
78: (MatSeqXSELLReallocateSELL).
80: Level: intermediate
82: .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatGetInfo()
84: @*/
85: PetscErrorCode MatSeqSELLSetPreallocation(Mat B,PetscInt rlenmax,const PetscInt rlen[])
86: {
89: PetscTryMethod(B,"MatSeqSELLSetPreallocation_C",(Mat,PetscInt,const PetscInt[]),(B,rlenmax,rlen));
90: return 0;
91: }
93: PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B,PetscInt maxallocrow,const PetscInt rlen[])
94: {
95: Mat_SeqSELL *b;
96: PetscInt i,j,totalslices;
97: PetscBool skipallocation=PETSC_FALSE,realalloc=PETSC_FALSE;
99: if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
100: if (maxallocrow == MAT_SKIP_ALLOCATION) {
101: skipallocation = PETSC_TRUE;
102: maxallocrow = 0;
103: }
105: PetscLayoutSetUp(B->rmap);
106: PetscLayoutSetUp(B->cmap);
108: /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
109: if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
111: if (rlen) {
112: for (i=0; i<B->rmap->n; i++) {
115: }
116: }
118: B->preallocated = PETSC_TRUE;
120: b = (Mat_SeqSELL*)B->data;
122: totalslices = PetscCeilInt(B->rmap->n,8);
123: b->totalslices = totalslices;
124: if (!skipallocation) {
125: if (B->rmap->n & 0x07) PetscInfo(B,"Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n",B->rmap->n);
127: if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
128: PetscMalloc1(totalslices+1,&b->sliidx);
129: PetscLogObjectMemory((PetscObject)B,(totalslices+1)*sizeof(PetscInt));
130: }
131: if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
132: if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
133: else if (maxallocrow < 0) maxallocrow = 1;
134: for (i=0; i<=totalslices; i++) b->sliidx[i] = i*8*maxallocrow;
135: } else {
136: maxallocrow = 0;
137: b->sliidx[0] = 0;
138: for (i=1; i<totalslices; i++) {
139: b->sliidx[i] = 0;
140: for (j=0;j<8;j++) {
141: b->sliidx[i] = PetscMax(b->sliidx[i],rlen[8*(i-1)+j]);
142: }
143: maxallocrow = PetscMax(b->sliidx[i],maxallocrow);
144: PetscIntSumError(b->sliidx[i-1],8*b->sliidx[i],&b->sliidx[i]);
145: }
146: /* last slice */
147: b->sliidx[totalslices] = 0;
148: for (j=(totalslices-1)*8;j<B->rmap->n;j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices],rlen[j]);
149: maxallocrow = PetscMax(b->sliidx[totalslices],maxallocrow);
150: b->sliidx[totalslices] = b->sliidx[totalslices-1] + 8*b->sliidx[totalslices];
151: }
153: /* allocate space for val, colidx, rlen */
154: /* FIXME: should B's old memory be unlogged? */
155: MatSeqXSELLFreeSELL(B,&b->val,&b->colidx);
156: /* FIXME: assuming an element of the bit array takes 8 bits */
157: PetscMalloc2(b->sliidx[totalslices],&b->val,b->sliidx[totalslices],&b->colidx);
158: PetscLogObjectMemory((PetscObject)B,b->sliidx[totalslices]*(sizeof(PetscScalar)+sizeof(PetscInt)));
159: /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
160: PetscCalloc1(8*totalslices,&b->rlen);
161: PetscLogObjectMemory((PetscObject)B,8*totalslices*sizeof(PetscInt));
163: b->singlemalloc = PETSC_TRUE;
164: b->free_val = PETSC_TRUE;
165: b->free_colidx = PETSC_TRUE;
166: } else {
167: b->free_val = PETSC_FALSE;
168: b->free_colidx = PETSC_FALSE;
169: }
171: b->nz = 0;
172: b->maxallocrow = maxallocrow;
173: b->rlenmax = maxallocrow;
174: b->maxallocmat = b->sliidx[totalslices];
175: B->info.nz_unneeded = (double)b->maxallocmat;
176: if (realalloc) {
177: MatSetOption(B,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);
178: }
179: return 0;
180: }
182: PetscErrorCode MatGetRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
183: {
184: Mat_SeqSELL *a = (Mat_SeqSELL*)A->data;
185: PetscInt shift;
188: if (nz) *nz = a->rlen[row];
189: shift = a->sliidx[row>>3]+(row&0x07);
190: if (!a->getrowcols) {
192: PetscMalloc2(a->rlenmax,&a->getrowcols,a->rlenmax,&a->getrowvals);
193: }
194: if (idx) {
195: PetscInt j;
196: for (j=0; j<a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift+8*j];
197: *idx = a->getrowcols;
198: }
199: if (v) {
200: PetscInt j;
201: for (j=0; j<a->rlen[row]; j++) a->getrowvals[j] = a->val[shift+8*j];
202: *v = a->getrowvals;
203: }
204: return 0;
205: }
207: PetscErrorCode MatRestoreRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
208: {
209: return 0;
210: }
212: PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype,MatReuse reuse,Mat *newmat)
213: {
214: Mat B;
215: Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
216: PetscInt i;
218: if (reuse == MAT_REUSE_MATRIX) {
219: B = *newmat;
220: MatZeroEntries(B);
221: } else {
222: MatCreate(PetscObjectComm((PetscObject)A),&B);
223: MatSetSizes(B,A->rmap->n,A->cmap->n,A->rmap->N,A->cmap->N);
224: MatSetType(B,MATSEQAIJ);
225: MatSeqAIJSetPreallocation(B,0,a->rlen);
226: }
228: for (i=0; i<A->rmap->n; i++) {
229: PetscInt nz = 0,*cols = NULL;
230: PetscScalar *vals = NULL;
232: MatGetRow_SeqSELL(A,i,&nz,&cols,&vals);
233: MatSetValues(B,1,&i,nz,cols,vals,INSERT_VALUES);
234: MatRestoreRow_SeqSELL(A,i,&nz,&cols,&vals);
235: }
237: MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);
238: MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);
239: B->rmap->bs = A->rmap->bs;
241: if (reuse == MAT_INPLACE_MATRIX) {
242: MatHeaderReplace(A,&B);
243: } else {
244: *newmat = B;
245: }
246: return 0;
247: }
249: #include <../src/mat/impls/aij/seq/aij.h>
251: PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A,MatType newtype,MatReuse reuse,Mat *newmat)
252: {
253: Mat B;
254: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
255: PetscInt *ai=a->i,m=A->rmap->N,n=A->cmap->N,i,*rowlengths,row,ncols;
256: const PetscInt *cols;
257: const PetscScalar *vals;
260: if (reuse == MAT_REUSE_MATRIX) {
261: B = *newmat;
262: } else {
263: if (PetscDefined(USE_DEBUG) || !a->ilen) {
264: PetscMalloc1(m,&rowlengths);
265: for (i=0; i<m; i++) {
266: rowlengths[i] = ai[i+1] - ai[i];
267: }
268: }
269: if (PetscDefined(USE_DEBUG) && a->ilen) {
270: PetscBool eq;
271: PetscMemcmp(rowlengths,a->ilen,m*sizeof(PetscInt),&eq);
273: PetscFree(rowlengths);
274: rowlengths = a->ilen;
275: } else if (a->ilen) rowlengths = a->ilen;
276: MatCreate(PetscObjectComm((PetscObject)A),&B);
277: MatSetSizes(B,m,n,m,n);
278: MatSetType(B,MATSEQSELL);
279: MatSeqSELLSetPreallocation(B,0,rowlengths);
280: if (rowlengths != a->ilen) PetscFree(rowlengths);
281: }
283: for (row=0; row<m; row++) {
284: MatGetRow_SeqAIJ(A,row,&ncols,(PetscInt**)&cols,(PetscScalar**)&vals);
285: MatSetValues_SeqSELL(B,1,&row,ncols,cols,vals,INSERT_VALUES);
286: MatRestoreRow_SeqAIJ(A,row,&ncols,(PetscInt**)&cols,(PetscScalar**)&vals);
287: }
288: MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);
289: MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);
290: B->rmap->bs = A->rmap->bs;
292: if (reuse == MAT_INPLACE_MATRIX) {
293: MatHeaderReplace(A,&B);
294: } else {
295: *newmat = B;
296: }
297: return 0;
298: }
300: PetscErrorCode MatMult_SeqSELL(Mat A,Vec xx,Vec yy)
301: {
302: Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
303: PetscScalar *y;
304: const PetscScalar *x;
305: const MatScalar *aval=a->val;
306: PetscInt totalslices=a->totalslices;
307: const PetscInt *acolidx=a->colidx;
308: PetscInt i,j;
309: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
310: __m512d vec_x,vec_y,vec_vals;
311: __m256i vec_idx;
312: __mmask8 mask;
313: __m512d vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
314: __m256i vec_idx2,vec_idx3,vec_idx4;
315: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
316: __m128i vec_idx;
317: __m256d vec_x,vec_y,vec_y2,vec_vals;
318: MatScalar yval;
319: PetscInt r,rows_left,row,nnz_in_row;
320: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
321: __m128d vec_x_tmp;
322: __m256d vec_x,vec_y,vec_y2,vec_vals;
323: MatScalar yval;
324: PetscInt r,rows_left,row,nnz_in_row;
325: #else
326: PetscScalar sum[8];
327: #endif
329: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
330: #pragma disjoint(*x,*y,*aval)
331: #endif
333: VecGetArrayRead(xx,&x);
334: VecGetArray(yy,&y);
335: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
336: for (i=0; i<totalslices; i++) { /* loop over slices */
337: PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
338: PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
340: vec_y = _mm512_setzero_pd();
341: vec_y2 = _mm512_setzero_pd();
342: vec_y3 = _mm512_setzero_pd();
343: vec_y4 = _mm512_setzero_pd();
345: j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
346: switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
347: case 3:
348: AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
349: acolidx += 8; aval += 8;
350: AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
351: acolidx += 8; aval += 8;
352: AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
353: acolidx += 8; aval += 8;
354: j += 3;
355: break;
356: case 2:
357: AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
358: acolidx += 8; aval += 8;
359: AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
360: acolidx += 8; aval += 8;
361: j += 2;
362: break;
363: case 1:
364: AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
365: acolidx += 8; aval += 8;
366: j += 1;
367: break;
368: }
369: #pragma novector
370: for (; j<(a->sliidx[i+1]>>3); j+=4) {
371: AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
372: acolidx += 8; aval += 8;
373: AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
374: acolidx += 8; aval += 8;
375: AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
376: acolidx += 8; aval += 8;
377: AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
378: acolidx += 8; aval += 8;
379: }
381: vec_y = _mm512_add_pd(vec_y,vec_y2);
382: vec_y = _mm512_add_pd(vec_y,vec_y3);
383: vec_y = _mm512_add_pd(vec_y,vec_y4);
384: if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
385: mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
386: _mm512_mask_storeu_pd(&y[8*i],mask,vec_y);
387: } else {
388: _mm512_storeu_pd(&y[8*i],vec_y);
389: }
390: }
391: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
392: for (i=0; i<totalslices; i++) { /* loop over full slices */
393: PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
394: PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
396: /* last slice may have padding rows. Don't use vectorization. */
397: if (i == totalslices-1 && (A->rmap->n & 0x07)) {
398: rows_left = A->rmap->n - 8*i;
399: for (r=0; r<rows_left; ++r) {
400: yval = (MatScalar)0;
401: row = 8*i + r;
402: nnz_in_row = a->rlen[row];
403: for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
404: y[row] = yval;
405: }
406: break;
407: }
409: vec_y = _mm256_setzero_pd();
410: vec_y2 = _mm256_setzero_pd();
412: /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
413: #pragma novector
414: #pragma unroll(2)
415: for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
416: AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
417: aval += 4; acolidx += 4;
418: AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y2);
419: aval += 4; acolidx += 4;
420: }
422: _mm256_storeu_pd(y+i*8,vec_y);
423: _mm256_storeu_pd(y+i*8+4,vec_y2);
424: }
425: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
426: for (i=0; i<totalslices; i++) { /* loop over full slices */
427: PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
428: PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
430: vec_y = _mm256_setzero_pd();
431: vec_y2 = _mm256_setzero_pd();
433: /* last slice may have padding rows. Don't use vectorization. */
434: if (i == totalslices-1 && (A->rmap->n & 0x07)) {
435: rows_left = A->rmap->n - 8*i;
436: for (r=0; r<rows_left; ++r) {
437: yval = (MatScalar)0;
438: row = 8*i + r;
439: nnz_in_row = a->rlen[row];
440: for (j=0; j<nnz_in_row; ++j) yval += aval[8*j + r] * x[acolidx[8*j + r]];
441: y[row] = yval;
442: }
443: break;
444: }
446: /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
447: #pragma novector
448: #pragma unroll(2)
449: for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
450: vec_vals = _mm256_loadu_pd(aval);
451: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
452: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
453: vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
454: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
455: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
456: vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
457: vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
458: aval += 4;
460: vec_vals = _mm256_loadu_pd(aval);
461: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
462: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
463: vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
464: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
465: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
466: vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
467: vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
468: aval += 4;
469: }
471: _mm256_storeu_pd(y + i*8, vec_y);
472: _mm256_storeu_pd(y + i*8 + 4, vec_y2);
473: }
474: #else
475: for (i=0; i<totalslices; i++) { /* loop over slices */
476: for (j=0; j<8; j++) sum[j] = 0.0;
477: for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
478: sum[0] += aval[j] * x[acolidx[j]];
479: sum[1] += aval[j+1] * x[acolidx[j+1]];
480: sum[2] += aval[j+2] * x[acolidx[j+2]];
481: sum[3] += aval[j+3] * x[acolidx[j+3]];
482: sum[4] += aval[j+4] * x[acolidx[j+4]];
483: sum[5] += aval[j+5] * x[acolidx[j+5]];
484: sum[6] += aval[j+6] * x[acolidx[j+6]];
485: sum[7] += aval[j+7] * x[acolidx[j+7]];
486: }
487: if (i == totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
488: for (j=0; j<(A->rmap->n & 0x07); j++) y[8*i+j] = sum[j];
489: } else {
490: for (j=0; j<8; j++) y[8*i+j] = sum[j];
491: }
492: }
493: #endif
495: PetscLogFlops(2.0*a->nz-a->nonzerorowcnt); /* theoretical minimal FLOPs */
496: VecRestoreArrayRead(xx,&x);
497: VecRestoreArray(yy,&y);
498: return 0;
499: }
501: #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
502: PetscErrorCode MatMultAdd_SeqSELL(Mat A,Vec xx,Vec yy,Vec zz)
503: {
504: Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
505: PetscScalar *y,*z;
506: const PetscScalar *x;
507: const MatScalar *aval=a->val;
508: PetscInt totalslices=a->totalslices;
509: const PetscInt *acolidx=a->colidx;
510: PetscInt i,j;
511: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
512: __m512d vec_x,vec_y,vec_vals;
513: __m256i vec_idx;
514: __mmask8 mask;
515: __m512d vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
516: __m256i vec_idx2,vec_idx3,vec_idx4;
517: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
518: __m128d vec_x_tmp;
519: __m256d vec_x,vec_y,vec_y2,vec_vals;
520: MatScalar yval;
521: PetscInt r,row,nnz_in_row;
522: #else
523: PetscScalar sum[8];
524: #endif
526: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
527: #pragma disjoint(*x,*y,*aval)
528: #endif
530: VecGetArrayRead(xx,&x);
531: VecGetArrayPair(yy,zz,&y,&z);
532: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
533: for (i=0; i<totalslices; i++) { /* loop over slices */
534: PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
535: PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
537: if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
538: mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
539: vec_y = _mm512_mask_loadu_pd(vec_y,mask,&y[8*i]);
540: } else {
541: vec_y = _mm512_loadu_pd(&y[8*i]);
542: }
543: vec_y2 = _mm512_setzero_pd();
544: vec_y3 = _mm512_setzero_pd();
545: vec_y4 = _mm512_setzero_pd();
547: j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
548: switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
549: case 3:
550: AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
551: acolidx += 8; aval += 8;
552: AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
553: acolidx += 8; aval += 8;
554: AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
555: acolidx += 8; aval += 8;
556: j += 3;
557: break;
558: case 2:
559: AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
560: acolidx += 8; aval += 8;
561: AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
562: acolidx += 8; aval += 8;
563: j += 2;
564: break;
565: case 1:
566: AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
567: acolidx += 8; aval += 8;
568: j += 1;
569: break;
570: }
571: #pragma novector
572: for (; j<(a->sliidx[i+1]>>3); j+=4) {
573: AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
574: acolidx += 8; aval += 8;
575: AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
576: acolidx += 8; aval += 8;
577: AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
578: acolidx += 8; aval += 8;
579: AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
580: acolidx += 8; aval += 8;
581: }
583: vec_y = _mm512_add_pd(vec_y,vec_y2);
584: vec_y = _mm512_add_pd(vec_y,vec_y3);
585: vec_y = _mm512_add_pd(vec_y,vec_y4);
586: if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
587: _mm512_mask_storeu_pd(&z[8*i],mask,vec_y);
588: } else {
589: _mm512_storeu_pd(&z[8*i],vec_y);
590: }
591: }
592: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
593: for (i=0; i<totalslices; i++) { /* loop over full slices */
594: PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
595: PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
597: /* last slice may have padding rows. Don't use vectorization. */
598: if (i == totalslices-1 && (A->rmap->n & 0x07)) {
599: for (r=0; r<(A->rmap->n & 0x07); ++r) {
600: row = 8*i + r;
601: yval = (MatScalar)0.0;
602: nnz_in_row = a->rlen[row];
603: for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
604: z[row] = y[row] + yval;
605: }
606: break;
607: }
609: vec_y = _mm256_loadu_pd(y+8*i);
610: vec_y2 = _mm256_loadu_pd(y+8*i+4);
612: /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
613: for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
614: vec_vals = _mm256_loadu_pd(aval);
615: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
616: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
617: vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
618: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
619: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
620: vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
621: vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
622: aval += 4;
624: vec_vals = _mm256_loadu_pd(aval);
625: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
626: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
627: vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
628: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
629: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
630: vec_x = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
631: vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
632: aval += 4;
633: }
635: _mm256_storeu_pd(z+i*8,vec_y);
636: _mm256_storeu_pd(z+i*8+4,vec_y2);
637: }
638: #else
639: for (i=0; i<totalslices; i++) { /* loop over slices */
640: for (j=0; j<8; j++) sum[j] = 0.0;
641: for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
642: sum[0] += aval[j] * x[acolidx[j]];
643: sum[1] += aval[j+1] * x[acolidx[j+1]];
644: sum[2] += aval[j+2] * x[acolidx[j+2]];
645: sum[3] += aval[j+3] * x[acolidx[j+3]];
646: sum[4] += aval[j+4] * x[acolidx[j+4]];
647: sum[5] += aval[j+5] * x[acolidx[j+5]];
648: sum[6] += aval[j+6] * x[acolidx[j+6]];
649: sum[7] += aval[j+7] * x[acolidx[j+7]];
650: }
651: if (i == totalslices-1 && (A->rmap->n & 0x07)) {
652: for (j=0; j<(A->rmap->n & 0x07); j++) z[8*i+j] = y[8*i+j] + sum[j];
653: } else {
654: for (j=0; j<8; j++) z[8*i+j] = y[8*i+j] + sum[j];
655: }
656: }
657: #endif
659: PetscLogFlops(2.0*a->nz);
660: VecRestoreArrayRead(xx,&x);
661: VecRestoreArrayPair(yy,zz,&y,&z);
662: return 0;
663: }
665: PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A,Vec xx,Vec zz,Vec yy)
666: {
667: Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
668: PetscScalar *y;
669: const PetscScalar *x;
670: const MatScalar *aval=a->val;
671: const PetscInt *acolidx=a->colidx;
672: PetscInt i,j,r,row,nnz_in_row,totalslices=a->totalslices;
674: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
675: #pragma disjoint(*x,*y,*aval)
676: #endif
678: if (A->symmetric) {
679: MatMultAdd_SeqSELL(A,xx,zz,yy);
680: return 0;
681: }
682: if (zz != yy) VecCopy(zz,yy);
683: VecGetArrayRead(xx,&x);
684: VecGetArray(yy,&y);
685: for (i=0; i<a->totalslices; i++) { /* loop over slices */
686: if (i == totalslices-1 && (A->rmap->n & 0x07)) {
687: for (r=0; r<(A->rmap->n & 0x07); ++r) {
688: row = 8*i + r;
689: nnz_in_row = a->rlen[row];
690: for (j=0; j<nnz_in_row; ++j) y[acolidx[8*j+r]] += aval[8*j+r] * x[row];
691: }
692: break;
693: }
694: for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
695: y[acolidx[j]] += aval[j] * x[8*i];
696: y[acolidx[j+1]] += aval[j+1] * x[8*i+1];
697: y[acolidx[j+2]] += aval[j+2] * x[8*i+2];
698: y[acolidx[j+3]] += aval[j+3] * x[8*i+3];
699: y[acolidx[j+4]] += aval[j+4] * x[8*i+4];
700: y[acolidx[j+5]] += aval[j+5] * x[8*i+5];
701: y[acolidx[j+6]] += aval[j+6] * x[8*i+6];
702: y[acolidx[j+7]] += aval[j+7] * x[8*i+7];
703: }
704: }
705: PetscLogFlops(2.0*a->sliidx[a->totalslices]);
706: VecRestoreArrayRead(xx,&x);
707: VecRestoreArray(yy,&y);
708: return 0;
709: }
711: PetscErrorCode MatMultTranspose_SeqSELL(Mat A,Vec xx,Vec yy)
712: {
713: if (A->symmetric) {
714: MatMult_SeqSELL(A,xx,yy);
715: } else {
716: VecSet(yy,0.0);
717: MatMultTransposeAdd_SeqSELL(A,xx,yy,yy);
718: }
719: return 0;
720: }
722: /*
723: Checks for missing diagonals
724: */
725: PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A,PetscBool *missing,PetscInt *d)
726: {
727: Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
728: PetscInt *diag,i;
730: *missing = PETSC_FALSE;
731: if (A->rmap->n > 0 && !(a->colidx)) {
732: *missing = PETSC_TRUE;
733: if (d) *d = 0;
734: PetscInfo(A,"Matrix has no entries therefore is missing diagonal\n");
735: } else {
736: diag = a->diag;
737: for (i=0; i<A->rmap->n; i++) {
738: if (diag[i] == -1) {
739: *missing = PETSC_TRUE;
740: if (d) *d = i;
741: PetscInfo(A,"Matrix is missing diagonal number %" PetscInt_FMT "\n",i);
742: break;
743: }
744: }
745: }
746: return 0;
747: }
749: PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
750: {
751: Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
752: PetscInt i,j,m=A->rmap->n,shift;
754: if (!a->diag) {
755: PetscMalloc1(m,&a->diag);
756: PetscLogObjectMemory((PetscObject)A,m*sizeof(PetscInt));
757: a->free_diag = PETSC_TRUE;
758: }
759: for (i=0; i<m; i++) { /* loop over rows */
760: shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
761: a->diag[i] = -1;
762: for (j=0; j<a->rlen[i]; j++) {
763: if (a->colidx[shift+j*8] == i) {
764: a->diag[i] = shift+j*8;
765: break;
766: }
767: }
768: }
769: return 0;
770: }
772: /*
773: Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
774: */
775: PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A,PetscScalar omega,PetscScalar fshift)
776: {
777: Mat_SeqSELL *a=(Mat_SeqSELL*) A->data;
778: PetscInt i,*diag,m = A->rmap->n;
779: MatScalar *val = a->val;
780: PetscScalar *idiag,*mdiag;
782: if (a->idiagvalid) return 0;
783: MatMarkDiagonal_SeqSELL(A);
784: diag = a->diag;
785: if (!a->idiag) {
786: PetscMalloc3(m,&a->idiag,m,&a->mdiag,m,&a->ssor_work);
787: PetscLogObjectMemory((PetscObject)A, 3*m*sizeof(PetscScalar));
788: val = a->val;
789: }
790: mdiag = a->mdiag;
791: idiag = a->idiag;
793: if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
794: for (i=0; i<m; i++) {
795: mdiag[i] = val[diag[i]];
796: if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
797: if (PetscRealPart(fshift)) {
798: PetscInfo(A,"Zero diagonal on row %" PetscInt_FMT "\n",i);
799: A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
800: A->factorerror_zeropivot_value = 0.0;
801: A->factorerror_zeropivot_row = i;
802: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Zero diagonal on row %" PetscInt_FMT,i);
803: }
804: idiag[i] = 1.0/val[diag[i]];
805: }
806: PetscLogFlops(m);
807: } else {
808: for (i=0; i<m; i++) {
809: mdiag[i] = val[diag[i]];
810: idiag[i] = omega/(fshift + val[diag[i]]);
811: }
812: PetscLogFlops(2.0*m);
813: }
814: a->idiagvalid = PETSC_TRUE;
815: return 0;
816: }
818: PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
819: {
820: Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
822: PetscArrayzero(a->val,a->sliidx[a->totalslices]);
823: MatSeqSELLInvalidateDiagonal(A);
824: return 0;
825: }
827: PetscErrorCode MatDestroy_SeqSELL(Mat A)
828: {
829: Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
831: #if defined(PETSC_USE_LOG)
832: PetscLogObjectState((PetscObject)A,"Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT,A->rmap->n,A->cmap->n,a->nz);
833: #endif
834: MatSeqXSELLFreeSELL(A,&a->val,&a->colidx);
835: ISDestroy(&a->row);
836: ISDestroy(&a->col);
837: PetscFree(a->diag);
838: PetscFree(a->rlen);
839: PetscFree(a->sliidx);
840: PetscFree3(a->idiag,a->mdiag,a->ssor_work);
841: PetscFree(a->solve_work);
842: ISDestroy(&a->icol);
843: PetscFree(a->saved_values);
844: PetscFree2(a->getrowcols,a->getrowvals);
846: PetscFree(A->data);
848: PetscObjectChangeTypeName((PetscObject)A,NULL);
849: PetscObjectComposeFunction((PetscObject)A,"MatStoreValues_C",NULL);
850: PetscObjectComposeFunction((PetscObject)A,"MatRetrieveValues_C",NULL);
851: PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLSetPreallocation_C",NULL);
852: return 0;
853: }
855: PetscErrorCode MatSetOption_SeqSELL(Mat A,MatOption op,PetscBool flg)
856: {
857: Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
859: switch (op) {
860: case MAT_ROW_ORIENTED:
861: a->roworiented = flg;
862: break;
863: case MAT_KEEP_NONZERO_PATTERN:
864: a->keepnonzeropattern = flg;
865: break;
866: case MAT_NEW_NONZERO_LOCATIONS:
867: a->nonew = (flg ? 0 : 1);
868: break;
869: case MAT_NEW_NONZERO_LOCATION_ERR:
870: a->nonew = (flg ? -1 : 0);
871: break;
872: case MAT_NEW_NONZERO_ALLOCATION_ERR:
873: a->nonew = (flg ? -2 : 0);
874: break;
875: case MAT_UNUSED_NONZERO_LOCATION_ERR:
876: a->nounused = (flg ? -1 : 0);
877: break;
878: case MAT_FORCE_DIAGONAL_ENTRIES:
879: case MAT_IGNORE_OFF_PROC_ENTRIES:
880: case MAT_USE_HASH_TABLE:
881: case MAT_SORTED_FULL:
882: PetscInfo(A,"Option %s ignored\n",MatOptions[op]);
883: break;
884: case MAT_SPD:
885: case MAT_SYMMETRIC:
886: case MAT_STRUCTURALLY_SYMMETRIC:
887: case MAT_HERMITIAN:
888: case MAT_SYMMETRY_ETERNAL:
889: /* These options are handled directly by MatSetOption() */
890: break;
891: default:
892: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unknown option %d",op);
893: }
894: return 0;
895: }
897: PetscErrorCode MatGetDiagonal_SeqSELL(Mat A,Vec v)
898: {
899: Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
900: PetscInt i,j,n,shift;
901: PetscScalar *x,zero=0.0;
903: VecGetLocalSize(v,&n);
906: if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
907: PetscInt *diag=a->diag;
908: VecGetArray(v,&x);
909: for (i=0; i<n; i++) x[i] = 1.0/a->val[diag[i]];
910: VecRestoreArray(v,&x);
911: return 0;
912: }
914: VecSet(v,zero);
915: VecGetArray(v,&x);
916: for (i=0; i<n; i++) { /* loop over rows */
917: shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
918: x[i] = 0;
919: for (j=0; j<a->rlen[i]; j++) {
920: if (a->colidx[shift+j*8] == i) {
921: x[i] = a->val[shift+j*8];
922: break;
923: }
924: }
925: }
926: VecRestoreArray(v,&x);
927: return 0;
928: }
930: PetscErrorCode MatDiagonalScale_SeqSELL(Mat A,Vec ll,Vec rr)
931: {
932: Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
933: const PetscScalar *l,*r;
934: PetscInt i,j,m,n,row;
936: if (ll) {
937: /* The local size is used so that VecMPI can be passed to this routine
938: by MatDiagonalScale_MPISELL */
939: VecGetLocalSize(ll,&m);
941: VecGetArrayRead(ll,&l);
942: for (i=0; i<a->totalslices; i++) { /* loop over slices */
943: if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
944: for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
945: if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8*i+row];
946: }
947: } else {
948: for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
949: a->val[j] *= l[8*i+row];
950: }
951: }
952: }
953: VecRestoreArrayRead(ll,&l);
954: PetscLogFlops(a->nz);
955: }
956: if (rr) {
957: VecGetLocalSize(rr,&n);
959: VecGetArrayRead(rr,&r);
960: for (i=0; i<a->totalslices; i++) { /* loop over slices */
961: if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
962: for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
963: if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]];
964: }
965: } else {
966: for (j=a->sliidx[i]; j<a->sliidx[i+1]; j++) {
967: a->val[j] *= r[a->colidx[j]];
968: }
969: }
970: }
971: VecRestoreArrayRead(rr,&r);
972: PetscLogFlops(a->nz);
973: }
974: MatSeqSELLInvalidateDiagonal(A);
975: return 0;
976: }
978: PetscErrorCode MatGetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],PetscScalar v[])
979: {
980: Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
981: PetscInt *cp,i,k,low,high,t,row,col,l;
982: PetscInt shift;
983: MatScalar *vp;
985: for (k=0; k<m; k++) { /* loop over requested rows */
986: row = im[k];
987: if (row<0) continue;
989: shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
990: cp = a->colidx+shift; /* pointer to the row */
991: vp = a->val+shift; /* pointer to the row */
992: for (l=0; l<n; l++) { /* loop over requested columns */
993: col = in[l];
994: if (col<0) continue;
996: high = a->rlen[row]; low = 0; /* assume unsorted */
997: while (high-low > 5) {
998: t = (low+high)/2;
999: if (*(cp+t*8) > col) high = t;
1000: else low = t;
1001: }
1002: for (i=low; i<high; i++) {
1003: if (*(cp+8*i) > col) break;
1004: if (*(cp+8*i) == col) {
1005: *v++ = *(vp+8*i);
1006: goto finished;
1007: }
1008: }
1009: *v++ = 0.0;
1010: finished:;
1011: }
1012: }
1013: return 0;
1014: }
1016: PetscErrorCode MatView_SeqSELL_ASCII(Mat A,PetscViewer viewer)
1017: {
1018: Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1019: PetscInt i,j,m=A->rmap->n,shift;
1020: const char *name;
1021: PetscViewerFormat format;
1023: PetscViewerGetFormat(viewer,&format);
1024: if (format == PETSC_VIEWER_ASCII_MATLAB) {
1025: PetscInt nofinalvalue = 0;
1026: /*
1027: if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1028: nofinalvalue = 1;
1029: }
1030: */
1031: PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);
1032: PetscViewerASCIIPrintf(viewer,"%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n",m,A->cmap->n);
1033: PetscViewerASCIIPrintf(viewer,"%% Nonzeros = %" PetscInt_FMT " \n",a->nz);
1034: #if defined(PETSC_USE_COMPLEX)
1035: PetscViewerASCIIPrintf(viewer,"zzz = zeros(%" PetscInt_FMT ",4);\n",a->nz+nofinalvalue);
1036: #else
1037: PetscViewerASCIIPrintf(viewer,"zzz = zeros(%" PetscInt_FMT ",3);\n",a->nz+nofinalvalue);
1038: #endif
1039: PetscViewerASCIIPrintf(viewer,"zzz = [\n");
1041: for (i=0; i<m; i++) {
1042: shift = a->sliidx[i>>3]+(i&0x07);
1043: for (j=0; j<a->rlen[i]; j++) {
1044: #if defined(PETSC_USE_COMPLEX)
1045: PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));
1046: #else
1047: PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)a->val[shift+8*j]);
1048: #endif
1049: }
1050: }
1051: /*
1052: if (nofinalvalue) {
1053: #if defined(PETSC_USE_COMPLEX)
1054: PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.);
1055: #else
1056: PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0);
1057: #endif
1058: }
1059: */
1060: PetscObjectGetName((PetscObject)A,&name);
1061: PetscViewerASCIIPrintf(viewer,"];\n %s = spconvert(zzz);\n",name);
1062: PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);
1063: } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1064: return 0;
1065: } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1066: PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);
1067: for (i=0; i<m; i++) {
1068: PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i);
1069: shift = a->sliidx[i>>3]+(i&0x07);
1070: for (j=0; j<a->rlen[i]; j++) {
1071: #if defined(PETSC_USE_COMPLEX)
1072: if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1073: PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));
1074: } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1075: PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]));
1076: } else if (PetscRealPart(a->val[shift+8*j]) != 0.0) {
1077: PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]));
1078: }
1079: #else
1080: if (a->val[shift+8*j] != 0.0) PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]);
1081: #endif
1082: }
1083: PetscViewerASCIIPrintf(viewer,"\n");
1084: }
1085: PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);
1086: } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1087: PetscInt cnt=0,jcnt;
1088: PetscScalar value;
1089: #if defined(PETSC_USE_COMPLEX)
1090: PetscBool realonly=PETSC_TRUE;
1091: for (i=0; i<a->sliidx[a->totalslices]; i++) {
1092: if (PetscImaginaryPart(a->val[i]) != 0.0) {
1093: realonly = PETSC_FALSE;
1094: break;
1095: }
1096: }
1097: #endif
1099: PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);
1100: for (i=0; i<m; i++) {
1101: jcnt = 0;
1102: shift = a->sliidx[i>>3]+(i&0x07);
1103: for (j=0; j<A->cmap->n; j++) {
1104: if (jcnt < a->rlen[i] && j == a->colidx[shift+8*j]) {
1105: value = a->val[cnt++];
1106: jcnt++;
1107: } else {
1108: value = 0.0;
1109: }
1110: #if defined(PETSC_USE_COMPLEX)
1111: if (realonly) {
1112: PetscViewerASCIIPrintf(viewer," %7.5e ",(double)PetscRealPart(value));
1113: } else {
1114: PetscViewerASCIIPrintf(viewer," %7.5e+%7.5e i ",(double)PetscRealPart(value),(double)PetscImaginaryPart(value));
1115: }
1116: #else
1117: PetscViewerASCIIPrintf(viewer," %7.5e ",(double)value);
1118: #endif
1119: }
1120: PetscViewerASCIIPrintf(viewer,"\n");
1121: }
1122: PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);
1123: } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1124: PetscInt fshift=1;
1125: PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);
1126: #if defined(PETSC_USE_COMPLEX)
1127: PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate complex general\n");
1128: #else
1129: PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate real general\n");
1130: #endif
1131: PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz);
1132: for (i=0; i<m; i++) {
1133: shift = a->sliidx[i>>3]+(i&0x07);
1134: for (j=0; j<a->rlen[i]; j++) {
1135: #if defined(PETSC_USE_COMPLEX)
1136: PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));
1137: #else
1138: PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)a->val[shift+8*j]);
1139: #endif
1140: }
1141: }
1142: PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);
1143: } else if (format == PETSC_VIEWER_NATIVE) {
1144: for (i=0; i<a->totalslices; i++) { /* loop over slices */
1145: PetscInt row;
1146: PetscViewerASCIIPrintf(viewer,"slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n",i,a->sliidx[i],a->sliidx[i+1]);
1147: for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
1148: #if defined(PETSC_USE_COMPLEX)
1149: if (PetscImaginaryPart(a->val[j]) > 0.0) {
1150: PetscViewerASCIIPrintf(viewer," %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));
1151: } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1152: PetscViewerASCIIPrintf(viewer," %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),-(double)PetscImaginaryPart(a->val[j]));
1153: } else {
1154: PetscViewerASCIIPrintf(viewer," %" PetscInt_FMT " %" PetscInt_FMT " %g\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]));
1155: }
1156: #else
1157: PetscViewerASCIIPrintf(viewer," %" PetscInt_FMT " %" PetscInt_FMT " %g\n",8*i+row,a->colidx[j],(double)a->val[j]);
1158: #endif
1159: }
1160: }
1161: } else {
1162: PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);
1163: if (A->factortype) {
1164: for (i=0; i<m; i++) {
1165: shift = a->sliidx[i>>3]+(i&0x07);
1166: PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i);
1167: /* L part */
1168: for (j=shift; j<a->diag[i]; j+=8) {
1169: #if defined(PETSC_USE_COMPLEX)
1170: if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0) {
1171: PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));
1172: } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0) {
1173: PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])));
1174: } else {
1175: PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]));
1176: }
1177: #else
1178: PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)a->val[j]);
1179: #endif
1180: }
1181: /* diagonal */
1182: j = a->diag[i];
1183: #if defined(PETSC_USE_COMPLEX)
1184: if (PetscImaginaryPart(a->val[j]) > 0.0) {
1185: PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)PetscImaginaryPart(1.0/a->val[j]));
1186: } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1187: PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)(-PetscImaginaryPart(1.0/a->val[j])));
1188: } else {
1189: PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]));
1190: }
1191: #else
1192: PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)(1.0/a->val[j]));
1193: #endif
1195: /* U part */
1196: for (j=a->diag[i]+1; j<shift+8*a->rlen[i]; j+=8) {
1197: #if defined(PETSC_USE_COMPLEX)
1198: if (PetscImaginaryPart(a->val[j]) > 0.0) {
1199: PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));
1200: } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1201: PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])));
1202: } else {
1203: PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]));
1204: }
1205: #else
1206: PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)a->val[j]);
1207: #endif
1208: }
1209: PetscViewerASCIIPrintf(viewer,"\n");
1210: }
1211: } else {
1212: for (i=0; i<m; i++) {
1213: shift = a->sliidx[i>>3]+(i&0x07);
1214: PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i);
1215: for (j=0; j<a->rlen[i]; j++) {
1216: #if defined(PETSC_USE_COMPLEX)
1217: if (PetscImaginaryPart(a->val[j]) > 0.0) {
1218: PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));
1219: } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1220: PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]));
1221: } else {
1222: PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]));
1223: }
1224: #else
1225: PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]);
1226: #endif
1227: }
1228: PetscViewerASCIIPrintf(viewer,"\n");
1229: }
1230: }
1231: PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);
1232: }
1233: PetscViewerFlush(viewer);
1234: return 0;
1235: }
1237: #include <petscdraw.h>
1238: PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw,void *Aa)
1239: {
1240: Mat A=(Mat)Aa;
1241: Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1242: PetscInt i,j,m=A->rmap->n,shift;
1243: int color;
1244: PetscReal xl,yl,xr,yr,x_l,x_r,y_l,y_r;
1245: PetscViewer viewer;
1246: PetscViewerFormat format;
1247: PetscErrorCode ierr;
1249: PetscObjectQuery((PetscObject)A,"Zoomviewer",(PetscObject*)&viewer);
1250: PetscViewerGetFormat(viewer,&format);
1251: PetscDrawGetCoordinates(draw,&xl,&yl,&xr,&yr);
1253: /* loop over matrix elements drawing boxes */
1255: if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1256: PetscDrawCollectiveBegin(draw);
1257: /* Blue for negative, Cyan for zero and Red for positive */
1258: color = PETSC_DRAW_BLUE;
1259: for (i=0; i<m; i++) {
1260: shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1261: y_l = m - i - 1.0; y_r = y_l + 1.0;
1262: for (j=0; j<a->rlen[i]; j++) {
1263: x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1264: if (PetscRealPart(a->val[shift+8*j]) >= 0.) continue;
1265: PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);
1266: }
1267: }
1268: color = PETSC_DRAW_CYAN;
1269: for (i=0; i<m; i++) {
1270: shift = a->sliidx[i>>3]+(i&0x07);
1271: y_l = m - i - 1.0; y_r = y_l + 1.0;
1272: for (j=0; j<a->rlen[i]; j++) {
1273: x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1274: if (a->val[shift+8*j] != 0.) continue;
1275: PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);
1276: }
1277: }
1278: color = PETSC_DRAW_RED;
1279: for (i=0; i<m; i++) {
1280: shift = a->sliidx[i>>3]+(i&0x07);
1281: y_l = m - i - 1.0; y_r = y_l + 1.0;
1282: for (j=0; j<a->rlen[i]; j++) {
1283: x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1284: if (PetscRealPart(a->val[shift+8*j]) <= 0.) continue;
1285: PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);
1286: }
1287: }
1288: PetscDrawCollectiveEnd(draw);
1289: } else {
1290: /* use contour shading to indicate magnitude of values */
1291: /* first determine max of all nonzero values */
1292: PetscReal minv=0.0,maxv=0.0;
1293: PetscInt count=0;
1294: PetscDraw popup;
1295: for (i=0; i<a->sliidx[a->totalslices]; i++) {
1296: if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1297: }
1298: if (minv >= maxv) maxv = minv + PETSC_SMALL;
1299: PetscDrawGetPopup(draw,&popup);
1300: PetscDrawScalePopup(popup,minv,maxv);
1302: PetscDrawCollectiveBegin(draw);
1303: for (i=0; i<m; i++) {
1304: shift = a->sliidx[i>>3]+(i&0x07);
1305: y_l = m - i - 1.0;
1306: y_r = y_l + 1.0;
1307: for (j=0; j<a->rlen[i]; j++) {
1308: x_l = a->colidx[shift+j*8];
1309: x_r = x_l + 1.0;
1310: color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]),minv,maxv);
1311: PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);
1312: count++;
1313: }
1314: }
1315: PetscDrawCollectiveEnd(draw);
1316: }
1317: return 0;
1318: }
1320: #include <petscdraw.h>
1321: PetscErrorCode MatView_SeqSELL_Draw(Mat A,PetscViewer viewer)
1322: {
1323: PetscDraw draw;
1324: PetscReal xr,yr,xl,yl,h,w;
1325: PetscBool isnull;
1327: PetscViewerDrawGetDraw(viewer,0,&draw);
1328: PetscDrawIsNull(draw,&isnull);
1329: if (isnull) return 0;
1331: xr = A->cmap->n; yr = A->rmap->n; h = yr/10.0; w = xr/10.0;
1332: xr += w; yr += h; xl = -w; yl = -h;
1333: PetscDrawSetCoordinates(draw,xl,yl,xr,yr);
1334: PetscObjectCompose((PetscObject)A,"Zoomviewer",(PetscObject)viewer);
1335: PetscDrawZoom(draw,MatView_SeqSELL_Draw_Zoom,A);
1336: PetscObjectCompose((PetscObject)A,"Zoomviewer",NULL);
1337: PetscDrawSave(draw);
1338: return 0;
1339: }
1341: PetscErrorCode MatView_SeqSELL(Mat A,PetscViewer viewer)
1342: {
1343: PetscBool iascii,isbinary,isdraw;
1345: PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);
1346: PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERBINARY,&isbinary);
1347: PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERDRAW,&isdraw);
1348: if (iascii) {
1349: MatView_SeqSELL_ASCII(A,viewer);
1350: } else if (isbinary) {
1351: /* MatView_SeqSELL_Binary(A,viewer); */
1352: } else if (isdraw) {
1353: MatView_SeqSELL_Draw(A,viewer);
1354: }
1355: return 0;
1356: }
1358: PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A,MatAssemblyType mode)
1359: {
1360: Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1361: PetscInt i,shift,row_in_slice,row,nrow,*cp,lastcol,j,k;
1362: MatScalar *vp;
1364: if (mode == MAT_FLUSH_ASSEMBLY) return 0;
1365: /* To do: compress out the unused elements */
1366: MatMarkDiagonal_SeqSELL(A);
1367: PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n",A->rmap->n,A->cmap->n,a->maxallocmat,a->sliidx[a->totalslices],a->nz,a->sliidx[a->totalslices]-a->nz);
1368: PetscInfo(A,"Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n",a->reallocs);
1369: PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rlenmax);
1370: /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1371: for (i=0; i<a->totalslices; ++i) {
1372: shift = a->sliidx[i]; /* starting index of the slice */
1373: cp = a->colidx+shift; /* pointer to the column indices of the slice */
1374: vp = a->val+shift; /* pointer to the nonzero values of the slice */
1375: for (row_in_slice=0; row_in_slice<8; ++row_in_slice) { /* loop over rows in the slice */
1376: row = 8*i + row_in_slice;
1377: nrow = a->rlen[row]; /* number of nonzeros in row */
1378: /*
1379: Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1380: But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1381: */
1382: lastcol = 0;
1383: if (nrow>0) { /* nonempty row */
1384: lastcol = cp[8*(nrow-1)+row_in_slice]; /* use the index from the last nonzero at current row */
1385: } else if (!row_in_slice) { /* first row of the currect slice is empty */
1386: for (j=1;j<8;j++) {
1387: if (a->rlen[8*i+j]) {
1388: lastcol = cp[j];
1389: break;
1390: }
1391: }
1392: } else {
1393: if (a->sliidx[i+1] != shift) lastcol = cp[row_in_slice-1]; /* use the index from the previous row */
1394: }
1396: for (k=nrow; k<(a->sliidx[i+1]-shift)/8; ++k) {
1397: cp[8*k+row_in_slice] = lastcol;
1398: vp[8*k+row_in_slice] = (MatScalar)0;
1399: }
1400: }
1401: }
1403: A->info.mallocs += a->reallocs;
1404: a->reallocs = 0;
1406: MatSeqSELLInvalidateDiagonal(A);
1407: return 0;
1408: }
1410: PetscErrorCode MatGetInfo_SeqSELL(Mat A,MatInfoType flag,MatInfo *info)
1411: {
1412: Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1414: info->block_size = 1.0;
1415: info->nz_allocated = a->maxallocmat;
1416: info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */
1417: info->nz_unneeded = (a->maxallocmat-a->sliidx[a->totalslices]);
1418: info->assemblies = A->num_ass;
1419: info->mallocs = A->info.mallocs;
1420: info->memory = ((PetscObject)A)->mem;
1421: if (A->factortype) {
1422: info->fill_ratio_given = A->info.fill_ratio_given;
1423: info->fill_ratio_needed = A->info.fill_ratio_needed;
1424: info->factor_mallocs = A->info.factor_mallocs;
1425: } else {
1426: info->fill_ratio_given = 0;
1427: info->fill_ratio_needed = 0;
1428: info->factor_mallocs = 0;
1429: }
1430: return 0;
1431: }
1433: PetscErrorCode MatSetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
1434: {
1435: Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1436: PetscInt shift,i,k,l,low,high,t,ii,row,col,nrow;
1437: PetscInt *cp,nonew=a->nonew,lastcol=-1;
1438: MatScalar *vp,value;
1440: for (k=0; k<m; k++) { /* loop over added rows */
1441: row = im[k];
1442: if (row < 0) continue;
1444: shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1445: cp = a->colidx+shift; /* pointer to the row */
1446: vp = a->val+shift; /* pointer to the row */
1447: nrow = a->rlen[row];
1448: low = 0;
1449: high = nrow;
1451: for (l=0; l<n; l++) { /* loop over added columns */
1452: col = in[l];
1453: if (col<0) continue;
1455: if (a->roworiented) {
1456: value = v[l+k*n];
1457: } else {
1458: value = v[k+l*m];
1459: }
1460: if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1462: /* search in this row for the specified column, i indicates the column to be set */
1463: if (col <= lastcol) low = 0;
1464: else high = nrow;
1465: lastcol = col;
1466: while (high-low > 5) {
1467: t = (low+high)/2;
1468: if (*(cp+t*8) > col) high = t;
1469: else low = t;
1470: }
1471: for (i=low; i<high; i++) {
1472: if (*(cp+i*8) > col) break;
1473: if (*(cp+i*8) == col) {
1474: if (is == ADD_VALUES) *(vp+i*8) += value;
1475: else *(vp+i*8) = value;
1476: low = i + 1;
1477: goto noinsert;
1478: }
1479: }
1480: if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1481: if (nonew == 1) goto noinsert;
1483: /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1484: MatSeqXSELLReallocateSELL(A,A->rmap->n,1,nrow,a->sliidx,row/8,row,col,a->colidx,a->val,cp,vp,nonew,MatScalar);
1485: /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1486: for (ii=nrow-1; ii>=i; ii--) {
1487: *(cp+(ii+1)*8) = *(cp+ii*8);
1488: *(vp+(ii+1)*8) = *(vp+ii*8);
1489: }
1490: a->rlen[row]++;
1491: *(cp+i*8) = col;
1492: *(vp+i*8) = value;
1493: a->nz++;
1494: A->nonzerostate++;
1495: low = i+1; high++; nrow++;
1496: noinsert:;
1497: }
1498: a->rlen[row] = nrow;
1499: }
1500: return 0;
1501: }
1503: PetscErrorCode MatCopy_SeqSELL(Mat A,Mat B,MatStructure str)
1504: {
1505: /* If the two matrices have the same copy implementation, use fast copy. */
1506: if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1507: Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1508: Mat_SeqSELL *b=(Mat_SeqSELL*)B->data;
1511: PetscArraycpy(b->val,a->val,a->sliidx[a->totalslices]);
1512: } else {
1513: MatCopy_Basic(A,B,str);
1514: }
1515: return 0;
1516: }
1518: PetscErrorCode MatSetUp_SeqSELL(Mat A)
1519: {
1520: MatSeqSELLSetPreallocation(A,PETSC_DEFAULT,NULL);
1521: return 0;
1522: }
1524: PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A,PetscScalar *array[])
1525: {
1526: Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1528: *array = a->val;
1529: return 0;
1530: }
1532: PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A,PetscScalar *array[])
1533: {
1534: return 0;
1535: }
1537: PetscErrorCode MatRealPart_SeqSELL(Mat A)
1538: {
1539: Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1540: PetscInt i;
1541: MatScalar *aval=a->val;
1543: for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i]=PetscRealPart(aval[i]);
1544: return 0;
1545: }
1547: PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1548: {
1549: Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1550: PetscInt i;
1551: MatScalar *aval=a->val;
1553: for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1554: MatSeqSELLInvalidateDiagonal(A);
1555: return 0;
1556: }
1558: PetscErrorCode MatScale_SeqSELL(Mat inA,PetscScalar alpha)
1559: {
1560: Mat_SeqSELL *a=(Mat_SeqSELL*)inA->data;
1561: MatScalar *aval=a->val;
1562: PetscScalar oalpha=alpha;
1563: PetscBLASInt one=1,size;
1565: PetscBLASIntCast(a->sliidx[a->totalslices],&size);
1566: PetscStackCallBLAS("BLASscal",BLASscal_(&size,&oalpha,aval,&one));
1567: PetscLogFlops(a->nz);
1568: MatSeqSELLInvalidateDiagonal(inA);
1569: return 0;
1570: }
1572: PetscErrorCode MatShift_SeqSELL(Mat Y,PetscScalar a)
1573: {
1574: Mat_SeqSELL *y=(Mat_SeqSELL*)Y->data;
1576: if (!Y->preallocated || !y->nz) {
1577: MatSeqSELLSetPreallocation(Y,1,NULL);
1578: }
1579: MatShift_Basic(Y,a);
1580: return 0;
1581: }
1583: PetscErrorCode MatSOR_SeqSELL(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1584: {
1585: Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1586: PetscScalar *x,sum,*t;
1587: const MatScalar *idiag=NULL,*mdiag;
1588: const PetscScalar *b,*xb;
1589: PetscInt n,m=A->rmap->n,i,j,shift;
1590: const PetscInt *diag;
1592: its = its*lits;
1594: if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1595: if (!a->idiagvalid) MatInvertDiagonal_SeqSELL(A,omega,fshift);
1596: a->fshift = fshift;
1597: a->omega = omega;
1599: diag = a->diag;
1600: t = a->ssor_work;
1601: idiag = a->idiag;
1602: mdiag = a->mdiag;
1604: VecGetArray(xx,&x);
1605: VecGetArrayRead(bb,&b);
1606: /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1611: if (flag & SOR_ZERO_INITIAL_GUESS) {
1612: if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1613: for (i=0; i<m; i++) {
1614: shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1615: sum = b[i];
1616: n = (diag[i]-shift)/8;
1617: for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1618: t[i] = sum;
1619: x[i] = sum*idiag[i];
1620: }
1621: xb = t;
1622: PetscLogFlops(a->nz);
1623: } else xb = b;
1624: if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1625: for (i=m-1; i>=0; i--) {
1626: shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1627: sum = xb[i];
1628: n = a->rlen[i]-(diag[i]-shift)/8-1;
1629: for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1630: if (xb == b) {
1631: x[i] = sum*idiag[i];
1632: } else {
1633: x[i] = (1.-omega)*x[i]+sum*idiag[i]; /* omega in idiag */
1634: }
1635: }
1636: PetscLogFlops(a->nz); /* assumes 1/2 in upper */
1637: }
1638: its--;
1639: }
1640: while (its--) {
1641: if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1642: for (i=0; i<m; i++) {
1643: /* lower */
1644: shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1645: sum = b[i];
1646: n = (diag[i]-shift)/8;
1647: for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1648: t[i] = sum; /* save application of the lower-triangular part */
1649: /* upper */
1650: n = a->rlen[i]-(diag[i]-shift)/8-1;
1651: for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1652: x[i] = (1.-omega)*x[i]+sum*idiag[i]; /* omega in idiag */
1653: }
1654: xb = t;
1655: PetscLogFlops(2.0*a->nz);
1656: } else xb = b;
1657: if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1658: for (i=m-1; i>=0; i--) {
1659: shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1660: sum = xb[i];
1661: if (xb == b) {
1662: /* whole matrix (no checkpointing available) */
1663: n = a->rlen[i];
1664: for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1665: x[i] = (1.-omega)*x[i]+(sum+mdiag[i]*x[i])*idiag[i];
1666: } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1667: n = a->rlen[i]-(diag[i]-shift)/8-1;
1668: for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1669: x[i] = (1.-omega)*x[i]+sum*idiag[i]; /* omega in idiag */
1670: }
1671: }
1672: if (xb == b) {
1673: PetscLogFlops(2.0*a->nz);
1674: } else {
1675: PetscLogFlops(a->nz); /* assumes 1/2 in upper */
1676: }
1677: }
1678: }
1679: VecRestoreArray(xx,&x);
1680: VecRestoreArrayRead(bb,&b);
1681: return 0;
1682: }
1684: /* -------------------------------------------------------------------*/
1685: static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1686: MatGetRow_SeqSELL,
1687: MatRestoreRow_SeqSELL,
1688: MatMult_SeqSELL,
1689: /* 4*/ MatMultAdd_SeqSELL,
1690: MatMultTranspose_SeqSELL,
1691: MatMultTransposeAdd_SeqSELL,
1692: NULL,
1693: NULL,
1694: NULL,
1695: /* 10*/ NULL,
1696: NULL,
1697: NULL,
1698: MatSOR_SeqSELL,
1699: NULL,
1700: /* 15*/ MatGetInfo_SeqSELL,
1701: MatEqual_SeqSELL,
1702: MatGetDiagonal_SeqSELL,
1703: MatDiagonalScale_SeqSELL,
1704: NULL,
1705: /* 20*/ NULL,
1706: MatAssemblyEnd_SeqSELL,
1707: MatSetOption_SeqSELL,
1708: MatZeroEntries_SeqSELL,
1709: /* 24*/ NULL,
1710: NULL,
1711: NULL,
1712: NULL,
1713: NULL,
1714: /* 29*/ MatSetUp_SeqSELL,
1715: NULL,
1716: NULL,
1717: NULL,
1718: NULL,
1719: /* 34*/ MatDuplicate_SeqSELL,
1720: NULL,
1721: NULL,
1722: NULL,
1723: NULL,
1724: /* 39*/ NULL,
1725: NULL,
1726: NULL,
1727: MatGetValues_SeqSELL,
1728: MatCopy_SeqSELL,
1729: /* 44*/ NULL,
1730: MatScale_SeqSELL,
1731: MatShift_SeqSELL,
1732: NULL,
1733: NULL,
1734: /* 49*/ NULL,
1735: NULL,
1736: NULL,
1737: NULL,
1738: NULL,
1739: /* 54*/ MatFDColoringCreate_SeqXAIJ,
1740: NULL,
1741: NULL,
1742: NULL,
1743: NULL,
1744: /* 59*/ NULL,
1745: MatDestroy_SeqSELL,
1746: MatView_SeqSELL,
1747: NULL,
1748: NULL,
1749: /* 64*/ NULL,
1750: NULL,
1751: NULL,
1752: NULL,
1753: NULL,
1754: /* 69*/ NULL,
1755: NULL,
1756: NULL,
1757: NULL,
1758: NULL,
1759: /* 74*/ NULL,
1760: MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1761: NULL,
1762: NULL,
1763: NULL,
1764: /* 79*/ NULL,
1765: NULL,
1766: NULL,
1767: NULL,
1768: NULL,
1769: /* 84*/ NULL,
1770: NULL,
1771: NULL,
1772: NULL,
1773: NULL,
1774: /* 89*/ NULL,
1775: NULL,
1776: NULL,
1777: NULL,
1778: NULL,
1779: /* 94*/ NULL,
1780: NULL,
1781: NULL,
1782: NULL,
1783: NULL,
1784: /* 99*/ NULL,
1785: NULL,
1786: NULL,
1787: MatConjugate_SeqSELL,
1788: NULL,
1789: /*104*/ NULL,
1790: NULL,
1791: NULL,
1792: NULL,
1793: NULL,
1794: /*109*/ NULL,
1795: NULL,
1796: NULL,
1797: NULL,
1798: MatMissingDiagonal_SeqSELL,
1799: /*114*/ NULL,
1800: NULL,
1801: NULL,
1802: NULL,
1803: NULL,
1804: /*119*/ NULL,
1805: NULL,
1806: NULL,
1807: NULL,
1808: NULL,
1809: /*124*/ NULL,
1810: NULL,
1811: NULL,
1812: NULL,
1813: NULL,
1814: /*129*/ NULL,
1815: NULL,
1816: NULL,
1817: NULL,
1818: NULL,
1819: /*134*/ NULL,
1820: NULL,
1821: NULL,
1822: NULL,
1823: NULL,
1824: /*139*/ NULL,
1825: NULL,
1826: NULL,
1827: MatFDColoringSetUp_SeqXAIJ,
1828: NULL,
1829: /*144*/ NULL,
1830: NULL,
1831: NULL,
1832: NULL
1833: };
1835: PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1836: {
1837: Mat_SeqSELL *a=(Mat_SeqSELL*)mat->data;
1841: /* allocate space for values if not already there */
1842: if (!a->saved_values) {
1843: PetscMalloc1(a->sliidx[a->totalslices]+1,&a->saved_values);
1844: PetscLogObjectMemory((PetscObject)mat,(a->sliidx[a->totalslices]+1)*sizeof(PetscScalar));
1845: }
1847: /* copy values over */
1848: PetscArraycpy(a->saved_values,a->val,a->sliidx[a->totalslices]);
1849: return 0;
1850: }
1852: PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1853: {
1854: Mat_SeqSELL *a=(Mat_SeqSELL*)mat->data;
1858: PetscArraycpy(a->val,a->saved_values,a->sliidx[a->totalslices]);
1859: return 0;
1860: }
1862: /*@C
1863: MatSeqSELLRestoreArray - returns access to the array where the data for a MATSEQSELL matrix is stored obtained by MatSeqSELLGetArray()
1865: Not Collective
1867: Input Parameters:
1868: . mat - a MATSEQSELL matrix
1869: . array - pointer to the data
1871: Level: intermediate
1873: .seealso: MatSeqSELLGetArray(), MatSeqSELLRestoreArrayF90()
1874: @*/
1875: PetscErrorCode MatSeqSELLRestoreArray(Mat A,PetscScalar **array)
1876: {
1877: PetscUseMethod(A,"MatSeqSELLRestoreArray_C",(Mat,PetscScalar**),(A,array));
1878: return 0;
1879: }
1881: PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
1882: {
1883: Mat_SeqSELL *b;
1884: PetscMPIInt size;
1886: PetscCitationsRegister(citation,&cited);
1887: MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);
1890: PetscNewLog(B,&b);
1892: B->data = (void*)b;
1894: PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps));
1896: b->row = NULL;
1897: b->col = NULL;
1898: b->icol = NULL;
1899: b->reallocs = 0;
1900: b->ignorezeroentries = PETSC_FALSE;
1901: b->roworiented = PETSC_TRUE;
1902: b->nonew = 0;
1903: b->diag = NULL;
1904: b->solve_work = NULL;
1905: B->spptr = NULL;
1906: b->saved_values = NULL;
1907: b->idiag = NULL;
1908: b->mdiag = NULL;
1909: b->ssor_work = NULL;
1910: b->omega = 1.0;
1911: b->fshift = 0.0;
1912: b->idiagvalid = PETSC_FALSE;
1913: b->keepnonzeropattern = PETSC_FALSE;
1915: PetscObjectChangeTypeName((PetscObject)B,MATSEQSELL);
1916: PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLGetArray_C",MatSeqSELLGetArray_SeqSELL);
1917: PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLRestoreArray_C",MatSeqSELLRestoreArray_SeqSELL);
1918: PetscObjectComposeFunction((PetscObject)B,"MatStoreValues_C",MatStoreValues_SeqSELL);
1919: PetscObjectComposeFunction((PetscObject)B,"MatRetrieveValues_C",MatRetrieveValues_SeqSELL);
1920: PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLSetPreallocation_C",MatSeqSELLSetPreallocation_SeqSELL);
1921: PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqsell_seqaij_C",MatConvert_SeqSELL_SeqAIJ);
1922: return 0;
1923: }
1925: /*
1926: Given a matrix generated with MatGetFactor() duplicates all the information in A into B
1927: */
1928: PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C,Mat A,MatDuplicateOption cpvalues,PetscBool mallocmatspace)
1929: {
1930: Mat_SeqSELL *c = (Mat_SeqSELL*)C->data,*a = (Mat_SeqSELL*)A->data;
1931: PetscInt i,m=A->rmap->n;
1932: PetscInt totalslices=a->totalslices;
1934: C->factortype = A->factortype;
1935: c->row = NULL;
1936: c->col = NULL;
1937: c->icol = NULL;
1938: c->reallocs = 0;
1939: C->assembled = PETSC_TRUE;
1941: PetscLayoutReference(A->rmap,&C->rmap);
1942: PetscLayoutReference(A->cmap,&C->cmap);
1944: PetscMalloc1(8*totalslices,&c->rlen);
1945: PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt));
1946: PetscMalloc1(totalslices+1,&c->sliidx);
1947: PetscLogObjectMemory((PetscObject)C, (totalslices+1)*sizeof(PetscInt));
1949: for (i=0; i<m; i++) c->rlen[i] = a->rlen[i];
1950: for (i=0; i<totalslices+1; i++) c->sliidx[i] = a->sliidx[i];
1952: /* allocate the matrix space */
1953: if (mallocmatspace) {
1954: PetscMalloc2(a->maxallocmat,&c->val,a->maxallocmat,&c->colidx);
1955: PetscLogObjectMemory((PetscObject)C,a->maxallocmat*(sizeof(PetscScalar)+sizeof(PetscInt)));
1957: c->singlemalloc = PETSC_TRUE;
1959: if (m > 0) {
1960: PetscArraycpy(c->colidx,a->colidx,a->maxallocmat);
1961: if (cpvalues == MAT_COPY_VALUES) {
1962: PetscArraycpy(c->val,a->val,a->maxallocmat);
1963: } else {
1964: PetscArrayzero(c->val,a->maxallocmat);
1965: }
1966: }
1967: }
1969: c->ignorezeroentries = a->ignorezeroentries;
1970: c->roworiented = a->roworiented;
1971: c->nonew = a->nonew;
1972: if (a->diag) {
1973: PetscMalloc1(m,&c->diag);
1974: PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt));
1975: for (i=0; i<m; i++) {
1976: c->diag[i] = a->diag[i];
1977: }
1978: } else c->diag = NULL;
1980: c->solve_work = NULL;
1981: c->saved_values = NULL;
1982: c->idiag = NULL;
1983: c->ssor_work = NULL;
1984: c->keepnonzeropattern = a->keepnonzeropattern;
1985: c->free_val = PETSC_TRUE;
1986: c->free_colidx = PETSC_TRUE;
1988: c->maxallocmat = a->maxallocmat;
1989: c->maxallocrow = a->maxallocrow;
1990: c->rlenmax = a->rlenmax;
1991: c->nz = a->nz;
1992: C->preallocated = PETSC_TRUE;
1994: c->nonzerorowcnt = a->nonzerorowcnt;
1995: C->nonzerostate = A->nonzerostate;
1997: PetscFunctionListDuplicate(((PetscObject)A)->qlist,&((PetscObject)C)->qlist);
1998: return 0;
1999: }
2001: PetscErrorCode MatDuplicate_SeqSELL(Mat A,MatDuplicateOption cpvalues,Mat *B)
2002: {
2003: MatCreate(PetscObjectComm((PetscObject)A),B);
2004: MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);
2005: if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) {
2006: MatSetBlockSizesFromMats(*B,A,A);
2007: }
2008: MatSetType(*B,((PetscObject)A)->type_name);
2009: MatDuplicateNoCreate_SeqSELL(*B,A,cpvalues,PETSC_TRUE);
2010: return 0;
2011: }
2013: /*MC
2014: MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2015: based on the sliced Ellpack format
2017: Options Database Keys:
2018: . -mat_type seqsell - sets the matrix type to "seqsell" during a call to MatSetFromOptions()
2020: Level: beginner
2022: .seealso: MatCreateSeqSell(), MATSELL, MATMPISELL, MATSEQAIJ, MATAIJ, MATMPIAIJ
2023: M*/
2025: /*MC
2026: MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.
2028: This matrix type is identical to MATSEQSELL when constructed with a single process communicator,
2029: and MATMPISELL otherwise. As a result, for single process communicators,
2030: MatSeqSELLSetPreallocation() is supported, and similarly MatMPISELLSetPreallocation() is supported
2031: for communicators controlling multiple processes. It is recommended that you call both of
2032: the above preallocation routines for simplicity.
2034: Options Database Keys:
2035: . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2037: Level: beginner
2039: Notes:
2040: This format is only supported for real scalars, double precision, and 32 bit indices (the defaults).
2042: It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2043: non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2045: Developer Notes:
2046: On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2048: The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2049: .vb
2050: (2 0 3 4)
2051: Consider the matrix A = (5 0 6 0)
2052: (0 0 7 8)
2053: (0 0 9 9)
2055: symbolically the Ellpack format can be written as
2057: (2 3 4 |) (0 2 3 |)
2058: v = (5 6 0 |) colidx = (0 2 2 |)
2059: -------- ---------
2060: (7 8 |) (2 3 |)
2061: (9 9 |) (2 3 |)
2063: The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2064: Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2065: zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2067: The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3)
2069: .ve
2071: See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance.
2073: References:
2074: . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512},
2075: Proceedings of the 47th International Conference on Parallel Processing, 2018.
2077: .seealso: MatCreateSeqSELL(), MatCreateSeqAIJ(), MatCreateSell(), MATSEQSELL, MATMPISELL, MATSEQAIJ, MATMPIAIJ, MATAIJ
2078: M*/
2080: /*@C
2081: MatCreateSeqSELL - Creates a sparse matrix in SELL format.
2083: Collective on comm
2085: Input Parameters:
2086: + comm - MPI communicator, set to PETSC_COMM_SELF
2087: . m - number of rows
2088: . n - number of columns
2089: . rlenmax - maximum number of nonzeros in a row
2090: - rlen - array containing the number of nonzeros in the various rows
2091: (possibly different for each row) or NULL
2093: Output Parameter:
2094: . A - the matrix
2096: It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
2097: MatXXXXSetPreallocation() paradigm instead of this routine directly.
2098: [MatXXXXSetPreallocation() is, for example, MatSeqSELLSetPreallocation]
2100: Notes:
2101: If nnz is given then nz is ignored
2103: Specify the preallocated storage with either rlenmax or rlen (not both).
2104: Set rlenmax=PETSC_DEFAULT and rlen=NULL for PETSc to control dynamic memory
2105: allocation. For large problems you MUST preallocate memory or you
2106: will get TERRIBLE performance, see the users' manual chapter on matrices.
2108: Level: intermediate
2110: .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatSeqSELLSetPreallocation(), MATSELL, MATSEQSELL, MATMPISELL
2112: @*/
2113: PetscErrorCode MatCreateSeqSELL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt maxallocrow,const PetscInt rlen[],Mat *A)
2114: {
2115: MatCreate(comm,A);
2116: MatSetSizes(*A,m,n,m,n);
2117: MatSetType(*A,MATSEQSELL);
2118: MatSeqSELLSetPreallocation_SeqSELL(*A,maxallocrow,rlen);
2119: return 0;
2120: }
2122: PetscErrorCode MatEqual_SeqSELL(Mat A,Mat B,PetscBool * flg)
2123: {
2124: Mat_SeqSELL *a=(Mat_SeqSELL*)A->data,*b=(Mat_SeqSELL*)B->data;
2125: PetscInt totalslices=a->totalslices;
2127: /* If the matrix dimensions are not equal,or no of nonzeros */
2128: if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) ||(a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2129: *flg = PETSC_FALSE;
2130: return 0;
2131: }
2132: /* if the a->colidx are the same */
2133: PetscArraycmp(a->colidx,b->colidx,a->sliidx[totalslices],flg);
2134: if (!*flg) return 0;
2135: /* if a->val are the same */
2136: PetscArraycmp(a->val,b->val,a->sliidx[totalslices],flg);
2137: return 0;
2138: }
2140: PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2141: {
2142: Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2144: a->idiagvalid = PETSC_FALSE;
2145: return 0;
2146: }
2148: PetscErrorCode MatConjugate_SeqSELL(Mat A)
2149: {
2150: #if defined(PETSC_USE_COMPLEX)
2151: Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2152: PetscInt i;
2153: PetscScalar *val = a->val;
2155: for (i=0; i<a->sliidx[a->totalslices]; i++) {
2156: val[i] = PetscConj(val[i]);
2157: }
2158: #else
2159: #endif
2160: return 0;
2161: }