Actual source code: sell.c


  2: /*
  3:   Defines the basic matrix operations for the SELL matrix storage format.
  4: */
  5: #include <../src/mat/impls/sell/seq/sell.h>
  6: #include <petscblaslapack.h>
  7: #include <petsc/private/kernels/blocktranspose.h>

  9: static PetscBool  cited = PETSC_FALSE;
 10: static const char citation[] =
 11:   "@inproceedings{ZhangELLPACK2018,\n"
 12:   " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
 13:   " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
 14:   " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
 15:   " year = 2018\n"
 16: "}\n";

 18: #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)

 20:   #include <immintrin.h>

 22:   #if !defined(_MM_SCALE_8)
 23:   #define _MM_SCALE_8    8
 24:   #endif

 26:   #if defined(__AVX512F__)
 27:   /* these do not work
 28:    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
 29:    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
 30:   */
 31:     #define AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
 32:     /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
 33:     vec_idx  = _mm256_loadu_si256((__m256i const*)acolidx); \
 34:     vec_vals = _mm512_loadu_pd(aval); \
 35:     vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); \
 36:     vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y)
 37:   #elif defined(__AVX2__) && defined(__FMA__)
 38:     #define AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
 39:     vec_vals = _mm256_loadu_pd(aval); \
 40:     vec_idx  = _mm_loadu_si128((__m128i const*)acolidx); /* SSE2 */ \
 41:     vec_x    = _mm256_i32gather_pd(x,vec_idx,_MM_SCALE_8); \
 42:     vec_y    = _mm256_fmadd_pd(vec_x,vec_vals,vec_y)
 43:   #endif
 44: #endif  /* PETSC_HAVE_IMMINTRIN_H */

 46: /*@C
 47:  MatSeqSELLSetPreallocation - For good matrix assembly performance
 48:  the user should preallocate the matrix storage by setting the parameter nz
 49:  (or the array nnz).  By setting these parameters accurately, performance
 50:  during matrix assembly can be increased significantly.

 52:  Collective

 54:  Input Parameters:
 55:  +  B - The matrix
 56:  .  nz - number of nonzeros per row (same for all rows)
 57:  -  nnz - array containing the number of nonzeros in the various rows
 58:  (possibly different for each row) or NULL

 60:  Notes:
 61:  If nnz is given then nz is ignored.

 63:  Specify the preallocated storage with either nz or nnz (not both).
 64:  Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
 65:  allocation.  For large problems you MUST preallocate memory or you
 66:  will get TERRIBLE performance, see the users' manual chapter on matrices.

 68:  You can call MatGetInfo() to get information on how effective the preallocation was;
 69:  for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
 70:  You can also run with the option -info and look for messages with the string
 71:  malloc in them to see if additional memory allocation was needed.

 73:  Developers: Use nz of MAT_SKIP_ALLOCATION to not allocate any space for the matrix
 74:  entries or columns indices.

 76:  The maximum number of nonzeos in any row should be as accurate as possible.
 77:  If it is underestimated, you will get bad performance due to reallocation
 78:  (MatSeqXSELLReallocateSELL).

 80:  Level: intermediate

 82:  .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatGetInfo()

 84:  @*/
 85: PetscErrorCode MatSeqSELLSetPreallocation(Mat B,PetscInt rlenmax,const PetscInt rlen[])
 86: {
 89:   PetscTryMethod(B,"MatSeqSELLSetPreallocation_C",(Mat,PetscInt,const PetscInt[]),(B,rlenmax,rlen));
 90:   return 0;
 91: }

 93: PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B,PetscInt maxallocrow,const PetscInt rlen[])
 94: {
 95:   Mat_SeqSELL    *b;
 96:   PetscInt       i,j,totalslices;
 97:   PetscBool      skipallocation=PETSC_FALSE,realalloc=PETSC_FALSE;

 99:   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
100:   if (maxallocrow == MAT_SKIP_ALLOCATION) {
101:     skipallocation = PETSC_TRUE;
102:     maxallocrow    = 0;
103:   }

105:   PetscLayoutSetUp(B->rmap);
106:   PetscLayoutSetUp(B->cmap);

108:   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
109:   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
111:   if (rlen) {
112:     for (i=0; i<B->rmap->n; i++) {
115:     }
116:   }

118:   B->preallocated = PETSC_TRUE;

120:   b = (Mat_SeqSELL*)B->data;

122:   totalslices = PetscCeilInt(B->rmap->n,8);
123:   b->totalslices = totalslices;
124:   if (!skipallocation) {
125:     if (B->rmap->n & 0x07) PetscInfo(B,"Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n",B->rmap->n);

127:     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
128:       PetscMalloc1(totalslices+1,&b->sliidx);
129:       PetscLogObjectMemory((PetscObject)B,(totalslices+1)*sizeof(PetscInt));
130:     }
131:     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
132:       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
133:       else if (maxallocrow < 0) maxallocrow = 1;
134:       for (i=0; i<=totalslices; i++) b->sliidx[i] = i*8*maxallocrow;
135:     } else {
136:       maxallocrow = 0;
137:       b->sliidx[0] = 0;
138:       for (i=1; i<totalslices; i++) {
139:         b->sliidx[i] = 0;
140:         for (j=0;j<8;j++) {
141:           b->sliidx[i] = PetscMax(b->sliidx[i],rlen[8*(i-1)+j]);
142:         }
143:         maxallocrow = PetscMax(b->sliidx[i],maxallocrow);
144:         PetscIntSumError(b->sliidx[i-1],8*b->sliidx[i],&b->sliidx[i]);
145:       }
146:       /* last slice */
147:       b->sliidx[totalslices] = 0;
148:       for (j=(totalslices-1)*8;j<B->rmap->n;j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices],rlen[j]);
149:       maxallocrow = PetscMax(b->sliidx[totalslices],maxallocrow);
150:       b->sliidx[totalslices] = b->sliidx[totalslices-1] + 8*b->sliidx[totalslices];
151:     }

153:     /* allocate space for val, colidx, rlen */
154:     /* FIXME: should B's old memory be unlogged? */
155:     MatSeqXSELLFreeSELL(B,&b->val,&b->colidx);
156:     /* FIXME: assuming an element of the bit array takes 8 bits */
157:     PetscMalloc2(b->sliidx[totalslices],&b->val,b->sliidx[totalslices],&b->colidx);
158:     PetscLogObjectMemory((PetscObject)B,b->sliidx[totalslices]*(sizeof(PetscScalar)+sizeof(PetscInt)));
159:     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
160:     PetscCalloc1(8*totalslices,&b->rlen);
161:     PetscLogObjectMemory((PetscObject)B,8*totalslices*sizeof(PetscInt));

163:     b->singlemalloc = PETSC_TRUE;
164:     b->free_val     = PETSC_TRUE;
165:     b->free_colidx  = PETSC_TRUE;
166:   } else {
167:     b->free_val    = PETSC_FALSE;
168:     b->free_colidx = PETSC_FALSE;
169:   }

171:   b->nz               = 0;
172:   b->maxallocrow      = maxallocrow;
173:   b->rlenmax          = maxallocrow;
174:   b->maxallocmat      = b->sliidx[totalslices];
175:   B->info.nz_unneeded = (double)b->maxallocmat;
176:   if (realalloc) {
177:     MatSetOption(B,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);
178:   }
179:   return 0;
180: }

182: PetscErrorCode MatGetRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
183: {
184:   Mat_SeqSELL *a = (Mat_SeqSELL*)A->data;
185:   PetscInt    shift;

188:   if (nz) *nz = a->rlen[row];
189:   shift = a->sliidx[row>>3]+(row&0x07);
190:   if (!a->getrowcols) {

192:     PetscMalloc2(a->rlenmax,&a->getrowcols,a->rlenmax,&a->getrowvals);
193:   }
194:   if (idx) {
195:     PetscInt j;
196:     for (j=0; j<a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift+8*j];
197:     *idx = a->getrowcols;
198:   }
199:   if (v) {
200:     PetscInt j;
201:     for (j=0; j<a->rlen[row]; j++) a->getrowvals[j] = a->val[shift+8*j];
202:     *v = a->getrowvals;
203:   }
204:   return 0;
205: }

207: PetscErrorCode MatRestoreRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
208: {
209:   return 0;
210: }

212: PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype,MatReuse reuse,Mat *newmat)
213: {
214:   Mat            B;
215:   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
216:   PetscInt       i;

218:   if (reuse == MAT_REUSE_MATRIX) {
219:     B    = *newmat;
220:     MatZeroEntries(B);
221:   } else {
222:     MatCreate(PetscObjectComm((PetscObject)A),&B);
223:     MatSetSizes(B,A->rmap->n,A->cmap->n,A->rmap->N,A->cmap->N);
224:     MatSetType(B,MATSEQAIJ);
225:     MatSeqAIJSetPreallocation(B,0,a->rlen);
226:   }

228:   for (i=0; i<A->rmap->n; i++) {
229:     PetscInt    nz = 0,*cols = NULL;
230:     PetscScalar *vals = NULL;

232:     MatGetRow_SeqSELL(A,i,&nz,&cols,&vals);
233:     MatSetValues(B,1,&i,nz,cols,vals,INSERT_VALUES);
234:     MatRestoreRow_SeqSELL(A,i,&nz,&cols,&vals);
235:   }

237:   MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);
238:   MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);
239:   B->rmap->bs = A->rmap->bs;

241:   if (reuse == MAT_INPLACE_MATRIX) {
242:     MatHeaderReplace(A,&B);
243:   } else {
244:     *newmat = B;
245:   }
246:   return 0;
247: }

249: #include <../src/mat/impls/aij/seq/aij.h>

251: PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A,MatType newtype,MatReuse reuse,Mat *newmat)
252: {
253:   Mat               B;
254:   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
255:   PetscInt          *ai=a->i,m=A->rmap->N,n=A->cmap->N,i,*rowlengths,row,ncols;
256:   const PetscInt    *cols;
257:   const PetscScalar *vals;


260:   if (reuse == MAT_REUSE_MATRIX) {
261:     B = *newmat;
262:   } else {
263:     if (PetscDefined(USE_DEBUG) || !a->ilen) {
264:       PetscMalloc1(m,&rowlengths);
265:       for (i=0; i<m; i++) {
266:         rowlengths[i] = ai[i+1] - ai[i];
267:       }
268:     }
269:     if (PetscDefined(USE_DEBUG) && a->ilen) {
270:       PetscBool eq;
271:       PetscMemcmp(rowlengths,a->ilen,m*sizeof(PetscInt),&eq);
273:       PetscFree(rowlengths);
274:       rowlengths = a->ilen;
275:     } else if (a->ilen) rowlengths = a->ilen;
276:     MatCreate(PetscObjectComm((PetscObject)A),&B);
277:     MatSetSizes(B,m,n,m,n);
278:     MatSetType(B,MATSEQSELL);
279:     MatSeqSELLSetPreallocation(B,0,rowlengths);
280:     if (rowlengths != a->ilen) PetscFree(rowlengths);
281:   }

283:   for (row=0; row<m; row++) {
284:     MatGetRow_SeqAIJ(A,row,&ncols,(PetscInt**)&cols,(PetscScalar**)&vals);
285:     MatSetValues_SeqSELL(B,1,&row,ncols,cols,vals,INSERT_VALUES);
286:     MatRestoreRow_SeqAIJ(A,row,&ncols,(PetscInt**)&cols,(PetscScalar**)&vals);
287:   }
288:   MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);
289:   MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);
290:   B->rmap->bs = A->rmap->bs;

292:   if (reuse == MAT_INPLACE_MATRIX) {
293:     MatHeaderReplace(A,&B);
294:   } else {
295:     *newmat = B;
296:   }
297:   return 0;
298: }

300: PetscErrorCode MatMult_SeqSELL(Mat A,Vec xx,Vec yy)
301: {
302:   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
303:   PetscScalar       *y;
304:   const PetscScalar *x;
305:   const MatScalar   *aval=a->val;
306:   PetscInt          totalslices=a->totalslices;
307:   const PetscInt    *acolidx=a->colidx;
308:   PetscInt          i,j;
309: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
310:   __m512d           vec_x,vec_y,vec_vals;
311:   __m256i           vec_idx;
312:   __mmask8          mask;
313:   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
314:   __m256i           vec_idx2,vec_idx3,vec_idx4;
315: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
316:   __m128i           vec_idx;
317:   __m256d           vec_x,vec_y,vec_y2,vec_vals;
318:   MatScalar         yval;
319:   PetscInt          r,rows_left,row,nnz_in_row;
320: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
321:   __m128d           vec_x_tmp;
322:   __m256d           vec_x,vec_y,vec_y2,vec_vals;
323:   MatScalar         yval;
324:   PetscInt          r,rows_left,row,nnz_in_row;
325: #else
326:   PetscScalar       sum[8];
327: #endif

329: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
330: #pragma disjoint(*x,*y,*aval)
331: #endif

333:   VecGetArrayRead(xx,&x);
334:   VecGetArray(yy,&y);
335: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
336:   for (i=0; i<totalslices; i++) { /* loop over slices */
337:     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
338:     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);

340:     vec_y  = _mm512_setzero_pd();
341:     vec_y2 = _mm512_setzero_pd();
342:     vec_y3 = _mm512_setzero_pd();
343:     vec_y4 = _mm512_setzero_pd();

345:     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
346:     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
347:     case 3:
348:       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
349:       acolidx += 8; aval += 8;
350:       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
351:       acolidx += 8; aval += 8;
352:       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
353:       acolidx += 8; aval += 8;
354:       j += 3;
355:       break;
356:     case 2:
357:       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
358:       acolidx += 8; aval += 8;
359:       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
360:       acolidx += 8; aval += 8;
361:       j += 2;
362:       break;
363:     case 1:
364:       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
365:       acolidx += 8; aval += 8;
366:       j += 1;
367:       break;
368:     }
369:     #pragma novector
370:     for (; j<(a->sliidx[i+1]>>3); j+=4) {
371:       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
372:       acolidx += 8; aval += 8;
373:       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
374:       acolidx += 8; aval += 8;
375:       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
376:       acolidx += 8; aval += 8;
377:       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
378:       acolidx += 8; aval += 8;
379:     }

381:     vec_y = _mm512_add_pd(vec_y,vec_y2);
382:     vec_y = _mm512_add_pd(vec_y,vec_y3);
383:     vec_y = _mm512_add_pd(vec_y,vec_y4);
384:     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
385:       mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
386:       _mm512_mask_storeu_pd(&y[8*i],mask,vec_y);
387:     } else {
388:       _mm512_storeu_pd(&y[8*i],vec_y);
389:     }
390:   }
391: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
392:   for (i=0; i<totalslices; i++) { /* loop over full slices */
393:     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
394:     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);

396:     /* last slice may have padding rows. Don't use vectorization. */
397:     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
398:       rows_left = A->rmap->n - 8*i;
399:       for (r=0; r<rows_left; ++r) {
400:         yval = (MatScalar)0;
401:         row = 8*i + r;
402:         nnz_in_row = a->rlen[row];
403:         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
404:         y[row] = yval;
405:       }
406:       break;
407:     }

409:     vec_y  = _mm256_setzero_pd();
410:     vec_y2 = _mm256_setzero_pd();

412:     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
413:     #pragma novector
414:     #pragma unroll(2)
415:     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
416:       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
417:       aval += 4; acolidx += 4;
418:       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y2);
419:       aval += 4; acolidx += 4;
420:     }

422:     _mm256_storeu_pd(y+i*8,vec_y);
423:     _mm256_storeu_pd(y+i*8+4,vec_y2);
424:   }
425: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
426:   for (i=0; i<totalslices; i++) { /* loop over full slices */
427:     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
428:     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);

430:     vec_y  = _mm256_setzero_pd();
431:     vec_y2 = _mm256_setzero_pd();

433:     /* last slice may have padding rows. Don't use vectorization. */
434:     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
435:       rows_left = A->rmap->n - 8*i;
436:       for (r=0; r<rows_left; ++r) {
437:         yval = (MatScalar)0;
438:         row = 8*i + r;
439:         nnz_in_row = a->rlen[row];
440:         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j + r] * x[acolidx[8*j + r]];
441:         y[row] = yval;
442:       }
443:       break;
444:     }

446:     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
447:     #pragma novector
448:     #pragma unroll(2)
449:     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
450:       vec_vals  = _mm256_loadu_pd(aval);
451:       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
452:       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
453:       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
454:       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
455:       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
456:       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
457:       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
458:       aval     += 4;

460:       vec_vals  = _mm256_loadu_pd(aval);
461:       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
462:       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
463:       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
464:       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
465:       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
466:       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
467:       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
468:       aval     += 4;
469:     }

471:     _mm256_storeu_pd(y + i*8,     vec_y);
472:     _mm256_storeu_pd(y + i*8 + 4, vec_y2);
473:   }
474: #else
475:   for (i=0; i<totalslices; i++) { /* loop over slices */
476:     for (j=0; j<8; j++) sum[j] = 0.0;
477:     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
478:       sum[0] += aval[j] * x[acolidx[j]];
479:       sum[1] += aval[j+1] * x[acolidx[j+1]];
480:       sum[2] += aval[j+2] * x[acolidx[j+2]];
481:       sum[3] += aval[j+3] * x[acolidx[j+3]];
482:       sum[4] += aval[j+4] * x[acolidx[j+4]];
483:       sum[5] += aval[j+5] * x[acolidx[j+5]];
484:       sum[6] += aval[j+6] * x[acolidx[j+6]];
485:       sum[7] += aval[j+7] * x[acolidx[j+7]];
486:     }
487:     if (i == totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
488:       for (j=0; j<(A->rmap->n & 0x07); j++) y[8*i+j] = sum[j];
489:     } else {
490:       for (j=0; j<8; j++) y[8*i+j] = sum[j];
491:     }
492:   }
493: #endif

495:   PetscLogFlops(2.0*a->nz-a->nonzerorowcnt); /* theoretical minimal FLOPs */
496:   VecRestoreArrayRead(xx,&x);
497:   VecRestoreArray(yy,&y);
498:   return 0;
499: }

501: #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
502: PetscErrorCode MatMultAdd_SeqSELL(Mat A,Vec xx,Vec yy,Vec zz)
503: {
504:   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
505:   PetscScalar       *y,*z;
506:   const PetscScalar *x;
507:   const MatScalar   *aval=a->val;
508:   PetscInt          totalslices=a->totalslices;
509:   const PetscInt    *acolidx=a->colidx;
510:   PetscInt          i,j;
511: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
512:   __m512d           vec_x,vec_y,vec_vals;
513:   __m256i           vec_idx;
514:   __mmask8          mask;
515:   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
516:   __m256i           vec_idx2,vec_idx3,vec_idx4;
517: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
518:   __m128d           vec_x_tmp;
519:   __m256d           vec_x,vec_y,vec_y2,vec_vals;
520:   MatScalar         yval;
521:   PetscInt          r,row,nnz_in_row;
522: #else
523:   PetscScalar       sum[8];
524: #endif

526: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
527: #pragma disjoint(*x,*y,*aval)
528: #endif

530:   VecGetArrayRead(xx,&x);
531:   VecGetArrayPair(yy,zz,&y,&z);
532: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
533:   for (i=0; i<totalslices; i++) { /* loop over slices */
534:     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
535:     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);

537:     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
538:       mask   = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
539:       vec_y  = _mm512_mask_loadu_pd(vec_y,mask,&y[8*i]);
540:     } else {
541:       vec_y  = _mm512_loadu_pd(&y[8*i]);
542:     }
543:     vec_y2 = _mm512_setzero_pd();
544:     vec_y3 = _mm512_setzero_pd();
545:     vec_y4 = _mm512_setzero_pd();

547:     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
548:     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
549:     case 3:
550:       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
551:       acolidx += 8; aval += 8;
552:       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
553:       acolidx += 8; aval += 8;
554:       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
555:       acolidx += 8; aval += 8;
556:       j += 3;
557:       break;
558:     case 2:
559:       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
560:       acolidx += 8; aval += 8;
561:       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
562:       acolidx += 8; aval += 8;
563:       j += 2;
564:       break;
565:     case 1:
566:       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
567:       acolidx += 8; aval += 8;
568:       j += 1;
569:       break;
570:     }
571:     #pragma novector
572:     for (; j<(a->sliidx[i+1]>>3); j+=4) {
573:       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
574:       acolidx += 8; aval += 8;
575:       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
576:       acolidx += 8; aval += 8;
577:       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
578:       acolidx += 8; aval += 8;
579:       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
580:       acolidx += 8; aval += 8;
581:     }

583:     vec_y = _mm512_add_pd(vec_y,vec_y2);
584:     vec_y = _mm512_add_pd(vec_y,vec_y3);
585:     vec_y = _mm512_add_pd(vec_y,vec_y4);
586:     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
587:       _mm512_mask_storeu_pd(&z[8*i],mask,vec_y);
588:     } else {
589:       _mm512_storeu_pd(&z[8*i],vec_y);
590:     }
591:   }
592: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
593:   for (i=0; i<totalslices; i++) { /* loop over full slices */
594:     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
595:     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);

597:     /* last slice may have padding rows. Don't use vectorization. */
598:     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
599:       for (r=0; r<(A->rmap->n & 0x07); ++r) {
600:         row        = 8*i + r;
601:         yval       = (MatScalar)0.0;
602:         nnz_in_row = a->rlen[row];
603:         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
604:         z[row] = y[row] + yval;
605:       }
606:       break;
607:     }

609:     vec_y  = _mm256_loadu_pd(y+8*i);
610:     vec_y2 = _mm256_loadu_pd(y+8*i+4);

612:     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
613:     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
614:       vec_vals  = _mm256_loadu_pd(aval);
615:       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
616:       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
617:       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
618:       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
619:       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
620:       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
621:       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
622:       aval     += 4;

624:       vec_vals  = _mm256_loadu_pd(aval);
625:       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
626:       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
627:       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
628:       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
629:       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
630:       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
631:       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
632:       aval     += 4;
633:     }

635:     _mm256_storeu_pd(z+i*8,vec_y);
636:     _mm256_storeu_pd(z+i*8+4,vec_y2);
637:   }
638: #else
639:   for (i=0; i<totalslices; i++) { /* loop over slices */
640:     for (j=0; j<8; j++) sum[j] = 0.0;
641:     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
642:       sum[0] += aval[j] * x[acolidx[j]];
643:       sum[1] += aval[j+1] * x[acolidx[j+1]];
644:       sum[2] += aval[j+2] * x[acolidx[j+2]];
645:       sum[3] += aval[j+3] * x[acolidx[j+3]];
646:       sum[4] += aval[j+4] * x[acolidx[j+4]];
647:       sum[5] += aval[j+5] * x[acolidx[j+5]];
648:       sum[6] += aval[j+6] * x[acolidx[j+6]];
649:       sum[7] += aval[j+7] * x[acolidx[j+7]];
650:     }
651:     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
652:       for (j=0; j<(A->rmap->n & 0x07); j++) z[8*i+j] = y[8*i+j] + sum[j];
653:     } else {
654:       for (j=0; j<8; j++) z[8*i+j] = y[8*i+j] + sum[j];
655:     }
656:   }
657: #endif

659:   PetscLogFlops(2.0*a->nz);
660:   VecRestoreArrayRead(xx,&x);
661:   VecRestoreArrayPair(yy,zz,&y,&z);
662:   return 0;
663: }

665: PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A,Vec xx,Vec zz,Vec yy)
666: {
667:   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
668:   PetscScalar       *y;
669:   const PetscScalar *x;
670:   const MatScalar   *aval=a->val;
671:   const PetscInt    *acolidx=a->colidx;
672:   PetscInt          i,j,r,row,nnz_in_row,totalslices=a->totalslices;

674: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
675: #pragma disjoint(*x,*y,*aval)
676: #endif

678:   if (A->symmetric) {
679:     MatMultAdd_SeqSELL(A,xx,zz,yy);
680:     return 0;
681:   }
682:   if (zz != yy) VecCopy(zz,yy);
683:   VecGetArrayRead(xx,&x);
684:   VecGetArray(yy,&y);
685:   for (i=0; i<a->totalslices; i++) { /* loop over slices */
686:     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
687:       for (r=0; r<(A->rmap->n & 0x07); ++r) {
688:         row        = 8*i + r;
689:         nnz_in_row = a->rlen[row];
690:         for (j=0; j<nnz_in_row; ++j) y[acolidx[8*j+r]] += aval[8*j+r] * x[row];
691:       }
692:       break;
693:     }
694:     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
695:       y[acolidx[j]]   += aval[j] * x[8*i];
696:       y[acolidx[j+1]] += aval[j+1] * x[8*i+1];
697:       y[acolidx[j+2]] += aval[j+2] * x[8*i+2];
698:       y[acolidx[j+3]] += aval[j+3] * x[8*i+3];
699:       y[acolidx[j+4]] += aval[j+4] * x[8*i+4];
700:       y[acolidx[j+5]] += aval[j+5] * x[8*i+5];
701:       y[acolidx[j+6]] += aval[j+6] * x[8*i+6];
702:       y[acolidx[j+7]] += aval[j+7] * x[8*i+7];
703:     }
704:   }
705:   PetscLogFlops(2.0*a->sliidx[a->totalslices]);
706:   VecRestoreArrayRead(xx,&x);
707:   VecRestoreArray(yy,&y);
708:   return 0;
709: }

711: PetscErrorCode MatMultTranspose_SeqSELL(Mat A,Vec xx,Vec yy)
712: {
713:   if (A->symmetric) {
714:     MatMult_SeqSELL(A,xx,yy);
715:   } else {
716:     VecSet(yy,0.0);
717:     MatMultTransposeAdd_SeqSELL(A,xx,yy,yy);
718:   }
719:   return 0;
720: }

722: /*
723:      Checks for missing diagonals
724: */
725: PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A,PetscBool  *missing,PetscInt *d)
726: {
727:   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
728:   PetscInt       *diag,i;

730:   *missing = PETSC_FALSE;
731:   if (A->rmap->n > 0 && !(a->colidx)) {
732:     *missing = PETSC_TRUE;
733:     if (d) *d = 0;
734:     PetscInfo(A,"Matrix has no entries therefore is missing diagonal\n");
735:   } else {
736:     diag = a->diag;
737:     for (i=0; i<A->rmap->n; i++) {
738:       if (diag[i] == -1) {
739:         *missing = PETSC_TRUE;
740:         if (d) *d = i;
741:         PetscInfo(A,"Matrix is missing diagonal number %" PetscInt_FMT "\n",i);
742:         break;
743:       }
744:     }
745:   }
746:   return 0;
747: }

749: PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
750: {
751:   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
752:   PetscInt       i,j,m=A->rmap->n,shift;

754:   if (!a->diag) {
755:     PetscMalloc1(m,&a->diag);
756:     PetscLogObjectMemory((PetscObject)A,m*sizeof(PetscInt));
757:     a->free_diag = PETSC_TRUE;
758:   }
759:   for (i=0; i<m; i++) { /* loop over rows */
760:     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
761:     a->diag[i] = -1;
762:     for (j=0; j<a->rlen[i]; j++) {
763:       if (a->colidx[shift+j*8] == i) {
764:         a->diag[i] = shift+j*8;
765:         break;
766:       }
767:     }
768:   }
769:   return 0;
770: }

772: /*
773:   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
774: */
775: PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A,PetscScalar omega,PetscScalar fshift)
776: {
777:   Mat_SeqSELL    *a=(Mat_SeqSELL*) A->data;
778:   PetscInt       i,*diag,m = A->rmap->n;
779:   MatScalar      *val = a->val;
780:   PetscScalar    *idiag,*mdiag;

782:   if (a->idiagvalid) return 0;
783:   MatMarkDiagonal_SeqSELL(A);
784:   diag = a->diag;
785:   if (!a->idiag) {
786:     PetscMalloc3(m,&a->idiag,m,&a->mdiag,m,&a->ssor_work);
787:     PetscLogObjectMemory((PetscObject)A, 3*m*sizeof(PetscScalar));
788:     val  = a->val;
789:   }
790:   mdiag = a->mdiag;
791:   idiag = a->idiag;

793:   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
794:     for (i=0; i<m; i++) {
795:       mdiag[i] = val[diag[i]];
796:       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
797:         if (PetscRealPart(fshift)) {
798:           PetscInfo(A,"Zero diagonal on row %" PetscInt_FMT "\n",i);
799:           A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
800:           A->factorerror_zeropivot_value = 0.0;
801:           A->factorerror_zeropivot_row   = i;
802:         } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Zero diagonal on row %" PetscInt_FMT,i);
803:       }
804:       idiag[i] = 1.0/val[diag[i]];
805:     }
806:     PetscLogFlops(m);
807:   } else {
808:     for (i=0; i<m; i++) {
809:       mdiag[i] = val[diag[i]];
810:       idiag[i] = omega/(fshift + val[diag[i]]);
811:     }
812:     PetscLogFlops(2.0*m);
813:   }
814:   a->idiagvalid = PETSC_TRUE;
815:   return 0;
816: }

818: PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
819: {
820:   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;

822:   PetscArrayzero(a->val,a->sliidx[a->totalslices]);
823:   MatSeqSELLInvalidateDiagonal(A);
824:   return 0;
825: }

827: PetscErrorCode MatDestroy_SeqSELL(Mat A)
828: {
829:   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;

831: #if defined(PETSC_USE_LOG)
832:   PetscLogObjectState((PetscObject)A,"Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT,A->rmap->n,A->cmap->n,a->nz);
833: #endif
834:   MatSeqXSELLFreeSELL(A,&a->val,&a->colidx);
835:   ISDestroy(&a->row);
836:   ISDestroy(&a->col);
837:   PetscFree(a->diag);
838:   PetscFree(a->rlen);
839:   PetscFree(a->sliidx);
840:   PetscFree3(a->idiag,a->mdiag,a->ssor_work);
841:   PetscFree(a->solve_work);
842:   ISDestroy(&a->icol);
843:   PetscFree(a->saved_values);
844:   PetscFree2(a->getrowcols,a->getrowvals);

846:   PetscFree(A->data);

848:   PetscObjectChangeTypeName((PetscObject)A,NULL);
849:   PetscObjectComposeFunction((PetscObject)A,"MatStoreValues_C",NULL);
850:   PetscObjectComposeFunction((PetscObject)A,"MatRetrieveValues_C",NULL);
851:   PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLSetPreallocation_C",NULL);
852:   return 0;
853: }

855: PetscErrorCode MatSetOption_SeqSELL(Mat A,MatOption op,PetscBool flg)
856: {
857:   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;

859:   switch (op) {
860:   case MAT_ROW_ORIENTED:
861:     a->roworiented = flg;
862:     break;
863:   case MAT_KEEP_NONZERO_PATTERN:
864:     a->keepnonzeropattern = flg;
865:     break;
866:   case MAT_NEW_NONZERO_LOCATIONS:
867:     a->nonew = (flg ? 0 : 1);
868:     break;
869:   case MAT_NEW_NONZERO_LOCATION_ERR:
870:     a->nonew = (flg ? -1 : 0);
871:     break;
872:   case MAT_NEW_NONZERO_ALLOCATION_ERR:
873:     a->nonew = (flg ? -2 : 0);
874:     break;
875:   case MAT_UNUSED_NONZERO_LOCATION_ERR:
876:     a->nounused = (flg ? -1 : 0);
877:     break;
878:   case MAT_FORCE_DIAGONAL_ENTRIES:
879:   case MAT_IGNORE_OFF_PROC_ENTRIES:
880:   case MAT_USE_HASH_TABLE:
881:   case MAT_SORTED_FULL:
882:     PetscInfo(A,"Option %s ignored\n",MatOptions[op]);
883:     break;
884:   case MAT_SPD:
885:   case MAT_SYMMETRIC:
886:   case MAT_STRUCTURALLY_SYMMETRIC:
887:   case MAT_HERMITIAN:
888:   case MAT_SYMMETRY_ETERNAL:
889:     /* These options are handled directly by MatSetOption() */
890:     break;
891:   default:
892:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unknown option %d",op);
893:   }
894:   return 0;
895: }

897: PetscErrorCode MatGetDiagonal_SeqSELL(Mat A,Vec v)
898: {
899:   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
900:   PetscInt       i,j,n,shift;
901:   PetscScalar    *x,zero=0.0;

903:   VecGetLocalSize(v,&n);

906:   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
907:     PetscInt *diag=a->diag;
908:     VecGetArray(v,&x);
909:     for (i=0; i<n; i++) x[i] = 1.0/a->val[diag[i]];
910:     VecRestoreArray(v,&x);
911:     return 0;
912:   }

914:   VecSet(v,zero);
915:   VecGetArray(v,&x);
916:   for (i=0; i<n; i++) { /* loop over rows */
917:     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
918:     x[i] = 0;
919:     for (j=0; j<a->rlen[i]; j++) {
920:       if (a->colidx[shift+j*8] == i) {
921:         x[i] = a->val[shift+j*8];
922:         break;
923:       }
924:     }
925:   }
926:   VecRestoreArray(v,&x);
927:   return 0;
928: }

930: PetscErrorCode MatDiagonalScale_SeqSELL(Mat A,Vec ll,Vec rr)
931: {
932:   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
933:   const PetscScalar *l,*r;
934:   PetscInt          i,j,m,n,row;

936:   if (ll) {
937:     /* The local size is used so that VecMPI can be passed to this routine
938:        by MatDiagonalScale_MPISELL */
939:     VecGetLocalSize(ll,&m);
941:     VecGetArrayRead(ll,&l);
942:     for (i=0; i<a->totalslices; i++) { /* loop over slices */
943:       if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
944:         for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
945:           if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8*i+row];
946:         }
947:       } else {
948:         for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
949:           a->val[j] *= l[8*i+row];
950:         }
951:       }
952:     }
953:     VecRestoreArrayRead(ll,&l);
954:     PetscLogFlops(a->nz);
955:   }
956:   if (rr) {
957:     VecGetLocalSize(rr,&n);
959:     VecGetArrayRead(rr,&r);
960:     for (i=0; i<a->totalslices; i++) { /* loop over slices */
961:       if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
962:         for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
963:           if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]];
964:         }
965:       } else {
966:         for (j=a->sliidx[i]; j<a->sliidx[i+1]; j++) {
967:           a->val[j] *= r[a->colidx[j]];
968:         }
969:       }
970:     }
971:     VecRestoreArrayRead(rr,&r);
972:     PetscLogFlops(a->nz);
973:   }
974:   MatSeqSELLInvalidateDiagonal(A);
975:   return 0;
976: }

978: PetscErrorCode MatGetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],PetscScalar v[])
979: {
980:   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
981:   PetscInt    *cp,i,k,low,high,t,row,col,l;
982:   PetscInt    shift;
983:   MatScalar   *vp;

985:   for (k=0; k<m; k++) { /* loop over requested rows */
986:     row = im[k];
987:     if (row<0) continue;
989:     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
990:     cp = a->colidx+shift; /* pointer to the row */
991:     vp = a->val+shift; /* pointer to the row */
992:     for (l=0; l<n; l++) { /* loop over requested columns */
993:       col = in[l];
994:       if (col<0) continue;
996:       high = a->rlen[row]; low = 0; /* assume unsorted */
997:       while (high-low > 5) {
998:         t = (low+high)/2;
999:         if (*(cp+t*8) > col) high = t;
1000:         else low = t;
1001:       }
1002:       for (i=low; i<high; i++) {
1003:         if (*(cp+8*i) > col) break;
1004:         if (*(cp+8*i) == col) {
1005:           *v++ = *(vp+8*i);
1006:           goto finished;
1007:         }
1008:       }
1009:       *v++ = 0.0;
1010:     finished:;
1011:     }
1012:   }
1013:   return 0;
1014: }

1016: PetscErrorCode MatView_SeqSELL_ASCII(Mat A,PetscViewer viewer)
1017: {
1018:   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1019:   PetscInt          i,j,m=A->rmap->n,shift;
1020:   const char        *name;
1021:   PetscViewerFormat format;

1023:   PetscViewerGetFormat(viewer,&format);
1024:   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1025:     PetscInt nofinalvalue = 0;
1026:     /*
1027:     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1028:       nofinalvalue = 1;
1029:     }
1030:     */
1031:     PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);
1032:     PetscViewerASCIIPrintf(viewer,"%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n",m,A->cmap->n);
1033:     PetscViewerASCIIPrintf(viewer,"%% Nonzeros = %" PetscInt_FMT " \n",a->nz);
1034: #if defined(PETSC_USE_COMPLEX)
1035:     PetscViewerASCIIPrintf(viewer,"zzz = zeros(%" PetscInt_FMT ",4);\n",a->nz+nofinalvalue);
1036: #else
1037:     PetscViewerASCIIPrintf(viewer,"zzz = zeros(%" PetscInt_FMT ",3);\n",a->nz+nofinalvalue);
1038: #endif
1039:     PetscViewerASCIIPrintf(viewer,"zzz = [\n");

1041:     for (i=0; i<m; i++) {
1042:       shift = a->sliidx[i>>3]+(i&0x07);
1043:       for (j=0; j<a->rlen[i]; j++) {
1044: #if defined(PETSC_USE_COMPLEX)
1045:         PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));
1046: #else
1047:         PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)a->val[shift+8*j]);
1048: #endif
1049:       }
1050:     }
1051:     /*
1052:     if (nofinalvalue) {
1053: #if defined(PETSC_USE_COMPLEX)
1054:       PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",m,A->cmap->n,0.,0.);
1055: #else
1056:       PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",m,A->cmap->n,0.0);
1057: #endif
1058:     }
1059:     */
1060:     PetscObjectGetName((PetscObject)A,&name);
1061:     PetscViewerASCIIPrintf(viewer,"];\n %s = spconvert(zzz);\n",name);
1062:     PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);
1063:   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1064:     return 0;
1065:   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1066:     PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);
1067:     for (i=0; i<m; i++) {
1068:       PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i);
1069:       shift = a->sliidx[i>>3]+(i&0x07);
1070:       for (j=0; j<a->rlen[i]; j++) {
1071: #if defined(PETSC_USE_COMPLEX)
1072:         if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1073:           PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));
1074:         } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1075:           PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]));
1076:         } else if (PetscRealPart(a->val[shift+8*j]) != 0.0) {
1077:           PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]));
1078:         }
1079: #else
1080:         if (a->val[shift+8*j] != 0.0) PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]);
1081: #endif
1082:       }
1083:       PetscViewerASCIIPrintf(viewer,"\n");
1084:     }
1085:     PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);
1086:   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1087:     PetscInt    cnt=0,jcnt;
1088:     PetscScalar value;
1089: #if defined(PETSC_USE_COMPLEX)
1090:     PetscBool   realonly=PETSC_TRUE;
1091:     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1092:       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1093:         realonly = PETSC_FALSE;
1094:         break;
1095:       }
1096:     }
1097: #endif

1099:     PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);
1100:     for (i=0; i<m; i++) {
1101:       jcnt = 0;
1102:       shift = a->sliidx[i>>3]+(i&0x07);
1103:       for (j=0; j<A->cmap->n; j++) {
1104:         if (jcnt < a->rlen[i] && j == a->colidx[shift+8*j]) {
1105:           value = a->val[cnt++];
1106:           jcnt++;
1107:         } else {
1108:           value = 0.0;
1109:         }
1110: #if defined(PETSC_USE_COMPLEX)
1111:         if (realonly) {
1112:           PetscViewerASCIIPrintf(viewer," %7.5e ",(double)PetscRealPart(value));
1113:         } else {
1114:           PetscViewerASCIIPrintf(viewer," %7.5e+%7.5e i ",(double)PetscRealPart(value),(double)PetscImaginaryPart(value));
1115:         }
1116: #else
1117:         PetscViewerASCIIPrintf(viewer," %7.5e ",(double)value);
1118: #endif
1119:       }
1120:       PetscViewerASCIIPrintf(viewer,"\n");
1121:     }
1122:     PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);
1123:   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1124:     PetscInt fshift=1;
1125:     PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);
1126: #if defined(PETSC_USE_COMPLEX)
1127:     PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate complex general\n");
1128: #else
1129:     PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate real general\n");
1130: #endif
1131:     PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz);
1132:     for (i=0; i<m; i++) {
1133:       shift = a->sliidx[i>>3]+(i&0x07);
1134:       for (j=0; j<a->rlen[i]; j++) {
1135: #if defined(PETSC_USE_COMPLEX)
1136:         PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));
1137: #else
1138:         PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)a->val[shift+8*j]);
1139: #endif
1140:       }
1141:     }
1142:     PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);
1143:   } else if (format == PETSC_VIEWER_NATIVE) {
1144:     for (i=0; i<a->totalslices; i++) { /* loop over slices */
1145:       PetscInt row;
1146:       PetscViewerASCIIPrintf(viewer,"slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n",i,a->sliidx[i],a->sliidx[i+1]);
1147:       for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
1148: #if defined(PETSC_USE_COMPLEX)
1149:         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1150:           PetscViewerASCIIPrintf(viewer,"  %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));
1151:         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1152:           PetscViewerASCIIPrintf(viewer,"  %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),-(double)PetscImaginaryPart(a->val[j]));
1153:         } else {
1154:           PetscViewerASCIIPrintf(viewer,"  %" PetscInt_FMT " %" PetscInt_FMT " %g\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]));
1155:         }
1156: #else
1157:         PetscViewerASCIIPrintf(viewer,"  %" PetscInt_FMT " %" PetscInt_FMT " %g\n",8*i+row,a->colidx[j],(double)a->val[j]);
1158: #endif
1159:       }
1160:     }
1161:   } else {
1162:     PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);
1163:     if (A->factortype) {
1164:       for (i=0; i<m; i++) {
1165:         shift = a->sliidx[i>>3]+(i&0x07);
1166:         PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i);
1167:         /* L part */
1168:         for (j=shift; j<a->diag[i]; j+=8) {
1169: #if defined(PETSC_USE_COMPLEX)
1170:           if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0) {
1171:             PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));
1172:           } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0) {
1173:             PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])));
1174:           } else {
1175:             PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]));
1176:           }
1177: #else
1178:           PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)a->val[j]);
1179: #endif
1180:         }
1181:         /* diagonal */
1182:         j = a->diag[i];
1183: #if defined(PETSC_USE_COMPLEX)
1184:         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1185:           PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)PetscImaginaryPart(1.0/a->val[j]));
1186:         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1187:           PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)(-PetscImaginaryPart(1.0/a->val[j])));
1188:         } else {
1189:           PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]));
1190:         }
1191: #else
1192:         PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)(1.0/a->val[j]));
1193: #endif

1195:         /* U part */
1196:         for (j=a->diag[i]+1; j<shift+8*a->rlen[i]; j+=8) {
1197: #if defined(PETSC_USE_COMPLEX)
1198:           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1199:             PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));
1200:           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1201:             PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])));
1202:           } else {
1203:             PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]));
1204:           }
1205: #else
1206:           PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)a->val[j]);
1207: #endif
1208:         }
1209:         PetscViewerASCIIPrintf(viewer,"\n");
1210:       }
1211:     } else {
1212:       for (i=0; i<m; i++) {
1213:         shift = a->sliidx[i>>3]+(i&0x07);
1214:         PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i);
1215:         for (j=0; j<a->rlen[i]; j++) {
1216: #if defined(PETSC_USE_COMPLEX)
1217:           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1218:             PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));
1219:           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1220:             PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]));
1221:           } else {
1222:             PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]));
1223:           }
1224: #else
1225:           PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]);
1226: #endif
1227:         }
1228:         PetscViewerASCIIPrintf(viewer,"\n");
1229:       }
1230:     }
1231:     PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);
1232:   }
1233:   PetscViewerFlush(viewer);
1234:   return 0;
1235: }

1237: #include <petscdraw.h>
1238: PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw,void *Aa)
1239: {
1240:   Mat               A=(Mat)Aa;
1241:   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1242:   PetscInt          i,j,m=A->rmap->n,shift;
1243:   int               color;
1244:   PetscReal         xl,yl,xr,yr,x_l,x_r,y_l,y_r;
1245:   PetscViewer       viewer;
1246:   PetscViewerFormat format;
1247:   PetscErrorCode    ierr;

1249:   PetscObjectQuery((PetscObject)A,"Zoomviewer",(PetscObject*)&viewer);
1250:   PetscViewerGetFormat(viewer,&format);
1251:   PetscDrawGetCoordinates(draw,&xl,&yl,&xr,&yr);

1253:   /* loop over matrix elements drawing boxes */

1255:   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1256:     PetscDrawCollectiveBegin(draw);
1257:     /* Blue for negative, Cyan for zero and  Red for positive */
1258:     color = PETSC_DRAW_BLUE;
1259:     for (i=0; i<m; i++) {
1260:       shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1261:       y_l = m - i - 1.0; y_r = y_l + 1.0;
1262:       for (j=0; j<a->rlen[i]; j++) {
1263:         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1264:         if (PetscRealPart(a->val[shift+8*j]) >=  0.) continue;
1265:         PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);
1266:       }
1267:     }
1268:     color = PETSC_DRAW_CYAN;
1269:     for (i=0; i<m; i++) {
1270:       shift = a->sliidx[i>>3]+(i&0x07);
1271:       y_l = m - i - 1.0; y_r = y_l + 1.0;
1272:       for (j=0; j<a->rlen[i]; j++) {
1273:         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1274:         if (a->val[shift+8*j] !=  0.) continue;
1275:         PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);
1276:       }
1277:     }
1278:     color = PETSC_DRAW_RED;
1279:     for (i=0; i<m; i++) {
1280:       shift = a->sliidx[i>>3]+(i&0x07);
1281:       y_l = m - i - 1.0; y_r = y_l + 1.0;
1282:       for (j=0; j<a->rlen[i]; j++) {
1283:         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1284:         if (PetscRealPart(a->val[shift+8*j]) <=  0.) continue;
1285:         PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);
1286:       }
1287:     }
1288:     PetscDrawCollectiveEnd(draw);
1289:   } else {
1290:     /* use contour shading to indicate magnitude of values */
1291:     /* first determine max of all nonzero values */
1292:     PetscReal minv=0.0,maxv=0.0;
1293:     PetscInt  count=0;
1294:     PetscDraw popup;
1295:     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1296:       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1297:     }
1298:     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1299:     PetscDrawGetPopup(draw,&popup);
1300:     PetscDrawScalePopup(popup,minv,maxv);

1302:     PetscDrawCollectiveBegin(draw);
1303:     for (i=0; i<m; i++) {
1304:       shift = a->sliidx[i>>3]+(i&0x07);
1305:       y_l = m - i - 1.0;
1306:       y_r = y_l + 1.0;
1307:       for (j=0; j<a->rlen[i]; j++) {
1308:         x_l = a->colidx[shift+j*8];
1309:         x_r = x_l + 1.0;
1310:         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]),minv,maxv);
1311:         PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);
1312:         count++;
1313:       }
1314:     }
1315:     PetscDrawCollectiveEnd(draw);
1316:   }
1317:   return 0;
1318: }

1320: #include <petscdraw.h>
1321: PetscErrorCode MatView_SeqSELL_Draw(Mat A,PetscViewer viewer)
1322: {
1323:   PetscDraw      draw;
1324:   PetscReal      xr,yr,xl,yl,h,w;
1325:   PetscBool      isnull;

1327:   PetscViewerDrawGetDraw(viewer,0,&draw);
1328:   PetscDrawIsNull(draw,&isnull);
1329:   if (isnull) return 0;

1331:   xr   = A->cmap->n; yr  = A->rmap->n; h = yr/10.0; w = xr/10.0;
1332:   xr  += w;          yr += h;         xl = -w;     yl = -h;
1333:   PetscDrawSetCoordinates(draw,xl,yl,xr,yr);
1334:   PetscObjectCompose((PetscObject)A,"Zoomviewer",(PetscObject)viewer);
1335:   PetscDrawZoom(draw,MatView_SeqSELL_Draw_Zoom,A);
1336:   PetscObjectCompose((PetscObject)A,"Zoomviewer",NULL);
1337:   PetscDrawSave(draw);
1338:   return 0;
1339: }

1341: PetscErrorCode MatView_SeqSELL(Mat A,PetscViewer viewer)
1342: {
1343:   PetscBool      iascii,isbinary,isdraw;

1345:   PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);
1346:   PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERBINARY,&isbinary);
1347:   PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERDRAW,&isdraw);
1348:   if (iascii) {
1349:     MatView_SeqSELL_ASCII(A,viewer);
1350:   } else if (isbinary) {
1351:     /* MatView_SeqSELL_Binary(A,viewer); */
1352:   } else if (isdraw) {
1353:     MatView_SeqSELL_Draw(A,viewer);
1354:   }
1355:   return 0;
1356: }

1358: PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A,MatAssemblyType mode)
1359: {
1360:   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1361:   PetscInt       i,shift,row_in_slice,row,nrow,*cp,lastcol,j,k;
1362:   MatScalar      *vp;

1364:   if (mode == MAT_FLUSH_ASSEMBLY) return 0;
1365:   /* To do: compress out the unused elements */
1366:   MatMarkDiagonal_SeqSELL(A);
1367:   PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n",A->rmap->n,A->cmap->n,a->maxallocmat,a->sliidx[a->totalslices],a->nz,a->sliidx[a->totalslices]-a->nz);
1368:   PetscInfo(A,"Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n",a->reallocs);
1369:   PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rlenmax);
1370:   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1371:   for (i=0; i<a->totalslices; ++i) {
1372:     shift = a->sliidx[i];    /* starting index of the slice */
1373:     cp    = a->colidx+shift; /* pointer to the column indices of the slice */
1374:     vp    = a->val+shift;    /* pointer to the nonzero values of the slice */
1375:     for (row_in_slice=0; row_in_slice<8; ++row_in_slice) { /* loop over rows in the slice */
1376:       row  = 8*i + row_in_slice;
1377:       nrow = a->rlen[row]; /* number of nonzeros in row */
1378:       /*
1379:         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1380:         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1381:       */
1382:       lastcol = 0;
1383:       if (nrow>0) { /* nonempty row */
1384:         lastcol = cp[8*(nrow-1)+row_in_slice]; /* use the index from the last nonzero at current row */
1385:       } else if (!row_in_slice) { /* first row of the currect slice is empty */
1386:         for (j=1;j<8;j++) {
1387:           if (a->rlen[8*i+j]) {
1388:             lastcol = cp[j];
1389:             break;
1390:           }
1391:         }
1392:       } else {
1393:         if (a->sliidx[i+1] != shift) lastcol = cp[row_in_slice-1]; /* use the index from the previous row */
1394:       }

1396:       for (k=nrow; k<(a->sliidx[i+1]-shift)/8; ++k) {
1397:         cp[8*k+row_in_slice] = lastcol;
1398:         vp[8*k+row_in_slice] = (MatScalar)0;
1399:       }
1400:     }
1401:   }

1403:   A->info.mallocs += a->reallocs;
1404:   a->reallocs      = 0;

1406:   MatSeqSELLInvalidateDiagonal(A);
1407:   return 0;
1408: }

1410: PetscErrorCode MatGetInfo_SeqSELL(Mat A,MatInfoType flag,MatInfo *info)
1411: {
1412:   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;

1414:   info->block_size   = 1.0;
1415:   info->nz_allocated = a->maxallocmat;
1416:   info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
1417:   info->nz_unneeded  = (a->maxallocmat-a->sliidx[a->totalslices]);
1418:   info->assemblies   = A->num_ass;
1419:   info->mallocs      = A->info.mallocs;
1420:   info->memory       = ((PetscObject)A)->mem;
1421:   if (A->factortype) {
1422:     info->fill_ratio_given  = A->info.fill_ratio_given;
1423:     info->fill_ratio_needed = A->info.fill_ratio_needed;
1424:     info->factor_mallocs    = A->info.factor_mallocs;
1425:   } else {
1426:     info->fill_ratio_given  = 0;
1427:     info->fill_ratio_needed = 0;
1428:     info->factor_mallocs    = 0;
1429:   }
1430:   return 0;
1431: }

1433: PetscErrorCode MatSetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
1434: {
1435:   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1436:   PetscInt       shift,i,k,l,low,high,t,ii,row,col,nrow;
1437:   PetscInt       *cp,nonew=a->nonew,lastcol=-1;
1438:   MatScalar      *vp,value;

1440:   for (k=0; k<m; k++) { /* loop over added rows */
1441:     row = im[k];
1442:     if (row < 0) continue;
1444:     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1445:     cp    = a->colidx+shift; /* pointer to the row */
1446:     vp    = a->val+shift; /* pointer to the row */
1447:     nrow  = a->rlen[row];
1448:     low   = 0;
1449:     high  = nrow;

1451:     for (l=0; l<n; l++) { /* loop over added columns */
1452:       col = in[l];
1453:       if (col<0) continue;
1455:       if (a->roworiented) {
1456:         value = v[l+k*n];
1457:       } else {
1458:         value = v[k+l*m];
1459:       }
1460:       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;

1462:       /* search in this row for the specified column, i indicates the column to be set */
1463:       if (col <= lastcol) low = 0;
1464:       else high = nrow;
1465:       lastcol = col;
1466:       while (high-low > 5) {
1467:         t = (low+high)/2;
1468:         if (*(cp+t*8) > col) high = t;
1469:         else low = t;
1470:       }
1471:       for (i=low; i<high; i++) {
1472:         if (*(cp+i*8) > col) break;
1473:         if (*(cp+i*8) == col) {
1474:           if (is == ADD_VALUES) *(vp+i*8) += value;
1475:           else *(vp+i*8) = value;
1476:           low = i + 1;
1477:           goto noinsert;
1478:         }
1479:       }
1480:       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1481:       if (nonew == 1) goto noinsert;
1483:       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1484:       MatSeqXSELLReallocateSELL(A,A->rmap->n,1,nrow,a->sliidx,row/8,row,col,a->colidx,a->val,cp,vp,nonew,MatScalar);
1485:       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1486:       for (ii=nrow-1; ii>=i; ii--) {
1487:         *(cp+(ii+1)*8) = *(cp+ii*8);
1488:         *(vp+(ii+1)*8) = *(vp+ii*8);
1489:       }
1490:       a->rlen[row]++;
1491:       *(cp+i*8) = col;
1492:       *(vp+i*8) = value;
1493:       a->nz++;
1494:       A->nonzerostate++;
1495:       low = i+1; high++; nrow++;
1496: noinsert:;
1497:     }
1498:     a->rlen[row] = nrow;
1499:   }
1500:   return 0;
1501: }

1503: PetscErrorCode MatCopy_SeqSELL(Mat A,Mat B,MatStructure str)
1504: {
1505:   /* If the two matrices have the same copy implementation, use fast copy. */
1506:   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1507:     Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1508:     Mat_SeqSELL *b=(Mat_SeqSELL*)B->data;

1511:     PetscArraycpy(b->val,a->val,a->sliidx[a->totalslices]);
1512:   } else {
1513:     MatCopy_Basic(A,B,str);
1514:   }
1515:   return 0;
1516: }

1518: PetscErrorCode MatSetUp_SeqSELL(Mat A)
1519: {
1520:   MatSeqSELLSetPreallocation(A,PETSC_DEFAULT,NULL);
1521:   return 0;
1522: }

1524: PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A,PetscScalar *array[])
1525: {
1526:   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;

1528:   *array = a->val;
1529:   return 0;
1530: }

1532: PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A,PetscScalar *array[])
1533: {
1534:   return 0;
1535: }

1537: PetscErrorCode MatRealPart_SeqSELL(Mat A)
1538: {
1539:   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1540:   PetscInt    i;
1541:   MatScalar   *aval=a->val;

1543:   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i]=PetscRealPart(aval[i]);
1544:   return 0;
1545: }

1547: PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1548: {
1549:   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1550:   PetscInt       i;
1551:   MatScalar      *aval=a->val;

1553:   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1554:   MatSeqSELLInvalidateDiagonal(A);
1555:   return 0;
1556: }

1558: PetscErrorCode MatScale_SeqSELL(Mat inA,PetscScalar alpha)
1559: {
1560:   Mat_SeqSELL    *a=(Mat_SeqSELL*)inA->data;
1561:   MatScalar      *aval=a->val;
1562:   PetscScalar    oalpha=alpha;
1563:   PetscBLASInt   one=1,size;

1565:   PetscBLASIntCast(a->sliidx[a->totalslices],&size);
1566:   PetscStackCallBLAS("BLASscal",BLASscal_(&size,&oalpha,aval,&one));
1567:   PetscLogFlops(a->nz);
1568:   MatSeqSELLInvalidateDiagonal(inA);
1569:   return 0;
1570: }

1572: PetscErrorCode MatShift_SeqSELL(Mat Y,PetscScalar a)
1573: {
1574:   Mat_SeqSELL    *y=(Mat_SeqSELL*)Y->data;

1576:   if (!Y->preallocated || !y->nz) {
1577:     MatSeqSELLSetPreallocation(Y,1,NULL);
1578:   }
1579:   MatShift_Basic(Y,a);
1580:   return 0;
1581: }

1583: PetscErrorCode MatSOR_SeqSELL(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1584: {
1585:   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1586:   PetscScalar       *x,sum,*t;
1587:   const MatScalar   *idiag=NULL,*mdiag;
1588:   const PetscScalar *b,*xb;
1589:   PetscInt          n,m=A->rmap->n,i,j,shift;
1590:   const PetscInt    *diag;

1592:   its = its*lits;

1594:   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1595:   if (!a->idiagvalid) MatInvertDiagonal_SeqSELL(A,omega,fshift);
1596:   a->fshift = fshift;
1597:   a->omega  = omega;

1599:   diag  = a->diag;
1600:   t     = a->ssor_work;
1601:   idiag = a->idiag;
1602:   mdiag = a->mdiag;

1604:   VecGetArray(xx,&x);
1605:   VecGetArrayRead(bb,&b);
1606:   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */

1611:   if (flag & SOR_ZERO_INITIAL_GUESS) {
1612:     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1613:       for (i=0; i<m; i++) {
1614:         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1615:         sum   = b[i];
1616:         n     = (diag[i]-shift)/8;
1617:         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1618:         t[i]  = sum;
1619:         x[i]  = sum*idiag[i];
1620:       }
1621:       xb   = t;
1622:       PetscLogFlops(a->nz);
1623:     } else xb = b;
1624:     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1625:       for (i=m-1; i>=0; i--) {
1626:         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1627:         sum   = xb[i];
1628:         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1629:         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1630:         if (xb == b) {
1631:           x[i] = sum*idiag[i];
1632:         } else {
1633:           x[i] = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1634:         }
1635:       }
1636:       PetscLogFlops(a->nz); /* assumes 1/2 in upper */
1637:     }
1638:     its--;
1639:   }
1640:   while (its--) {
1641:     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1642:       for (i=0; i<m; i++) {
1643:         /* lower */
1644:         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1645:         sum   = b[i];
1646:         n     = (diag[i]-shift)/8;
1647:         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1648:         t[i]  = sum;             /* save application of the lower-triangular part */
1649:         /* upper */
1650:         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1651:         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1652:         x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1653:       }
1654:       xb   = t;
1655:       PetscLogFlops(2.0*a->nz);
1656:     } else xb = b;
1657:     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1658:       for (i=m-1; i>=0; i--) {
1659:         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1660:         sum = xb[i];
1661:         if (xb == b) {
1662:           /* whole matrix (no checkpointing available) */
1663:           n     = a->rlen[i];
1664:           for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1665:           x[i] = (1.-omega)*x[i]+(sum+mdiag[i]*x[i])*idiag[i];
1666:         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1667:           n     = a->rlen[i]-(diag[i]-shift)/8-1;
1668:           for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1669:           x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1670:         }
1671:       }
1672:       if (xb == b) {
1673:         PetscLogFlops(2.0*a->nz);
1674:       } else {
1675:         PetscLogFlops(a->nz); /* assumes 1/2 in upper */
1676:       }
1677:     }
1678:   }
1679:   VecRestoreArray(xx,&x);
1680:   VecRestoreArrayRead(bb,&b);
1681:   return 0;
1682: }

1684: /* -------------------------------------------------------------------*/
1685: static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1686:                                        MatGetRow_SeqSELL,
1687:                                        MatRestoreRow_SeqSELL,
1688:                                        MatMult_SeqSELL,
1689:                                /* 4*/  MatMultAdd_SeqSELL,
1690:                                        MatMultTranspose_SeqSELL,
1691:                                        MatMultTransposeAdd_SeqSELL,
1692:                                        NULL,
1693:                                        NULL,
1694:                                        NULL,
1695:                                /* 10*/ NULL,
1696:                                        NULL,
1697:                                        NULL,
1698:                                        MatSOR_SeqSELL,
1699:                                        NULL,
1700:                                /* 15*/ MatGetInfo_SeqSELL,
1701:                                        MatEqual_SeqSELL,
1702:                                        MatGetDiagonal_SeqSELL,
1703:                                        MatDiagonalScale_SeqSELL,
1704:                                        NULL,
1705:                                /* 20*/ NULL,
1706:                                        MatAssemblyEnd_SeqSELL,
1707:                                        MatSetOption_SeqSELL,
1708:                                        MatZeroEntries_SeqSELL,
1709:                                /* 24*/ NULL,
1710:                                        NULL,
1711:                                        NULL,
1712:                                        NULL,
1713:                                        NULL,
1714:                                /* 29*/ MatSetUp_SeqSELL,
1715:                                        NULL,
1716:                                        NULL,
1717:                                        NULL,
1718:                                        NULL,
1719:                                /* 34*/ MatDuplicate_SeqSELL,
1720:                                        NULL,
1721:                                        NULL,
1722:                                        NULL,
1723:                                        NULL,
1724:                                /* 39*/ NULL,
1725:                                        NULL,
1726:                                        NULL,
1727:                                        MatGetValues_SeqSELL,
1728:                                        MatCopy_SeqSELL,
1729:                                /* 44*/ NULL,
1730:                                        MatScale_SeqSELL,
1731:                                        MatShift_SeqSELL,
1732:                                        NULL,
1733:                                        NULL,
1734:                                /* 49*/ NULL,
1735:                                        NULL,
1736:                                        NULL,
1737:                                        NULL,
1738:                                        NULL,
1739:                                /* 54*/ MatFDColoringCreate_SeqXAIJ,
1740:                                        NULL,
1741:                                        NULL,
1742:                                        NULL,
1743:                                        NULL,
1744:                                /* 59*/ NULL,
1745:                                        MatDestroy_SeqSELL,
1746:                                        MatView_SeqSELL,
1747:                                        NULL,
1748:                                        NULL,
1749:                                /* 64*/ NULL,
1750:                                        NULL,
1751:                                        NULL,
1752:                                        NULL,
1753:                                        NULL,
1754:                                /* 69*/ NULL,
1755:                                        NULL,
1756:                                        NULL,
1757:                                        NULL,
1758:                                        NULL,
1759:                                /* 74*/ NULL,
1760:                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1761:                                        NULL,
1762:                                        NULL,
1763:                                        NULL,
1764:                                /* 79*/ NULL,
1765:                                        NULL,
1766:                                        NULL,
1767:                                        NULL,
1768:                                        NULL,
1769:                                /* 84*/ NULL,
1770:                                        NULL,
1771:                                        NULL,
1772:                                        NULL,
1773:                                        NULL,
1774:                                /* 89*/ NULL,
1775:                                        NULL,
1776:                                        NULL,
1777:                                        NULL,
1778:                                        NULL,
1779:                                /* 94*/ NULL,
1780:                                        NULL,
1781:                                        NULL,
1782:                                        NULL,
1783:                                        NULL,
1784:                                /* 99*/ NULL,
1785:                                        NULL,
1786:                                        NULL,
1787:                                        MatConjugate_SeqSELL,
1788:                                        NULL,
1789:                                /*104*/ NULL,
1790:                                        NULL,
1791:                                        NULL,
1792:                                        NULL,
1793:                                        NULL,
1794:                                /*109*/ NULL,
1795:                                        NULL,
1796:                                        NULL,
1797:                                        NULL,
1798:                                        MatMissingDiagonal_SeqSELL,
1799:                                /*114*/ NULL,
1800:                                        NULL,
1801:                                        NULL,
1802:                                        NULL,
1803:                                        NULL,
1804:                                /*119*/ NULL,
1805:                                        NULL,
1806:                                        NULL,
1807:                                        NULL,
1808:                                        NULL,
1809:                                /*124*/ NULL,
1810:                                        NULL,
1811:                                        NULL,
1812:                                        NULL,
1813:                                        NULL,
1814:                                /*129*/ NULL,
1815:                                        NULL,
1816:                                        NULL,
1817:                                        NULL,
1818:                                        NULL,
1819:                                /*134*/ NULL,
1820:                                        NULL,
1821:                                        NULL,
1822:                                        NULL,
1823:                                        NULL,
1824:                                /*139*/ NULL,
1825:                                        NULL,
1826:                                        NULL,
1827:                                        MatFDColoringSetUp_SeqXAIJ,
1828:                                        NULL,
1829:                                /*144*/ NULL,
1830:                                        NULL,
1831:                                        NULL,
1832:                                        NULL
1833: };

1835: PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1836: {
1837:   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;


1841:   /* allocate space for values if not already there */
1842:   if (!a->saved_values) {
1843:     PetscMalloc1(a->sliidx[a->totalslices]+1,&a->saved_values);
1844:     PetscLogObjectMemory((PetscObject)mat,(a->sliidx[a->totalslices]+1)*sizeof(PetscScalar));
1845:   }

1847:   /* copy values over */
1848:   PetscArraycpy(a->saved_values,a->val,a->sliidx[a->totalslices]);
1849:   return 0;
1850: }

1852: PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1853: {
1854:   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;

1858:   PetscArraycpy(a->val,a->saved_values,a->sliidx[a->totalslices]);
1859:   return 0;
1860: }

1862: /*@C
1863:  MatSeqSELLRestoreArray - returns access to the array where the data for a MATSEQSELL matrix is stored obtained by MatSeqSELLGetArray()

1865:  Not Collective

1867:  Input Parameters:
1868:  .  mat - a MATSEQSELL matrix
1869:  .  array - pointer to the data

1871:  Level: intermediate

1873:  .seealso: MatSeqSELLGetArray(), MatSeqSELLRestoreArrayF90()
1874:  @*/
1875: PetscErrorCode MatSeqSELLRestoreArray(Mat A,PetscScalar **array)
1876: {
1877:   PetscUseMethod(A,"MatSeqSELLRestoreArray_C",(Mat,PetscScalar**),(A,array));
1878:   return 0;
1879: }

1881: PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
1882: {
1883:   Mat_SeqSELL    *b;
1884:   PetscMPIInt    size;

1886:   PetscCitationsRegister(citation,&cited);
1887:   MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);

1890:   PetscNewLog(B,&b);

1892:   B->data = (void*)b;

1894:   PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps));

1896:   b->row                = NULL;
1897:   b->col                = NULL;
1898:   b->icol               = NULL;
1899:   b->reallocs           = 0;
1900:   b->ignorezeroentries  = PETSC_FALSE;
1901:   b->roworiented        = PETSC_TRUE;
1902:   b->nonew              = 0;
1903:   b->diag               = NULL;
1904:   b->solve_work         = NULL;
1905:   B->spptr              = NULL;
1906:   b->saved_values       = NULL;
1907:   b->idiag              = NULL;
1908:   b->mdiag              = NULL;
1909:   b->ssor_work          = NULL;
1910:   b->omega              = 1.0;
1911:   b->fshift             = 0.0;
1912:   b->idiagvalid         = PETSC_FALSE;
1913:   b->keepnonzeropattern = PETSC_FALSE;

1915:   PetscObjectChangeTypeName((PetscObject)B,MATSEQSELL);
1916:   PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLGetArray_C",MatSeqSELLGetArray_SeqSELL);
1917:   PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLRestoreArray_C",MatSeqSELLRestoreArray_SeqSELL);
1918:   PetscObjectComposeFunction((PetscObject)B,"MatStoreValues_C",MatStoreValues_SeqSELL);
1919:   PetscObjectComposeFunction((PetscObject)B,"MatRetrieveValues_C",MatRetrieveValues_SeqSELL);
1920:   PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLSetPreallocation_C",MatSeqSELLSetPreallocation_SeqSELL);
1921:   PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqsell_seqaij_C",MatConvert_SeqSELL_SeqAIJ);
1922:   return 0;
1923: }

1925: /*
1926:  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
1927:  */
1928: PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C,Mat A,MatDuplicateOption cpvalues,PetscBool mallocmatspace)
1929: {
1930:   Mat_SeqSELL    *c = (Mat_SeqSELL*)C->data,*a = (Mat_SeqSELL*)A->data;
1931:   PetscInt       i,m=A->rmap->n;
1932:   PetscInt       totalslices=a->totalslices;

1934:   C->factortype = A->factortype;
1935:   c->row        = NULL;
1936:   c->col        = NULL;
1937:   c->icol       = NULL;
1938:   c->reallocs   = 0;
1939:   C->assembled = PETSC_TRUE;

1941:   PetscLayoutReference(A->rmap,&C->rmap);
1942:   PetscLayoutReference(A->cmap,&C->cmap);

1944:   PetscMalloc1(8*totalslices,&c->rlen);
1945:   PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt));
1946:   PetscMalloc1(totalslices+1,&c->sliidx);
1947:   PetscLogObjectMemory((PetscObject)C, (totalslices+1)*sizeof(PetscInt));

1949:   for (i=0; i<m; i++) c->rlen[i] = a->rlen[i];
1950:   for (i=0; i<totalslices+1; i++) c->sliidx[i] = a->sliidx[i];

1952:   /* allocate the matrix space */
1953:   if (mallocmatspace) {
1954:     PetscMalloc2(a->maxallocmat,&c->val,a->maxallocmat,&c->colidx);
1955:     PetscLogObjectMemory((PetscObject)C,a->maxallocmat*(sizeof(PetscScalar)+sizeof(PetscInt)));

1957:     c->singlemalloc = PETSC_TRUE;

1959:     if (m > 0) {
1960:       PetscArraycpy(c->colidx,a->colidx,a->maxallocmat);
1961:       if (cpvalues == MAT_COPY_VALUES) {
1962:         PetscArraycpy(c->val,a->val,a->maxallocmat);
1963:       } else {
1964:         PetscArrayzero(c->val,a->maxallocmat);
1965:       }
1966:     }
1967:   }

1969:   c->ignorezeroentries = a->ignorezeroentries;
1970:   c->roworiented       = a->roworiented;
1971:   c->nonew             = a->nonew;
1972:   if (a->diag) {
1973:     PetscMalloc1(m,&c->diag);
1974:     PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt));
1975:     for (i=0; i<m; i++) {
1976:       c->diag[i] = a->diag[i];
1977:     }
1978:   } else c->diag = NULL;

1980:   c->solve_work         = NULL;
1981:   c->saved_values       = NULL;
1982:   c->idiag              = NULL;
1983:   c->ssor_work          = NULL;
1984:   c->keepnonzeropattern = a->keepnonzeropattern;
1985:   c->free_val           = PETSC_TRUE;
1986:   c->free_colidx        = PETSC_TRUE;

1988:   c->maxallocmat  = a->maxallocmat;
1989:   c->maxallocrow  = a->maxallocrow;
1990:   c->rlenmax      = a->rlenmax;
1991:   c->nz           = a->nz;
1992:   C->preallocated = PETSC_TRUE;

1994:   c->nonzerorowcnt = a->nonzerorowcnt;
1995:   C->nonzerostate  = A->nonzerostate;

1997:   PetscFunctionListDuplicate(((PetscObject)A)->qlist,&((PetscObject)C)->qlist);
1998:   return 0;
1999: }

2001: PetscErrorCode MatDuplicate_SeqSELL(Mat A,MatDuplicateOption cpvalues,Mat *B)
2002: {
2003:   MatCreate(PetscObjectComm((PetscObject)A),B);
2004:   MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);
2005:   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) {
2006:     MatSetBlockSizesFromMats(*B,A,A);
2007:   }
2008:   MatSetType(*B,((PetscObject)A)->type_name);
2009:   MatDuplicateNoCreate_SeqSELL(*B,A,cpvalues,PETSC_TRUE);
2010:   return 0;
2011: }

2013: /*MC
2014:    MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2015:    based on the sliced Ellpack format

2017:    Options Database Keys:
2018: . -mat_type seqsell - sets the matrix type to "seqsell" during a call to MatSetFromOptions()

2020:    Level: beginner

2022: .seealso: MatCreateSeqSell(), MATSELL, MATMPISELL, MATSEQAIJ, MATAIJ, MATMPIAIJ
2023: M*/

2025: /*MC
2026:    MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.

2028:    This matrix type is identical to MATSEQSELL when constructed with a single process communicator,
2029:    and MATMPISELL otherwise.  As a result, for single process communicators,
2030:   MatSeqSELLSetPreallocation() is supported, and similarly MatMPISELLSetPreallocation() is supported
2031:   for communicators controlling multiple processes.  It is recommended that you call both of
2032:   the above preallocation routines for simplicity.

2034:    Options Database Keys:
2035: . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()

2037:   Level: beginner

2039:   Notes:
2040:    This format is only supported for real scalars, double precision, and 32 bit indices (the defaults).

2042:    It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2043:    non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.

2045:   Developer Notes:
2046:    On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.

2048:    The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2049: .vb
2050:                             (2 0  3 4)
2051:    Consider the matrix A =  (5 0  6 0)
2052:                             (0 0  7 8)
2053:                             (0 0  9 9)

2055:    symbolically the Ellpack format can be written as

2057:         (2 3 4 |)           (0 2 3 |)
2058:    v =  (5 6 0 |)  colidx = (0 2 2 |)
2059:         --------            ---------
2060:         (7 8 |)             (2 3 |)
2061:         (9 9 |)             (2 3 |)

2063:     The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2064:     Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2065:     zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.

2067:     The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)

2069: .ve

2071:       See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance.

2073:  References:
2074: . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512},
2075:    Proceedings of the 47th International Conference on Parallel Processing, 2018.

2077: .seealso: MatCreateSeqSELL(), MatCreateSeqAIJ(), MatCreateSell(), MATSEQSELL, MATMPISELL, MATSEQAIJ, MATMPIAIJ, MATAIJ
2078: M*/

2080: /*@C
2081:        MatCreateSeqSELL - Creates a sparse matrix in SELL format.

2083:  Collective on comm

2085:  Input Parameters:
2086: +  comm - MPI communicator, set to PETSC_COMM_SELF
2087: .  m - number of rows
2088: .  n - number of columns
2089: .  rlenmax - maximum number of nonzeros in a row
2090: -  rlen - array containing the number of nonzeros in the various rows
2091:  (possibly different for each row) or NULL

2093:  Output Parameter:
2094: .  A - the matrix

2096:  It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
2097:  MatXXXXSetPreallocation() paradigm instead of this routine directly.
2098:  [MatXXXXSetPreallocation() is, for example, MatSeqSELLSetPreallocation]

2100:  Notes:
2101:  If nnz is given then nz is ignored

2103:  Specify the preallocated storage with either rlenmax or rlen (not both).
2104:  Set rlenmax=PETSC_DEFAULT and rlen=NULL for PETSc to control dynamic memory
2105:  allocation.  For large problems you MUST preallocate memory or you
2106:  will get TERRIBLE performance, see the users' manual chapter on matrices.

2108:  Level: intermediate

2110:  .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatSeqSELLSetPreallocation(), MATSELL, MATSEQSELL, MATMPISELL

2112:  @*/
2113: PetscErrorCode MatCreateSeqSELL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt maxallocrow,const PetscInt rlen[],Mat *A)
2114: {
2115:   MatCreate(comm,A);
2116:   MatSetSizes(*A,m,n,m,n);
2117:   MatSetType(*A,MATSEQSELL);
2118:   MatSeqSELLSetPreallocation_SeqSELL(*A,maxallocrow,rlen);
2119:   return 0;
2120: }

2122: PetscErrorCode MatEqual_SeqSELL(Mat A,Mat B,PetscBool * flg)
2123: {
2124:   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data,*b=(Mat_SeqSELL*)B->data;
2125:   PetscInt       totalslices=a->totalslices;

2127:   /* If the  matrix dimensions are not equal,or no of nonzeros */
2128:   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) ||(a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2129:     *flg = PETSC_FALSE;
2130:     return 0;
2131:   }
2132:   /* if the a->colidx are the same */
2133:   PetscArraycmp(a->colidx,b->colidx,a->sliidx[totalslices],flg);
2134:   if (!*flg) return 0;
2135:   /* if a->val are the same */
2136:   PetscArraycmp(a->val,b->val,a->sliidx[totalslices],flg);
2137:   return 0;
2138: }

2140: PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2141: {
2142:   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;

2144:   a->idiagvalid  = PETSC_FALSE;
2145:   return 0;
2146: }

2148: PetscErrorCode MatConjugate_SeqSELL(Mat A)
2149: {
2150: #if defined(PETSC_USE_COMPLEX)
2151:   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2152:   PetscInt    i;
2153:   PetscScalar *val = a->val;

2155:   for (i=0; i<a->sliidx[a->totalslices]; i++) {
2156:     val[i] = PetscConj(val[i]);
2157:   }
2158: #else
2159: #endif
2160:   return 0;
2161: }