Actual source code: aijperm.c
  1: /*
  2:   Defines basic operations for the MATSEQAIJPERM matrix class.
  3:   This class is derived from the MATSEQAIJ class and retains the
  4:   compressed row storage (aka Yale sparse matrix format) but augments
  5:   it with some permutation information that enables some operations
  6:   to be more vectorizable.  A physically rearranged copy of the matrix
  7:   may be stored if the user desires.

  9:   Eventually a variety of permutations may be supported.
 10: */

 12: #include <../src/mat/impls/aij/seq/aij.h>

 14: #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
 15:   #include <immintrin.h>

 17:   #if !defined(_MM_SCALE_8)
 18:     #define _MM_SCALE_8 8
 19:   #endif
 20:   #if !defined(_MM_SCALE_4)
 21:     #define _MM_SCALE_4 4
 22:   #endif
 23: #endif

 25: #define NDIM 512
 26: /* NDIM specifies how many rows at a time we should work with when
 27:  * performing the vectorized mat-vec.  This depends on various factors
 28:  * such as vector register length, etc., and I really need to add a
 29:  * way for the user (or the library) to tune this.  I'm setting it to
 30:  * 512 for now since that is what Ed D'Azevedo was using in his Fortran
 31:  * routines. */

 33: typedef struct {
 34:   PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */

 36:   PetscInt  ngroup;
 37:   PetscInt *xgroup;
 38:   /* Denotes where groups of rows with same number of nonzeros
 39:    * begin and end, i.e., xgroup[i] gives us the position in iperm[]
 40:    * where the ith group begins. */

 42:   PetscInt *nzgroup; /*  how many nonzeros each row that is a member of group i has. */
 43:   PetscInt *iperm;   /* The permutation vector. */

 45:   /* Some of this stuff is for Ed's recursive triangular solve.
 46:    * I'm not sure what I need yet. */
 47:   PetscInt   blocksize;
 48:   PetscInt   nstep;
 49:   PetscInt  *jstart_list;
 50:   PetscInt  *jend_list;
 51:   PetscInt  *action_list;
 52:   PetscInt  *ngroup_list;
 53:   PetscInt **ipointer_list;
 54:   PetscInt **xgroup_list;
 55:   PetscInt **nzgroup_list;
 56:   PetscInt **iperm_list;
 57: } Mat_SeqAIJPERM;

 59: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A, MatType type, MatReuse reuse, Mat *newmat)
 60: {
 61:   /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */
 62:   /* so we will ignore 'MatType type'. */
 63:   Mat             B       = *newmat;
 64:   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;

 66:   PetscFunctionBegin;
 67:   if (reuse == MAT_INITIAL_MATRIX) {
 68:     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B));
 69:     aijperm = (Mat_SeqAIJPERM *)B->spptr;
 70:   }

 72:   /* Reset the original function pointers. */
 73:   B->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
 74:   B->ops->destroy     = MatDestroy_SeqAIJ;
 75:   B->ops->duplicate   = MatDuplicate_SeqAIJ;
 76:   B->ops->mult        = MatMult_SeqAIJ;
 77:   B->ops->multadd     = MatMultAdd_SeqAIJ;

 79:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", NULL));

 81:   /* Free everything in the Mat_SeqAIJPERM data structure.*/
 82:   PetscCall(PetscFree(aijperm->xgroup));
 83:   PetscCall(PetscFree(aijperm->nzgroup));
 84:   PetscCall(PetscFree(aijperm->iperm));
 85:   PetscCall(PetscFree(B->spptr));

 87:   /* Change the type of B to MATSEQAIJ. */
 88:   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ));

 90:   *newmat = B;
 91:   PetscFunctionReturn(PETSC_SUCCESS);
 92: }

 94: static PetscErrorCode MatDestroy_SeqAIJPERM(Mat A)
 95: {
 96:   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;

 98:   PetscFunctionBegin;
 99:   if (aijperm) {
100:     /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */
101:     PetscCall(PetscFree(aijperm->xgroup));
102:     PetscCall(PetscFree(aijperm->nzgroup));
103:     PetscCall(PetscFree(aijperm->iperm));
104:     PetscCall(PetscFree(A->spptr));
105:   }
106:   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
107:    * to destroy everything that remains. */
108:   PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ));
109:   /* Note that I don't call MatSetType().  I believe this is because that
110:    * is only to be called when *building* a matrix.  I could be wrong, but
111:    * that is how things work for the SuperLU matrix class. */
112:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijperm_seqaij_C", NULL));
113:   PetscCall(MatDestroy_SeqAIJ(A));
114:   PetscFunctionReturn(PETSC_SUCCESS);
115: }

117: static PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M)
118: {
119:   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
120:   Mat_SeqAIJPERM *aijperm_dest;
121:   PetscBool       perm;

123:   PetscFunctionBegin;
124:   PetscCall(MatDuplicate_SeqAIJ(A, op, M));
125:   PetscCall(PetscObjectTypeCompare((PetscObject)*M, MATSEQAIJPERM, &perm));
126:   if (perm) {
127:     aijperm_dest = (Mat_SeqAIJPERM *)(*M)->spptr;
128:     PetscCall(PetscFree(aijperm_dest->xgroup));
129:     PetscCall(PetscFree(aijperm_dest->nzgroup));
130:     PetscCall(PetscFree(aijperm_dest->iperm));
131:   } else {
132:     PetscCall(PetscNew(&aijperm_dest));
133:     (*M)->spptr = (void *)aijperm_dest;
134:     PetscCall(PetscObjectChangeTypeName((PetscObject)*M, MATSEQAIJPERM));
135:     PetscCall(PetscObjectComposeFunction((PetscObject)*M, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ));
136:   }
137:   PetscCall(PetscArraycpy(aijperm_dest, aijperm, 1));
138:   /* Allocate space for, and copy the grouping and permutation info.
139:    * I note that when the groups are initially determined in
140:    * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than
141:    * necessary.  But at this point, we know how large they need to be, and
142:    * allocate only the necessary amount of memory.  So the duplicated matrix
143:    * may actually use slightly less storage than the original! */
144:   PetscCall(PetscMalloc1(A->rmap->n, &aijperm_dest->iperm));
145:   PetscCall(PetscMalloc1(aijperm->ngroup + 1, &aijperm_dest->xgroup));
146:   PetscCall(PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup));
147:   PetscCall(PetscArraycpy(aijperm_dest->iperm, aijperm->iperm, A->rmap->n));
148:   PetscCall(PetscArraycpy(aijperm_dest->xgroup, aijperm->xgroup, aijperm->ngroup + 1));
149:   PetscCall(PetscArraycpy(aijperm_dest->nzgroup, aijperm->nzgroup, aijperm->ngroup));
150:   PetscFunctionReturn(PETSC_SUCCESS);
151: }

153: static PetscErrorCode MatSeqAIJPERM_create_perm(Mat A)
154: {
155:   Mat_SeqAIJ     *a       = (Mat_SeqAIJ *)(A)->data;
156:   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
157:   PetscInt        m;     /* Number of rows in the matrix. */
158:   PetscInt       *ia;    /* From the CSR representation; points to the beginning  of each row. */
159:   PetscInt        maxnz; /* Maximum number of nonzeros in any row. */
160:   PetscInt       *rows_in_bucket;
161:   /* To construct the permutation, we sort each row into one of maxnz
162:    * buckets based on how many nonzeros are in the row. */
163:   PetscInt  nz;
164:   PetscInt *nz_in_row; /* the number of nonzero elements in row k. */
165:   PetscInt *ipnz;
166:   /* When constructing the iperm permutation vector,
167:    * ipnz[nz] is used to point to the next place in the permutation vector
168:    * that a row with nz nonzero elements should be placed.*/
169:   PetscInt i, ngroup, istart, ipos;

171:   PetscFunctionBegin;
172:   if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(PETSC_SUCCESS); /* permutation exists and matches current nonzero structure */
173:   aijperm->nonzerostate = A->nonzerostate;
174:   /* Free anything previously put in the Mat_SeqAIJPERM data structure. */
175:   PetscCall(PetscFree(aijperm->xgroup));
176:   PetscCall(PetscFree(aijperm->nzgroup));
177:   PetscCall(PetscFree(aijperm->iperm));

179:   m  = A->rmap->n;
180:   ia = a->i;

182:   /* Allocate the arrays that will hold the permutation vector. */
183:   PetscCall(PetscMalloc1(m, &aijperm->iperm));

185:   /* Allocate some temporary work arrays that will be used in
186:    * calculating the permutation vector and groupings. */
187:   PetscCall(PetscMalloc1(m, &nz_in_row));

189:   /* Now actually figure out the permutation and grouping. */

191:   /* First pass: Determine number of nonzeros in each row, maximum
192:    * number of nonzeros in any row, and how many rows fall into each
193:    * "bucket" of rows with same number of nonzeros. */
194:   maxnz = 0;
195:   for (i = 0; i < m; i++) {
196:     nz_in_row[i] = ia[i + 1] - ia[i];
197:     if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
198:   }
199:   PetscCall(PetscMalloc1(PetscMax(maxnz, m) + 1, &rows_in_bucket));
200:   PetscCall(PetscMalloc1(PetscMax(maxnz, m) + 1, &ipnz));

202:   for (i = 0; i <= maxnz; i++) rows_in_bucket[i] = 0;
203:   for (i = 0; i < m; i++) {
204:     nz = nz_in_row[i];
205:     rows_in_bucket[nz]++;
206:   }

208:   /* Allocate space for the grouping info.  There will be at most (maxnz + 1)
209:    * groups.  (It is maxnz + 1 instead of simply maxnz because there may be
210:    * rows with no nonzero elements.)  If there are (maxnz + 1) groups,
211:    * then xgroup[] must consist of (maxnz + 2) elements, since the last
212:    * element of xgroup will tell us where the (maxnz + 1)th group ends.
213:    * We allocate space for the maximum number of groups;
214:    * that is potentially a little wasteful, but not too much so.
215:    * Perhaps I should fix it later. */
216:   PetscCall(PetscMalloc1(maxnz + 2, &aijperm->xgroup));
217:   PetscCall(PetscMalloc1(maxnz + 1, &aijperm->nzgroup));

219:   /* Second pass.  Look at what is in the buckets and create the groupings.
220:    * Note that it is OK to have a group of rows with no non-zero values. */
221:   ngroup = 0;
222:   istart = 0;
223:   for (i = 0; i <= maxnz; i++) {
224:     if (rows_in_bucket[i] > 0) {
225:       aijperm->nzgroup[ngroup] = i;
226:       aijperm->xgroup[ngroup]  = istart;
227:       ngroup++;
228:       istart += rows_in_bucket[i];
229:     }
230:   }

232:   aijperm->xgroup[ngroup] = istart;
233:   aijperm->ngroup         = ngroup;

235:   /* Now fill in the permutation vector iperm. */
236:   ipnz[0] = 0;
237:   for (i = 0; i < maxnz; i++) ipnz[i + 1] = ipnz[i] + rows_in_bucket[i];

239:   for (i = 0; i < m; i++) {
240:     nz                   = nz_in_row[i];
241:     ipos                 = ipnz[nz];
242:     aijperm->iperm[ipos] = i;
243:     ipnz[nz]++;
244:   }

246:   /* Clean up temporary work arrays. */
247:   PetscCall(PetscFree(rows_in_bucket));
248:   PetscCall(PetscFree(ipnz));
249:   PetscCall(PetscFree(nz_in_row));
250:   PetscFunctionReturn(PETSC_SUCCESS);
251: }

253: static PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode)
254: {
255:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

257:   PetscFunctionBegin;
258:   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);

260:   /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some
261:    * extra information, call the AssemblyEnd routine for a MATSEQAIJ.
262:    * I'm not sure if this is the best way to do this, but it avoids
263:    * a lot of code duplication.
264:    * I also note that currently MATSEQAIJPERM doesn't know anything about
265:    * the Mat_CompressedRow data structure that SeqAIJ now uses when there
266:    * are many zero rows.  If the SeqAIJ assembly end routine decides to use
267:    * this, this may break things.  (Don't know... haven't looked at it.) */
268:   a->inode.use = PETSC_FALSE;
269:   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));

271:   /* Now calculate the permutation and grouping information. */
272:   PetscCall(MatSeqAIJPERM_create_perm(A));
273:   PetscFunctionReturn(PETSC_SUCCESS);
274: }

276: static PetscErrorCode MatMult_SeqAIJPERM(Mat A, Vec xx, Vec yy)
277: {
278:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
279:   const PetscScalar *x;
280:   PetscScalar       *y;
281:   const MatScalar   *aa;
282:   const PetscInt    *aj, *ai;
283: #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking))
284:   PetscInt i, j;
285: #endif
286: #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
287:   __m512d  vec_x, vec_y, vec_vals;
288:   __m256i  vec_idx, vec_ipos, vec_j;
289:   __mmask8 mask;
290: #endif

292:   /* Variables that don't appear in MatMult_SeqAIJ. */
293:   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
294:   PetscInt       *iperm; /* Points to the permutation vector. */
295:   PetscInt       *xgroup;
296:   /* Denotes where groups of rows with same number of nonzeros
297:    * begin and end in iperm. */
298:   PetscInt *nzgroup;
299:   PetscInt  ngroup;
300:   PetscInt  igroup;
301:   PetscInt  jstart, jend;
302:   /* jstart is used in loops to denote the position in iperm where a
303:    * group starts; jend denotes the position where it ends.
304:    * (jend + 1 is where the next group starts.) */
305:   PetscInt    iold, nz;
306:   PetscInt    istart, iend, isize;
307:   PetscInt    ipos;
308:   PetscScalar yp[NDIM];
309:   PetscInt    ip[NDIM]; /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */

311: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
312:   #pragma disjoint(*x, *y, *aa)
313: #endif

315:   PetscFunctionBegin;
316:   PetscCall(VecGetArrayRead(xx, &x));
317:   PetscCall(VecGetArray(yy, &y));
318:   aj = a->j; /* aj[k] gives column index for element aa[k]. */
319:   aa = a->a; /* Nonzero elements stored row-by-row. */
320:   ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */

322:   /* Get the info we need about the permutations and groupings. */
323:   iperm   = aijperm->iperm;
324:   ngroup  = aijperm->ngroup;
325:   xgroup  = aijperm->xgroup;
326:   nzgroup = aijperm->nzgroup;

328: #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)
329:   fortranmultaijperm_(&m, x, ii, aj, aa, y);
330: #else

332:   for (igroup = 0; igroup < ngroup; igroup++) {
333:     jstart = xgroup[igroup];
334:     jend   = xgroup[igroup + 1] - 1;
335:     nz     = nzgroup[igroup];

337:     /* Handle the special cases where the number of nonzeros per row
338:      * in the group is either 0 or 1. */
339:     if (nz == 0) {
340:       for (i = jstart; i <= jend; i++) y[iperm[i]] = 0.0;
341:     } else if (nz == 1) {
342:       for (i = jstart; i <= jend; i++) {
343:         iold    = iperm[i];
344:         ipos    = ai[iold];
345:         y[iold] = aa[ipos] * x[aj[ipos]];
346:       }
347:     } else {
348:       /* We work our way through the current group in chunks of NDIM rows
349:        * at a time. */

351:       for (istart = jstart; istart <= jend; istart += NDIM) {
352:         /* Figure out where the chunk of 'isize' rows ends in iperm.
353:          * 'isize may of course be less than NDIM for the last chunk. */
354:         iend = istart + (NDIM - 1);

356:         if (iend > jend) iend = jend;

358:         isize = iend - istart + 1;

360:         /* Initialize the yp[] array that will be used to hold part of
361:          * the permuted results vector, and figure out where in aa each
362:          * row of the chunk will begin. */
363:         for (i = 0; i < isize; i++) {
364:           iold = iperm[istart + i];
365:           /* iold is a row number from the matrix A *before* reordering. */
366:           ip[i] = ai[iold];
367:           /* ip[i] tells us where the ith row of the chunk begins in aa. */
368:           yp[i] = (PetscScalar)0.0;
369:         }

371:         /* If the number of zeros per row exceeds the number of rows in
372:          * the chunk, we should vectorize along nz, that is, perform the
373:          * mat-vec one row at a time as in the usual CSR case. */
374:         if (nz > isize) {
375:   #if defined(PETSC_HAVE_CRAY_VECTOR)
376:     #pragma _CRI preferstream
377:   #endif
378:           for (i = 0; i < isize; i++) {
379:   #if defined(PETSC_HAVE_CRAY_VECTOR)
380:     #pragma _CRI prefervector
381:   #endif

383:   #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
384:             vec_y = _mm512_setzero_pd();
385:             ipos  = ip[i];
386:             for (j = 0; j < (nz >> 3); j++) {
387:               vec_idx  = _mm256_loadu_si256((__m256i const *)&aj[ipos]);
388:               vec_vals = _mm512_loadu_pd(&aa[ipos]);
389:               vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8);
390:               vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y);
391:               ipos += 8;
392:             }
393:             if ((nz & 0x07) > 2) {
394:               mask     = (__mmask8)(0xff >> (8 - (nz & 0x07)));
395:               vec_idx  = _mm256_loadu_si256((__m256i const *)&aj[ipos]);
396:               vec_vals = _mm512_loadu_pd(&aa[ipos]);
397:               vec_x    = _mm512_mask_i32gather_pd(vec_x, mask, vec_idx, x, _MM_SCALE_8);
398:               vec_y    = _mm512_mask3_fmadd_pd(vec_x, vec_vals, vec_y, mask);
399:             } else if ((nz & 0x07) == 2) {
400:               yp[i] += aa[ipos] * x[aj[ipos]];
401:               yp[i] += aa[ipos + 1] * x[aj[ipos + 1]];
402:             } else if ((nz & 0x07) == 1) {
403:               yp[i] += aa[ipos] * x[aj[ipos]];
404:             }
405:             yp[i] += _mm512_reduce_add_pd(vec_y);
406:   #else
407:             for (j = 0; j < nz; j++) {
408:               ipos = ip[i] + j;
409:               yp[i] += aa[ipos] * x[aj[ipos]];
410:             }
411:   #endif
412:           }
413:         } else {
414:           /* Otherwise, there are enough rows in the chunk to make it
415:            * worthwhile to vectorize across the rows, that is, to do the
416:            * matvec by operating with "columns" of the chunk. */
417:           for (j = 0; j < nz; j++) {
418:   #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
419:             vec_j = _mm256_set1_epi32(j);
420:             for (i = 0; i < ((isize >> 3) << 3); i += 8) {
421:               vec_y    = _mm512_loadu_pd(&yp[i]);
422:               vec_ipos = _mm256_loadu_si256((__m256i const *)&ip[i]);
423:               vec_ipos = _mm256_add_epi32(vec_ipos, vec_j);
424:               vec_idx  = _mm256_i32gather_epi32(aj, vec_ipos, _MM_SCALE_4);
425:               vec_vals = _mm512_i32gather_pd(vec_ipos, aa, _MM_SCALE_8);
426:               vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8);
427:               vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y);
428:               _mm512_storeu_pd(&yp[i], vec_y);
429:             }
430:             for (i = isize - (isize & 0x07); i < isize; i++) {
431:               ipos = ip[i] + j;
432:               yp[i] += aa[ipos] * x[aj[ipos]];
433:             }
434:   #else
435:             for (i = 0; i < isize; i++) {
436:               ipos = ip[i] + j;
437:               yp[i] += aa[ipos] * x[aj[ipos]];
438:             }
439:   #endif
440:           }
441:         }

443:   #if defined(PETSC_HAVE_CRAY_VECTOR)
444:     #pragma _CRI ivdep
445:   #endif
446:         /* Put results from yp[] into non-permuted result vector y. */
447:         for (i = 0; i < isize; i++) y[iperm[istart + i]] = yp[i];
448:       } /* End processing chunk of isize rows of a group. */
449:     }   /* End handling matvec for chunk with nz > 1. */
450:   }     /* End loop over igroup. */
451: #endif
452:   PetscCall(PetscLogFlops(PetscMax(2.0 * a->nz - A->rmap->n, 0)));
453:   PetscCall(VecRestoreArrayRead(xx, &x));
454:   PetscCall(VecRestoreArray(yy, &y));
455:   PetscFunctionReturn(PETSC_SUCCESS);
456: }

458: /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx.
459:  * Note that the names I used to designate the vectors differs from that
460:  * used in MatMultAdd_SeqAIJ().  I did this to keep my notation consistent
461:  * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */
462: /*
463:     I hate having virtually identical code for the mult and the multadd!!!
464: */
465: static PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A, Vec xx, Vec ww, Vec yy)
466: {
467:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
468:   const PetscScalar *x;
469:   PetscScalar       *y, *w;
470:   const MatScalar   *aa;
471:   const PetscInt    *aj, *ai;
472: #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
473:   PetscInt i, j;
474: #endif

476:   /* Variables that don't appear in MatMultAdd_SeqAIJ. */
477:   Mat_SeqAIJPERM *aijperm;
478:   PetscInt       *iperm; /* Points to the permutation vector. */
479:   PetscInt       *xgroup;
480:   /* Denotes where groups of rows with same number of nonzeros
481:    * begin and end in iperm. */
482:   PetscInt *nzgroup;
483:   PetscInt  ngroup;
484:   PetscInt  igroup;
485:   PetscInt  jstart, jend;
486:   /* jstart is used in loops to denote the position in iperm where a
487:    * group starts; jend denotes the position where it ends.
488:    * (jend + 1 is where the next group starts.) */
489:   PetscInt    iold, nz;
490:   PetscInt    istart, iend, isize;
491:   PetscInt    ipos;
492:   PetscScalar yp[NDIM];
493:   PetscInt    ip[NDIM];
494:   /* yp[] and ip[] are treated as vector "registers" for performing
495:    * the mat-vec. */

497: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
498:   #pragma disjoint(*x, *y, *aa)
499: #endif

501:   PetscFunctionBegin;
502:   PetscCall(VecGetArrayRead(xx, &x));
503:   PetscCall(VecGetArrayPair(yy, ww, &y, &w));

505:   aj = a->j; /* aj[k] gives column index for element aa[k]. */
506:   aa = a->a; /* Nonzero elements stored row-by-row. */
507:   ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */

509:   /* Get the info we need about the permutations and groupings. */
510:   aijperm = (Mat_SeqAIJPERM *)A->spptr;
511:   iperm   = aijperm->iperm;
512:   ngroup  = aijperm->ngroup;
513:   xgroup  = aijperm->xgroup;
514:   nzgroup = aijperm->nzgroup;

516: #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
517:   fortranmultaddaijperm_(&m, x, ii, aj, aa, y, w);
518: #else

520:   for (igroup = 0; igroup < ngroup; igroup++) {
521:     jstart = xgroup[igroup];
522:     jend   = xgroup[igroup + 1] - 1;

524:     nz = nzgroup[igroup];

526:     /* Handle the special cases where the number of nonzeros per row
527:      * in the group is either 0 or 1. */
528:     if (nz == 0) {
529:       for (i = jstart; i <= jend; i++) {
530:         iold    = iperm[i];
531:         y[iold] = w[iold];
532:       }
533:     } else if (nz == 1) {
534:       for (i = jstart; i <= jend; i++) {
535:         iold    = iperm[i];
536:         ipos    = ai[iold];
537:         y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
538:       }
539:     }
540:     /* For the general case: */
541:     else {
542:       /* We work our way through the current group in chunks of NDIM rows
543:        * at a time. */

545:       for (istart = jstart; istart <= jend; istart += NDIM) {
546:         /* Figure out where the chunk of 'isize' rows ends in iperm.
547:          * 'isize may of course be less than NDIM for the last chunk. */
548:         iend = istart + (NDIM - 1);
549:         if (iend > jend) iend = jend;
550:         isize = iend - istart + 1;

552:         /* Initialize the yp[] array that will be used to hold part of
553:          * the permuted results vector, and figure out where in aa each
554:          * row of the chunk will begin. */
555:         for (i = 0; i < isize; i++) {
556:           iold = iperm[istart + i];
557:           /* iold is a row number from the matrix A *before* reordering. */
558:           ip[i] = ai[iold];
559:           /* ip[i] tells us where the ith row of the chunk begins in aa. */
560:           yp[i] = w[iold];
561:         }

563:         /* If the number of zeros per row exceeds the number of rows in
564:          * the chunk, we should vectorize along nz, that is, perform the
565:          * mat-vec one row at a time as in the usual CSR case. */
566:         if (nz > isize) {
567:   #if defined(PETSC_HAVE_CRAY_VECTOR)
568:     #pragma _CRI preferstream
569:   #endif
570:           for (i = 0; i < isize; i++) {
571:   #if defined(PETSC_HAVE_CRAY_VECTOR)
572:     #pragma _CRI prefervector
573:   #endif
574:             for (j = 0; j < nz; j++) {
575:               ipos = ip[i] + j;
576:               yp[i] += aa[ipos] * x[aj[ipos]];
577:             }
578:           }
579:         }
580:         /* Otherwise, there are enough rows in the chunk to make it
581:          * worthwhile to vectorize across the rows, that is, to do the
582:          * matvec by operating with "columns" of the chunk. */
583:         else {
584:           for (j = 0; j < nz; j++) {
585:             for (i = 0; i < isize; i++) {
586:               ipos = ip[i] + j;
587:               yp[i] += aa[ipos] * x[aj[ipos]];
588:             }
589:           }
590:         }

592:   #if defined(PETSC_HAVE_CRAY_VECTOR)
593:     #pragma _CRI ivdep
594:   #endif
595:         /* Put results from yp[] into non-permuted result vector y. */
596:         for (i = 0; i < isize; i++) y[iperm[istart + i]] = yp[i];
597:       } /* End processing chunk of isize rows of a group. */

599:     } /* End handling matvec for chunk with nz > 1. */
600:   }   /* End loop over igroup. */

602: #endif
603:   PetscCall(PetscLogFlops(2.0 * a->nz));
604:   PetscCall(VecRestoreArrayRead(xx, &x));
605:   PetscCall(VecRestoreArrayPair(yy, ww, &y, &w));
606:   PetscFunctionReturn(PETSC_SUCCESS);
607: }

609: /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a
610:  * SeqAIJPERM matrix.  This routine is called by the MatCreate_SeqAIJPERM()
611:  * routine, but can also be used to convert an assembled SeqAIJ matrix
612:  * into a SeqAIJPERM one. */
613: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A, MatType type, MatReuse reuse, Mat *newmat)
614: {
615:   Mat             B = *newmat;
616:   Mat_SeqAIJPERM *aijperm;
617:   PetscBool       sametype;

619:   PetscFunctionBegin;
620:   if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B));
621:   PetscCall(PetscObjectTypeCompare((PetscObject)A, type, &sametype));
622:   if (sametype) PetscFunctionReturn(PETSC_SUCCESS);

624:   PetscCall(PetscNew(&aijperm));
625:   B->spptr = (void *)aijperm;

627:   /* Set function pointers for methods that we inherit from AIJ but override. */
628:   B->ops->duplicate   = MatDuplicate_SeqAIJPERM;
629:   B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM;
630:   B->ops->destroy     = MatDestroy_SeqAIJPERM;
631:   B->ops->mult        = MatMult_SeqAIJPERM;
632:   B->ops->multadd     = MatMultAdd_SeqAIJPERM;

634:   aijperm->nonzerostate = -1; /* this will trigger the generation of the permutation information the first time through MatAssembly()*/
635:   /* If A has already been assembled, compute the permutation. */
636:   if (A->assembled) PetscCall(MatSeqAIJPERM_create_perm(B));

638:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ));

640:   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJPERM));
641:   *newmat = B;
642:   PetscFunctionReturn(PETSC_SUCCESS);
643: }

645: /*@C
646:   MatCreateSeqAIJPERM - Creates a sparse matrix of type `MATSEQAIJPERM`.

648:   Collective

650:   Input Parameters:
651: + comm - MPI communicator, set to `PETSC_COMM_SELF`
652: . m    - number of rows
653: . n    - number of columns
654: . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is given
655: - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

657:   Output Parameter:
658: . A - the matrix

660:   Level: intermediate

662:   Notes:
663:   This type inherits from `MATSEQAIJ`, but calculates some additional permutation information
664:   that is used to allow better vectorization of some operations.  At the cost of increased
665:   storage, the `MATSEQAIJ` formatted matrix can be copied to a format in which pieces of the
666:   matrix are stored in ELLPACK format, allowing the vectorized matrix multiply routine to use
667:   stride-1 memory accesses.

669: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateMPIAIJPERM()`, `MatSetValues()`
670: @*/
671: PetscErrorCode MatCreateSeqAIJPERM(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
672: {
673:   PetscFunctionBegin;
674:   PetscCall(MatCreate(comm, A));
675:   PetscCall(MatSetSizes(*A, m, n, m, n));
676:   PetscCall(MatSetType(*A, MATSEQAIJPERM));
677:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, nnz));
678:   PetscFunctionReturn(PETSC_SUCCESS);
679: }

681: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A)
682: {
683:   PetscFunctionBegin;
684:   PetscCall(MatSetType(A, MATSEQAIJ));
685:   PetscCall(MatConvert_SeqAIJ_SeqAIJPERM(A, MATSEQAIJPERM, MAT_INPLACE_MATRIX, &A));
686:   PetscFunctionReturn(PETSC_SUCCESS);
687: }