Actual source code: aijperm.c

petsc-3.10.5 2019-03-28

  2: /*
  3:   Defines basic operations for the MATSEQAIJPERM matrix class.
  4:   This class is derived from the MATSEQAIJ class and retains the
  5:   compressed row storage (aka Yale sparse matrix format) but augments
  6:   it with some permutation information that enables some operations
  7:   to be more vectorizable.  A physically rearranged copy of the matrix
  8:   may be stored if the user desires.

 10:   Eventually a variety of permutations may be supported.
 11: */
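/* A minimal usage sketch (the matrix B below is a placeholder standing in
 * for any assembled MATSEQAIJ matrix in an ordinary PETSc program):
 *
 *   MatConvert(B,MATSEQAIJPERM,MAT_INPLACE_MATRIX,&B);
 *
 * converts B in place to this type, after which MatMult()/MatMultAdd() use
 * the permuted kernels below.  Alternatively, -mat_type seqaijperm selects
 * the type at run time when the matrix is configured with MatSetFromOptions(),
 * and MatCreateSeqAIJPERM() (defined at the bottom of this file) creates one
 * directly. */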

 13:  #include <../src/mat/impls/aij/seq/aij.h>

 15: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
 16: #include <immintrin.h>

 18: #if !defined(_MM_SCALE_8)
 19: #define _MM_SCALE_8    8
 20: #endif
 21: #if !defined(_MM_SCALE_4)
 22: #define _MM_SCALE_4    4
 23: #endif
 24: #endif

 26: #define NDIM 512
 27: /* NDIM specifies how many rows at a time we should work with when
 28:  * performing the vectorized mat-vec.  This depends on various factors
 29:  * such as vector register length, etc., and I really need to add a
 30:  * way for the user (or the library) to tune this.  I'm setting it to
 31:  * 512 for now since that is what Ed D'Azevedo was using in his Fortran
 32:  * routines. */

 34: typedef struct {
 35:   PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */

 37:   PetscInt         ngroup;
 38:   PetscInt         *xgroup;
 39:   /* Denotes where groups of rows with the same number of nonzeros
 40:    * begin and end, i.e., xgroup[i] gives us the position in iperm[]
 41:    * where the ith group begins. */

 43:   PetscInt         *nzgroup; /* nzgroup[i] is the number of nonzeros in each row belonging to group i. */
 44:   PetscInt         *iperm;  /* The permutation vector. */

 46:   /* Some of this stuff is for Ed's recursive triangular solve.
 47:    * I'm not sure what I need yet. */
 48:   PetscInt         blocksize;
 49:   PetscInt         nstep;
 50:   PetscInt         *jstart_list;
 51:   PetscInt         *jend_list;
 52:   PetscInt         *action_list;
 53:   PetscInt         *ngroup_list;
 54:   PetscInt         **ipointer_list;
 55:   PetscInt         **xgroup_list;
 56:   PetscInt         **nzgroup_list;
 57:   PetscInt         **iperm_list;
 58: } Mat_SeqAIJPERM;
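/* A small worked illustration of how the grouping fields above fit together
 * (the 5-row matrix is hypothetical): if the rows contain 2, 3, 2, 1, and 3
 * nonzeros respectively, MatSeqAIJPERM_create_perm() below produces
 *
 *   ngroup  = 3
 *   nzgroup = {1, 2, 3}        nonzeros per row within each group
 *   xgroup  = {0, 1, 3, 5}     group i occupies iperm[xgroup[i]..xgroup[i+1]-1]
 *   iperm   = {3, 0, 2, 1, 4}  original row numbers, sorted by nonzero count
 */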

 60: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
 61: {
 62:   /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */
 63:   /* so we will ignore 'MatType type'. */
 65:   Mat            B       = *newmat;
 66:   Mat_SeqAIJPERM *aijperm=(Mat_SeqAIJPERM*)A->spptr;

 69:   if (reuse == MAT_INITIAL_MATRIX) {
 70:     MatDuplicate(A,MAT_COPY_VALUES,&B);
 71:     aijperm=(Mat_SeqAIJPERM*)B->spptr;
 72:   }

 74:   /* Reset the original function pointers. */
 75:   B->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
 76:   B->ops->destroy     = MatDestroy_SeqAIJ;
 77:   B->ops->duplicate   = MatDuplicate_SeqAIJ;
 78:   B->ops->mult        = MatMult_SeqAIJ;
 79:   B->ops->multadd     = MatMultAdd_SeqAIJ;

 81:   PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",NULL);
 82:   PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijperm_C",NULL);
 83:   PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijperm_C",NULL);
 84:   PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijperm_C",NULL);
 85:   PetscObjectComposeFunction((PetscObject)B,"MatPtAP_is_seqaijperm_C",NULL);

 87:   /* Free everything in the Mat_SeqAIJPERM data structure.*/
 88:   PetscFree(aijperm->xgroup);
 89:   PetscFree(aijperm->nzgroup);
 90:   PetscFree(aijperm->iperm);
 91:   PetscFree(B->spptr);

 93:   /* Change the type of B to MATSEQAIJ. */
 94:   PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);

 96:   *newmat = B;
 97:   return(0);
 98: }

100: PetscErrorCode MatDestroy_SeqAIJPERM(Mat A)
101: {
103:   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;

106:   if (aijperm) {
107:     /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spptr. */
108:     PetscFree(aijperm->xgroup);
109:     PetscFree(aijperm->nzgroup);
110:     PetscFree(aijperm->iperm);
111:     PetscFree(A->spptr);
112:   }
113:   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
114:    * to destroy everything that remains. */
115:   PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);
116:   /* Note that I don't call MatSetType().  I believe this is because that
117:    * is only to be called when *building* a matrix.  I could be wrong, but
118:    * that is how things work for the SuperLU matrix class. */
119:   MatDestroy_SeqAIJ(A);
120:   return(0);
121: }

123: PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M)
124: {
126:   Mat_SeqAIJPERM *aijperm      = (Mat_SeqAIJPERM*) A->spptr;
127:   Mat_SeqAIJPERM *aijperm_dest;
128:   PetscBool      perm;

131:   MatDuplicate_SeqAIJ(A,op,M);
132:   PetscObjectTypeCompare((PetscObject)*M,MATSEQAIJPERM,&perm);
133:   if (perm) {
134:     aijperm_dest = (Mat_SeqAIJPERM *) (*M)->spptr;
135:     PetscFree(aijperm_dest->xgroup);
136:     PetscFree(aijperm_dest->nzgroup);
137:     PetscFree(aijperm_dest->iperm);
138:   } else {
139:     PetscNewLog(*M,&aijperm_dest);
140:     (*M)->spptr = (void*) aijperm_dest;
141:     PetscObjectChangeTypeName((PetscObject)*M,MATSEQAIJPERM);
142:     PetscObjectComposeFunction((PetscObject)*M,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);
143:     PetscObjectComposeFunction((PetscObject)*M,"MatMatMult_seqdense_seqaijperm_C",MatMatMult_SeqDense_SeqAIJ);
144:     PetscObjectComposeFunction((PetscObject)*M,"MatMatMultSymbolic_seqdense_seqaijperm_C",MatMatMultSymbolic_SeqDense_SeqAIJ);
145:     PetscObjectComposeFunction((PetscObject)*M,"MatMatMultNumeric_seqdense_seqaijperm_C",MatMatMultNumeric_SeqDense_SeqAIJ);
146:   }
147:   PetscMemcpy(aijperm_dest,aijperm,sizeof(Mat_SeqAIJPERM));
148:   /* Allocate space for, and copy the grouping and permutation info.
149:    * I note that when the groups are initially determined in
150:    * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than
151:    * necessary.  But at this point, we know how large they need to be, and
152:    * allocate only the necessary amount of memory.  So the duplicated matrix
153:    * may actually use slightly less storage than the original! */
154:   PetscMalloc1(A->rmap->n, &aijperm_dest->iperm);
155:   PetscMalloc1(aijperm->ngroup+1, &aijperm_dest->xgroup);
156:   PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup);
157:   PetscMemcpy(aijperm_dest->iperm,aijperm->iperm,sizeof(PetscInt)*A->rmap->n);
158:   PetscMemcpy(aijperm_dest->xgroup,aijperm->xgroup,sizeof(PetscInt)*(aijperm->ngroup+1));
159:   PetscMemcpy(aijperm_dest->nzgroup,aijperm->nzgroup,sizeof(PetscInt)*aijperm->ngroup);
160:   return(0);
161: }

163: PetscErrorCode MatSeqAIJPERM_create_perm(Mat A)
164: {
166:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)(A)->data;
167:   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
168:   PetscInt       m;       /* Number of rows in the matrix. */
169:   PetscInt       *ia;       /* From the CSR representation; points to the beginning  of each row. */
170:   PetscInt       maxnz;      /* Maximum number of nonzeros in any row. */
171:   PetscInt       *rows_in_bucket;
 172:   /* To construct the permutation, we sort each row into one of the
 173:    * maxnz+1 buckets (0 through maxnz) based on how many nonzeros are in the row. */
174:   PetscInt       nz;
175:   PetscInt       *nz_in_row;         /* the number of nonzero elements in row k. */
176:   PetscInt       *ipnz;
177:   /* When constructing the iperm permutation vector,
178:    * ipnz[nz] is used to point to the next place in the permutation vector
179:    * that a row with nz nonzero elements should be placed.*/
180:   PetscInt       i, ngroup, istart, ipos;

183:   if (aijperm->nonzerostate == A->nonzerostate) return(0); /* permutation exists and matches current nonzero structure */
184:   aijperm->nonzerostate = A->nonzerostate;
185:  /* Free anything previously put in the Mat_SeqAIJPERM data structure. */
186:   PetscFree(aijperm->xgroup);
187:   PetscFree(aijperm->nzgroup);
188:   PetscFree(aijperm->iperm);

190:   m  = A->rmap->n;
191:   ia = a->i;

193:   /* Allocate the arrays that will hold the permutation vector. */
194:   PetscMalloc1(m, &aijperm->iperm);

196:   /* Allocate some temporary work arrays that will be used in
 197:    * calculating the permutation vector and groupings. */
198:   PetscMalloc1(m, &nz_in_row);

200:   /* Now actually figure out the permutation and grouping. */

202:   /* First pass: Determine number of nonzeros in each row, maximum
203:    * number of nonzeros in any row, and how many rows fall into each
204:    * "bucket" of rows with same number of nonzeros. */
205:   maxnz = 0;
206:   for (i=0; i<m; i++) {
207:     nz_in_row[i] = ia[i+1]-ia[i];
208:     if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
209:   }
210:   PetscMalloc1(PetscMax(maxnz,m)+1, &rows_in_bucket);
211:   PetscMalloc1(PetscMax(maxnz,m)+1, &ipnz);

213:   for (i=0; i<=maxnz; i++) {
214:     rows_in_bucket[i] = 0;
215:   }
216:   for (i=0; i<m; i++) {
217:     nz = nz_in_row[i];
218:     rows_in_bucket[nz]++;
219:   }

221:   /* Allocate space for the grouping info.  There will be at most (maxnz + 1)
222:    * groups.  (It is maxnz + 1 instead of simply maxnz because there may be
223:    * rows with no nonzero elements.)  If there are (maxnz + 1) groups,
224:    * then xgroup[] must consist of (maxnz + 2) elements, since the last
225:    * element of xgroup will tell us where the (maxnz + 1)th group ends.
226:    * We allocate space for the maximum number of groups;
227:    * that is potentially a little wasteful, but not too much so.
228:    * Perhaps I should fix it later. */
229:   PetscMalloc1(maxnz+2, &aijperm->xgroup);
230:   PetscMalloc1(maxnz+1, &aijperm->nzgroup);

232:   /* Second pass.  Look at what is in the buckets and create the groupings.
233:    * Note that it is OK to have a group of rows with no non-zero values. */
234:   ngroup = 0;
235:   istart = 0;
236:   for (i=0; i<=maxnz; i++) {
237:     if (rows_in_bucket[i] > 0) {
238:       aijperm->nzgroup[ngroup] = i;
239:       aijperm->xgroup[ngroup]  = istart;
240:       ngroup++;
241:       istart += rows_in_bucket[i];
242:     }
243:   }

245:   aijperm->xgroup[ngroup] = istart;
246:   aijperm->ngroup         = ngroup;

248:   /* Now fill in the permutation vector iperm. */
249:   ipnz[0] = 0;
250:   for (i=0; i<maxnz; i++) {
251:     ipnz[i+1] = ipnz[i] + rows_in_bucket[i];
252:   }

254:   for (i=0; i<m; i++) {
255:     nz                   = nz_in_row[i];
256:     ipos                 = ipnz[nz];
257:     aijperm->iperm[ipos] = i;
258:     ipnz[nz]++;
259:   }

261:   /* Clean up temporary work arrays. */
262:   PetscFree(rows_in_bucket);
263:   PetscFree(ipnz);
264:   PetscFree(nz_in_row);
265:   return(0);
266: }
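/* Worked example of the counting sort above: for a 5-row matrix whose rows
 * hold 2, 3, 2, 1, and 3 nonzeros (the same illustration used near the
 * Mat_SeqAIJPERM struct definition), the first pass yields
 * rows_in_bucket = {0, 1, 2, 2} for nz = 0..3, the prefix sums give
 * ipnz = {0, 0, 1, 3}, and the final scatter produces iperm = {3, 0, 2, 1, 4},
 * with nzgroup = {1, 2, 3} and xgroup = {0, 1, 3, 5}. */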


269: PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode)
270: {
272:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

275:   if (mode == MAT_FLUSH_ASSEMBLY) return(0);

277:   /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some
278:    * extra information, call the AssemblyEnd routine for a MATSEQAIJ.
279:    * I'm not sure if this is the best way to do this, but it avoids
280:    * a lot of code duplication.
281:    * I also note that currently MATSEQAIJPERM doesn't know anything about
282:    * the Mat_CompressedRow data structure that SeqAIJ now uses when there
283:    * are many zero rows.  If the SeqAIJ assembly end routine decides to use
284:    * this, this may break things.  (Don't know... haven't looked at it.) */
285:   a->inode.use = PETSC_FALSE;
286:   MatAssemblyEnd_SeqAIJ(A, mode);

288:   /* Now calculate the permutation and grouping information. */
289:   MatSeqAIJPERM_create_perm(A);
290:   return(0);
291: }

293: PetscErrorCode MatMult_SeqAIJPERM(Mat A,Vec xx,Vec yy)
294: {
295:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
296:   const PetscScalar *x;
297:   PetscScalar       *y;
298:   const MatScalar   *aa;
299:   PetscErrorCode    ierr;
300:   const PetscInt    *aj,*ai;
301: #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking))
302:   PetscInt          i,j;
303: #endif
304: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
305:   __m512d           vec_x,vec_y,vec_vals;
306:   __m256i           vec_idx,vec_ipos,vec_j;
307:   __mmask8           mask;
308: #endif

310:   /* Variables that don't appear in MatMult_SeqAIJ. */
311:   Mat_SeqAIJPERM    *aijperm = (Mat_SeqAIJPERM*) A->spptr;
312:   PetscInt          *iperm;  /* Points to the permutation vector. */
313:   PetscInt          *xgroup;
 314:   /* Denotes where groups of rows with the same number of nonzeros
315:    * begin and end in iperm. */
316:   PetscInt          *nzgroup;
317:   PetscInt          ngroup;
318:   PetscInt          igroup;
319:   PetscInt          jstart,jend;
320:   /* jstart is used in loops to denote the position in iperm where a
321:    * group starts; jend denotes the position where it ends.
322:    * (jend + 1 is where the next group starts.) */
323:   PetscInt          iold,nz;
324:   PetscInt          istart,iend,isize;
325:   PetscInt          ipos;
326:   PetscScalar       yp[NDIM];
327:   PetscInt          ip[NDIM];    /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */

329: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
330: #pragma disjoint(*x,*y,*aa)
331: #endif

334:   VecGetArrayRead(xx,&x);
335:   VecGetArray(yy,&y);
336:   aj   = a->j;   /* aj[k] gives column index for element aa[k]. */
337:   aa   = a->a; /* Nonzero elements stored row-by-row. */
338:   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */

340:   /* Get the info we need about the permutations and groupings. */
341:   iperm   = aijperm->iperm;
342:   ngroup  = aijperm->ngroup;
343:   xgroup  = aijperm->xgroup;
344:   nzgroup = aijperm->nzgroup;

346: #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)
347:   fortranmultaijperm_(&m,x,ii,aj,aa,y);
348: #else

350:   for (igroup=0; igroup<ngroup; igroup++) {
351:     jstart = xgroup[igroup];
352:     jend   = xgroup[igroup+1] - 1;
353:     nz     = nzgroup[igroup];

355:     /* Handle the special cases where the number of nonzeros per row
356:      * in the group is either 0 or 1. */
357:     if (nz == 0) {
358:       for (i=jstart; i<=jend; i++) {
359:         y[iperm[i]] = 0.0;
360:       }
361:     } else if (nz == 1) {
362:       for (i=jstart; i<=jend; i++) {
363:         iold    = iperm[i];
364:         ipos    = ai[iold];
365:         y[iold] = aa[ipos] * x[aj[ipos]];
366:       }
367:     } else {

369:       /* We work our way through the current group in chunks of NDIM rows
370:        * at a time. */

372:       for (istart=jstart; istart<=jend; istart+=NDIM) {
373:         /* Figure out where the chunk of 'isize' rows ends in iperm.
 374:          * 'isize' may of course be less than NDIM for the last chunk. */
375:         iend = istart + (NDIM - 1);

377:         if (iend > jend) iend = jend;

379:         isize = iend - istart + 1;

381:         /* Initialize the yp[] array that will be used to hold part of
382:          * the permuted results vector, and figure out where in aa each
383:          * row of the chunk will begin. */
384:         for (i=0; i<isize; i++) {
385:           iold = iperm[istart + i];
386:           /* iold is a row number from the matrix A *before* reordering. */
387:           ip[i] = ai[iold];
388:           /* ip[i] tells us where the ith row of the chunk begins in aa. */
389:           yp[i] = (PetscScalar) 0.0;
390:         }

 392:         /* If the number of nonzeros per row exceeds the number of rows in
393:          * the chunk, we should vectorize along nz, that is, perform the
394:          * mat-vec one row at a time as in the usual CSR case. */
395:         if (nz > isize) {
396: #if defined(PETSC_HAVE_CRAY_VECTOR)
397: #pragma _CRI preferstream
398: #endif
399:           for (i=0; i<isize; i++) {
400: #if defined(PETSC_HAVE_CRAY_VECTOR)
401: #pragma _CRI prefervector
402: #endif

404: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
405:             vec_y = _mm512_setzero_pd();
406:             ipos = ip[i];
407:             for (j=0; j<(nz>>3); j++) {
408:               vec_idx  = _mm256_loadu_si256((__m256i const*)&aj[ipos]);
409:               vec_vals = _mm512_loadu_pd(&aa[ipos]);
410:               vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8);
411:               vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y);
412:               ipos += 8;
413:             }
414:             if ((nz&0x07)>2) {
415:               mask     = (__mmask8)(0xff >> (8-(nz&0x07)));
416:               vec_idx  = _mm256_loadu_si256((__m256i const*)&aj[ipos]);
417:               vec_vals = _mm512_loadu_pd(&aa[ipos]);
418:               vec_x    = _mm512_mask_i32gather_pd(vec_x,mask,vec_idx,x,_MM_SCALE_8);
419:               vec_y    = _mm512_mask3_fmadd_pd(vec_x,vec_vals,vec_y,mask);
420:             } else if ((nz&0x07)==2) {
421:               yp[i] += aa[ipos]*x[aj[ipos]];
422:               yp[i] += aa[ipos+1]*x[aj[ipos+1]];
423:             } else if ((nz&0x07)==1) {
424:               yp[i] += aa[ipos]*x[aj[ipos]];
425:             }
426:             yp[i] += _mm512_reduce_add_pd(vec_y);
427: #else
428:             for (j=0; j<nz; j++) {
429:               ipos   = ip[i] + j;
430:               yp[i] += aa[ipos] * x[aj[ipos]];
431:             }
432: #endif
433:           }
434:         } else {
435:           /* Otherwise, there are enough rows in the chunk to make it
436:            * worthwhile to vectorize across the rows, that is, to do the
437:            * matvec by operating with "columns" of the chunk. */
438:           for (j=0; j<nz; j++) {
439: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
440:             vec_j = _mm256_set1_epi32(j);
441:             for (i=0; i<((isize>>3)<<3); i+=8) {
442:               vec_y    = _mm512_loadu_pd(&yp[i]);
443:               vec_ipos = _mm256_loadu_si256((__m256i const*)&ip[i]);
444:               vec_ipos = _mm256_add_epi32(vec_ipos,vec_j);
445:               vec_idx  = _mm256_i32gather_epi32(aj,vec_ipos,_MM_SCALE_4);
446:               vec_vals = _mm512_i32gather_pd(vec_ipos,aa,_MM_SCALE_8);
447:               vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8);
448:               vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y);
449:               _mm512_storeu_pd(&yp[i],vec_y);
450:             }
451:             for (i=isize-(isize&0x07); i<isize; i++) {
452:               ipos = ip[i]+j;
453:               yp[i] += aa[ipos]*x[aj[ipos]];
454:             }
455: #else
456:             for (i=0; i<isize; i++) {
457:               ipos   = ip[i] + j;
458:               yp[i] += aa[ipos] * x[aj[ipos]];
459:             }
460: #endif
461:           }
462:         }

464: #if defined(PETSC_HAVE_CRAY_VECTOR)
465: #pragma _CRI ivdep
466: #endif
467:         /* Put results from yp[] into non-permuted result vector y. */
468:         for (i=0; i<isize; i++) {
469:           y[iperm[istart+i]] = yp[i];
470:         }
471:       } /* End processing chunk of isize rows of a group. */
472:     } /* End handling matvec for chunk with nz > 1. */
473:   } /* End loop over igroup. */
474: #endif
475:   PetscLogFlops(PetscMax(2.0*a->nz - A->rmap->n,0));
476:   VecRestoreArrayRead(xx,&x);
477:   VecRestoreArray(yy,&y);
478:   return(0);
479: }


482: /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx.
 483:  * Note that the names I used to designate the vectors differ from those
484:  * used in MatMultAdd_SeqAIJ().  I did this to keep my notation consistent
485:  * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */
486: /*
487:     I hate having virtually identical code for the mult and the multadd!!!
488: */
489: PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A,Vec xx,Vec ww,Vec yy)
490: {
491:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
492:   const PetscScalar *x;
493:   PetscScalar       *y,*w;
494:   const MatScalar   *aa;
495:   PetscErrorCode    ierr;
496:   const PetscInt    *aj,*ai;
497: #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
498:   PetscInt i,j;
499: #endif

501:   /* Variables that don't appear in MatMultAdd_SeqAIJ. */
502:   Mat_SeqAIJPERM * aijperm;
503:   PetscInt       *iperm;    /* Points to the permutation vector. */
504:   PetscInt       *xgroup;
 505:   /* Denotes where groups of rows with the same number of nonzeros
506:    * begin and end in iperm. */
507:   PetscInt *nzgroup;
508:   PetscInt ngroup;
509:   PetscInt igroup;
510:   PetscInt jstart,jend;
511:   /* jstart is used in loops to denote the position in iperm where a
512:    * group starts; jend denotes the position where it ends.
513:    * (jend + 1 is where the next group starts.) */
514:   PetscInt    iold,nz;
515:   PetscInt    istart,iend,isize;
516:   PetscInt    ipos;
517:   PetscScalar yp[NDIM];
518:   PetscInt    ip[NDIM];
519:   /* yp[] and ip[] are treated as vector "registers" for performing
520:    * the mat-vec. */

522: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
523: #pragma disjoint(*x,*y,*aa)
524: #endif

527:   VecGetArrayRead(xx,&x);
528:   VecGetArrayPair(yy,ww,&y,&w);

530:   aj = a->j;   /* aj[k] gives column index for element aa[k]. */
531:   aa = a->a;   /* Nonzero elements stored row-by-row. */
532:   ai = a->i;   /* ai[k] is the position in aa and aj where row k starts. */

534:   /* Get the info we need about the permutations and groupings. */
535:   aijperm = (Mat_SeqAIJPERM*) A->spptr;
536:   iperm   = aijperm->iperm;
537:   ngroup  = aijperm->ngroup;
538:   xgroup  = aijperm->xgroup;
539:   nzgroup = aijperm->nzgroup;

541: #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
542:   fortranmultaddaijperm_(&m,x,ii,aj,aa,y,w);
543: #else

545:   for (igroup=0; igroup<ngroup; igroup++) {
546:     jstart = xgroup[igroup];
547:     jend   = xgroup[igroup+1] - 1;

549:     nz = nzgroup[igroup];

551:     /* Handle the special cases where the number of nonzeros per row
552:      * in the group is either 0 or 1. */
553:     if (nz == 0) {
554:       for (i=jstart; i<=jend; i++) {
555:         iold    = iperm[i];
556:         y[iold] = w[iold];
557:       }
558:     }
559:     else if (nz == 1) {
560:       for (i=jstart; i<=jend; i++) {
561:         iold    = iperm[i];
562:         ipos    = ai[iold];
563:         y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
564:       }
565:     }
566:     /* For the general case: */
567:     else {

569:       /* We work our way through the current group in chunks of NDIM rows
570:        * at a time. */

572:       for (istart=jstart; istart<=jend; istart+=NDIM) {
573:         /* Figure out where the chunk of 'isize' rows ends in iperm.
 574:          * 'isize' may of course be less than NDIM for the last chunk. */
575:         iend = istart + (NDIM - 1);
576:         if (iend > jend) iend = jend;
577:         isize = iend - istart + 1;

579:         /* Initialize the yp[] array that will be used to hold part of
580:          * the permuted results vector, and figure out where in aa each
581:          * row of the chunk will begin. */
582:         for (i=0; i<isize; i++) {
583:           iold = iperm[istart + i];
584:           /* iold is a row number from the matrix A *before* reordering. */
585:           ip[i] = ai[iold];
586:           /* ip[i] tells us where the ith row of the chunk begins in aa. */
587:           yp[i] = w[iold];
588:         }

 590:         /* If the number of nonzeros per row exceeds the number of rows in
591:          * the chunk, we should vectorize along nz, that is, perform the
592:          * mat-vec one row at a time as in the usual CSR case. */
593:         if (nz > isize) {
594: #if defined(PETSC_HAVE_CRAY_VECTOR)
595: #pragma _CRI preferstream
596: #endif
597:           for (i=0; i<isize; i++) {
598: #if defined(PETSC_HAVE_CRAY_VECTOR)
599: #pragma _CRI prefervector
600: #endif
601:             for (j=0; j<nz; j++) {
602:               ipos   = ip[i] + j;
603:               yp[i] += aa[ipos] * x[aj[ipos]];
604:             }
605:           }
606:         }
607:         /* Otherwise, there are enough rows in the chunk to make it
608:          * worthwhile to vectorize across the rows, that is, to do the
609:          * matvec by operating with "columns" of the chunk. */
610:         else {
611:           for (j=0; j<nz; j++) {
612:             for (i=0; i<isize; i++) {
613:               ipos   = ip[i] + j;
614:               yp[i] += aa[ipos] * x[aj[ipos]];
615:             }
616:           }
617:         }

619: #if defined(PETSC_HAVE_CRAY_VECTOR)
620: #pragma _CRI ivdep
621: #endif
622:         /* Put results from yp[] into non-permuted result vector y. */
623:         for (i=0; i<isize; i++) {
624:           y[iperm[istart+i]] = yp[i];
625:         }
626:       } /* End processing chunk of isize rows of a group. */

628:     } /* End handling matvec for chunk with nz > 1. */
629:   } /* End loop over igroup. */

631: #endif
632:   PetscLogFlops(2.0*a->nz);
633:   VecRestoreArrayRead(xx,&x);
634:   VecRestoreArrayPair(yy,ww,&y,&w);
635:   return(0);
636: }

638: /* This function prototype is needed in MatConvert_SeqAIJ_SeqAIJPERM(), below. */
639: PETSC_INTERN PetscErrorCode MatPtAP_IS_XAIJ(Mat,Mat,MatReuse,PetscReal,Mat*);

641: /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a
642:  * SeqAIJPERM matrix.  This routine is called by the MatCreate_SeqAIJPERM()
643:  * routine, but can also be used to convert an assembled SeqAIJ matrix
644:  * into a SeqAIJPERM one. */
645: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A,MatType type,MatReuse reuse,Mat *newmat)
646: {
648:   Mat            B = *newmat;
649:   Mat_SeqAIJPERM *aijperm;
650:   PetscBool      sametype;

653:   if (reuse == MAT_INITIAL_MATRIX) {
654:     MatDuplicate(A,MAT_COPY_VALUES,&B);
655:   }
656:   PetscObjectTypeCompare((PetscObject)A,type,&sametype);
657:   if (sametype) return(0);

659:   PetscNewLog(B,&aijperm);
660:   B->spptr = (void*) aijperm;

662:   /* Set function pointers for methods that we inherit from AIJ but override. */
663:   B->ops->duplicate   = MatDuplicate_SeqAIJPERM;
664:   B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM;
665:   B->ops->destroy     = MatDestroy_SeqAIJPERM;
666:   B->ops->mult        = MatMult_SeqAIJPERM;
667:   B->ops->multadd     = MatMultAdd_SeqAIJPERM;

669:   aijperm->nonzerostate = -1;  /* this will trigger the generation of the permutation information the first time through MatAssembly()*/
670:   /* If A has already been assembled, compute the permutation. */
671:   if (A->assembled) {
672:     MatSeqAIJPERM_create_perm(B);
673:   }

675:   PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);
676:   PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijperm_C",MatMatMult_SeqDense_SeqAIJ);
677:   PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijperm_C",MatMatMultSymbolic_SeqDense_SeqAIJ);
678:   PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijperm_C",MatMatMultNumeric_SeqDense_SeqAIJ);
679:   PetscObjectComposeFunction((PetscObject)B,"MatPtAP_is_seqaijperm_C",MatPtAP_IS_XAIJ);

681:   PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJPERM);
682:   *newmat = B;
683:   return(0);
684: }

686: /*@C
687:    MatCreateSeqAIJPERM - Creates a sparse matrix of type SEQAIJPERM.
688:    This type inherits from AIJ, but calculates some additional permutation
689:    information that is used to allow better vectorization of some
690:    operations.  At the cost of increased storage, the AIJ formatted
691:    matrix can be copied to a format in which pieces of the matrix are
692:    stored in ELLPACK format, allowing the vectorized matrix multiply
693:    routine to use stride-1 memory accesses.  As with the AIJ type, it is
694:    important to preallocate matrix storage in order to get good assembly
695:    performance.

697:    Collective on MPI_Comm

699:    Input Parameters:
700: +  comm - MPI communicator, set to PETSC_COMM_SELF
701: .  m - number of rows
702: .  n - number of columns
703: .  nz - number of nonzeros per row (same for all rows)
704: -  nnz - array containing the number of nonzeros in the various rows
705:          (possibly different for each row) or NULL

707:    Output Parameter:
708: .  A - the matrix

710:    Notes:
711:    If nnz is given then nz is ignored

713:    Level: intermediate

715: .keywords: matrix, cray, sparse, parallel

717: .seealso: MatCreate(), MatCreateMPIAIJPERM(), MatSetValues()
718: @*/
719: PetscErrorCode  MatCreateSeqAIJPERM(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
720: {

724:   MatCreate(comm,A);
725:   MatSetSizes(*A,m,n,m,n);
726:   MatSetType(*A,MATSEQAIJPERM);
727:   MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);
728:   return(0);
729: }
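/* A minimal calling sketch for the routine above; the dimensions (100) and
 * the per-row nonzero estimate (5) are placeholder values:
 *
 *   Mat A;
 *   MatCreateSeqAIJPERM(PETSC_COMM_SELF,100,100,5,NULL,&A);
 *   // ... MatSetValues(), MatAssemblyBegin(), MatAssemblyEnd(), MatMult() ...
 *   MatDestroy(&A);
 */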

731: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A)
732: {

736:   MatSetType(A,MATSEQAIJ);
737:   MatConvert_SeqAIJ_SeqAIJPERM(A,MATSEQAIJPERM,MAT_INPLACE_MATRIX,&A);
738:   return(0);
739: }