Actual source code: csrperm.c

petsc-3.8.4 2018-03-24
Report Typos and Errors

  2: /*
  3:   Defines basic operations for the MATSEQAIJPERM matrix class.
  4:   This class is derived from the MATSEQAIJ class and retains the
  5:   compressed row storage (aka Yale sparse matrix format) but augments
  6:   it with some permutation information that enables some operations
  7:   to be more vectorizable.  A physically rearranged copy of the matrix
  8:   may be stored if the user desires.

 10:   Eventually a variety of permutations may be supported.
 11: */

 13:  #include <../src/mat/impls/aij/seq/aij.h>

 15: #define NDIM 512
 16: /* NDIM specifies how many rows at a time we should work with when
 17:  * performing the vectorized mat-vec.  This depends on various factors
 18:  * such as vector register length, etc., and I really need to add a
 19:  * way for the user (or the library) to tune this.  I'm setting it to
 20:  * 512 for now since that is what Ed D'Azevedo was using in his Fortran
 21:  * routines. */

 23: typedef struct {
 24:   PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */

 26:   PetscInt         ngroup;
 27:   PetscInt         *xgroup;
 28:   /* Denotes where groups of rows with same number of nonzeros
 29:    * begin and end, i.e., xgroup[i] gives us the position in iperm[]
 30:    * where the ith group begins. */

 32:   PetscInt         *nzgroup; /*  how many nonzeros each row that is a member of group i has. */
 33:   PetscInt         *iperm;  /* The permutation vector. */

 35:   /* Some of this stuff is for Ed's recursive triangular solve.
 36:    * I'm not sure what I need yet. */
 37:   PetscInt         blocksize;
 38:   PetscInt         nstep;
 39:   PetscInt         *jstart_list;
 40:   PetscInt         *jend_list;
 41:   PetscInt         *action_list;
 42:   PetscInt         *ngroup_list;
 43:   PetscInt         **ipointer_list;
 44:   PetscInt         **xgroup_list;
 45:   PetscInt         **nzgroup_list;
 46:   PetscInt         **iperm_list;
 47: } Mat_SeqAIJPERM;

 49: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
 50: {
 51:   /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */
 52:   /* so we will ignore 'MatType type'. */
 54:   Mat            B       = *newmat;
 55:   Mat_SeqAIJPERM *aijperm=(Mat_SeqAIJPERM*)A->spptr;

 58:   if (reuse == MAT_INITIAL_MATRIX) {
 59:     MatDuplicate(A,MAT_COPY_VALUES,&B);
 60:     aijperm=(Mat_SeqAIJPERM*)B->spptr;
 61:   }

 63:   /* Reset the original function pointers. */
 64:   B->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
 65:   B->ops->destroy     = MatDestroy_SeqAIJ;
 66:   B->ops->duplicate   = MatDuplicate_SeqAIJ;
 67:   B->ops->mult        = MatMult_SeqAIJ;
 68:   B->ops->multadd     = MatMultAdd_SeqAIJ;

 70:   PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",NULL);
 71:   PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijperm_C",NULL);
 72:   PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijperm_C",NULL);
 73:   PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijperm_C",NULL);

 75:   /* Free everything in the Mat_SeqAIJPERM data structure.*/
 76:   PetscFree(aijperm->xgroup);
 77:   PetscFree(aijperm->nzgroup);
 78:   PetscFree(aijperm->iperm);
 79:   PetscFree(B->spptr);

 81:   /* Change the type of B to MATSEQAIJ. */
 82:   PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);

 84:   *newmat = B;
 85:   return(0);
 86: }

 88: PetscErrorCode MatDestroy_SeqAIJPERM(Mat A)
 89: {
 91:   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;

 94:   if (aijperm) {
 95:     /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */
 96:     PetscFree(aijperm->xgroup);
 97:     PetscFree(aijperm->nzgroup);
 98:     PetscFree(aijperm->iperm);
 99:     PetscFree(A->spptr);
100:   }
101:   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
102:    * to destroy everything that remains. */
103:   PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);
104:   /* Note that I don't call MatSetType().  I believe this is because that
105:    * is only to be called when *building* a matrix.  I could be wrong, but
106:    * that is how things work for the SuperLU matrix class. */
107:   MatDestroy_SeqAIJ(A);
108:   return(0);
109: }

111: PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M)
112: {
114:   Mat_SeqAIJPERM *aijperm      = (Mat_SeqAIJPERM*) A->spptr;
115:   Mat_SeqAIJPERM *aijperm_dest;
116:   PetscBool      perm;

119:   MatDuplicate_SeqAIJ(A,op,M);
120:   PetscObjectTypeCompare((PetscObject)*M,MATSEQAIJPERM,&perm);
121:   if (perm) {
122:     aijperm_dest = (Mat_SeqAIJPERM *) (*M)->spptr;
123:     PetscFree(aijperm_dest->xgroup);
124:     PetscFree(aijperm_dest->nzgroup);
125:     PetscFree(aijperm_dest->iperm);
126:   } else {
127:     PetscNewLog(*M,&aijperm_dest);
128:     (*M)->spptr = (void*) aijperm_dest;
129:     PetscObjectChangeTypeName((PetscObject)*M,MATSEQAIJPERM);
130:     PetscObjectComposeFunction((PetscObject)*M,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);
131:     PetscObjectComposeFunction((PetscObject)*M,"MatMatMult_seqdense_seqaijperm_C",MatMatMult_SeqDense_SeqAIJ);
132:     PetscObjectComposeFunction((PetscObject)*M,"MatMatMultSymbolic_seqdense_seqaijperm_C",MatMatMultSymbolic_SeqDense_SeqAIJ);
133:     PetscObjectComposeFunction((PetscObject)*M,"MatMatMultNumeric_seqdense_seqaijperm_C",MatMatMultNumeric_SeqDense_SeqAIJ);
134:   }
135:   PetscMemcpy(aijperm_dest,aijperm,sizeof(Mat_SeqAIJPERM));
136:   /* Allocate space for, and copy the grouping and permutation info.
137:    * I note that when the groups are initially determined in
138:    * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than
139:    * necessary.  But at this point, we know how large they need to be, and
140:    * allocate only the necessary amount of memory.  So the duplicated matrix
141:    * may actually use slightly less storage than the original! */
142:   PetscMalloc1(A->rmap->n, &aijperm_dest->iperm);
143:   PetscMalloc1(aijperm->ngroup+1, &aijperm_dest->xgroup);
144:   PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup);
145:   PetscMemcpy(aijperm_dest->iperm,aijperm->iperm,sizeof(PetscInt)*A->rmap->n);
146:   PetscMemcpy(aijperm_dest->xgroup,aijperm->xgroup,sizeof(PetscInt)*(aijperm->ngroup+1));
147:   PetscMemcpy(aijperm_dest->nzgroup,aijperm->nzgroup,sizeof(PetscInt)*aijperm->ngroup);
148:   return(0);
149: }

151: PetscErrorCode MatSeqAIJPERM_create_perm(Mat A)
152: {
154:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)(A)->data;
155:   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
156:   PetscInt       m;       /* Number of rows in the matrix. */
157:   PetscInt       *ia;       /* From the CSR representation; points to the beginning  of each row. */
158:   PetscInt       maxnz;      /* Maximum number of nonzeros in any row. */
159:   PetscInt       *rows_in_bucket;
160:   /* To construct the permutation, we sort each row into one of maxnz
161:    * buckets based on how many nonzeros are in the row. */
162:   PetscInt       nz;
163:   PetscInt       *nz_in_row;         /* the number of nonzero elements in row k. */
164:   PetscInt       *ipnz;
165:   /* When constructing the iperm permutation vector,
166:    * ipnz[nz] is used to point to the next place in the permutation vector
167:    * that a row with nz nonzero elements should be placed.*/
168:   PetscInt       i, ngroup, istart, ipos;

171:   if (aijperm->nonzerostate == A->nonzerostate) return(0); /* permutation exists and matches current nonzero structure */
172:   aijperm->nonzerostate = A->nonzerostate;
173:  /* Free anything previously put in the Mat_SeqAIJPERM data structure. */
174:   PetscFree(aijperm->xgroup);
175:   PetscFree(aijperm->nzgroup);
176:   PetscFree(aijperm->iperm);

178:   m  = A->rmap->n;
179:   ia = a->i;

181:   /* Allocate the arrays that will hold the permutation vector. */
182:   PetscMalloc1(m, &aijperm->iperm);

184:   /* Allocate some temporary work arrays that will be used in
185:    * calculating the permuation vector and groupings. */
186:   PetscMalloc1(m, &nz_in_row);

188:   /* Now actually figure out the permutation and grouping. */

190:   /* First pass: Determine number of nonzeros in each row, maximum
191:    * number of nonzeros in any row, and how many rows fall into each
192:    * "bucket" of rows with same number of nonzeros. */
193:   maxnz = 0;
194:   for (i=0; i<m; i++) {
195:     nz_in_row[i] = ia[i+1]-ia[i];
196:     if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
197:   }
198:   PetscMalloc1(PetscMax(maxnz,m)+1, &rows_in_bucket);
199:   PetscMalloc1(PetscMax(maxnz,m)+1, &ipnz);

201:   for (i=0; i<=maxnz; i++) {
202:     rows_in_bucket[i] = 0;
203:   }
204:   for (i=0; i<m; i++) {
205:     nz = nz_in_row[i];
206:     rows_in_bucket[nz]++;
207:   }

209:   /* Allocate space for the grouping info.  There will be at most (maxnz + 1)
210:    * groups.  (It is maxnz + 1 instead of simply maxnz because there may be
211:    * rows with no nonzero elements.)  If there are (maxnz + 1) groups,
212:    * then xgroup[] must consist of (maxnz + 2) elements, since the last
213:    * element of xgroup will tell us where the (maxnz + 1)th group ends.
214:    * We allocate space for the maximum number of groups;
215:    * that is potentially a little wasteful, but not too much so.
216:    * Perhaps I should fix it later. */
217:   PetscMalloc1(maxnz+2, &aijperm->xgroup);
218:   PetscMalloc1(maxnz+1, &aijperm->nzgroup);

220:   /* Second pass.  Look at what is in the buckets and create the groupings.
221:    * Note that it is OK to have a group of rows with no non-zero values. */
222:   ngroup = 0;
223:   istart = 0;
224:   for (i=0; i<=maxnz; i++) {
225:     if (rows_in_bucket[i] > 0) {
226:       aijperm->nzgroup[ngroup] = i;
227:       aijperm->xgroup[ngroup]  = istart;
228:       ngroup++;
229:       istart += rows_in_bucket[i];
230:     }
231:   }

233:   aijperm->xgroup[ngroup] = istart;
234:   aijperm->ngroup         = ngroup;

236:   /* Now fill in the permutation vector iperm. */
237:   ipnz[0] = 0;
238:   for (i=0; i<maxnz; i++) {
239:     ipnz[i+1] = ipnz[i] + rows_in_bucket[i];
240:   }

242:   for (i=0; i<m; i++) {
243:     nz                   = nz_in_row[i];
244:     ipos                 = ipnz[nz];
245:     aijperm->iperm[ipos] = i;
246:     ipnz[nz]++;
247:   }

249:   /* Clean up temporary work arrays. */
250:   PetscFree(rows_in_bucket);
251:   PetscFree(ipnz);
252:   PetscFree(nz_in_row);
253:   return(0);
254: }


257: PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode)
258: {
260:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

263:   if (mode == MAT_FLUSH_ASSEMBLY) return(0);

265:   /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some
266:    * extra information, call the AssemblyEnd routine for a MATSEQAIJ.
267:    * I'm not sure if this is the best way to do this, but it avoids
268:    * a lot of code duplication.
269:    * I also note that currently MATSEQAIJPERM doesn't know anything about
270:    * the Mat_CompressedRow data structure that SeqAIJ now uses when there
271:    * are many zero rows.  If the SeqAIJ assembly end routine decides to use
272:    * this, this may break things.  (Don't know... haven't looked at it.) */
273:   a->inode.use = PETSC_FALSE;
274:   MatAssemblyEnd_SeqAIJ(A, mode);

276:   /* Now calculate the permutation and grouping information. */
277:   MatSeqAIJPERM_create_perm(A);
278:   return(0);
279: }

281: PetscErrorCode MatMult_SeqAIJPERM(Mat A,Vec xx,Vec yy)
282: {
283:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
284:   const PetscScalar *x;
285:   PetscScalar       *y;
286:   const MatScalar   *aa;
287:   PetscErrorCode    ierr;
288:   const PetscInt    *aj,*ai;
289: #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking))
290:   PetscInt          i,j;
291: #endif

293:   /* Variables that don't appear in MatMult_SeqAIJ. */
294:   Mat_SeqAIJPERM    *aijperm = (Mat_SeqAIJPERM*) A->spptr;
295:   PetscInt          *iperm;  /* Points to the permutation vector. */
296:   PetscInt          *xgroup;
297:   /* Denotes where groups of rows with same number of nonzeros
298:    * begin and end in iperm. */
299:   PetscInt          *nzgroup;
300:   PetscInt          ngroup;
301:   PetscInt          igroup;
302:   PetscInt          jstart,jend;
303:   /* jstart is used in loops to denote the position in iperm where a
304:    * group starts; jend denotes the position where it ends.
305:    * (jend + 1 is where the next group starts.) */
306:   PetscInt          iold,nz;
307:   PetscInt          istart,iend,isize;
308:   PetscInt          ipos;
309:   PetscScalar       yp[NDIM];
310:   PetscInt          ip[NDIM];    /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */

312: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
313: #pragma disjoint(*x,*y,*aa)
314: #endif

317:   VecGetArrayRead(xx,&x);
318:   VecGetArray(yy,&y);
319:   aj   = a->j;   /* aj[k] gives column index for element aa[k]. */
320:   aa   = a->a; /* Nonzero elements stored row-by-row. */
321:   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */

323:   /* Get the info we need about the permutations and groupings. */
324:   iperm   = aijperm->iperm;
325:   ngroup  = aijperm->ngroup;
326:   xgroup  = aijperm->xgroup;
327:   nzgroup = aijperm->nzgroup;

329: #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)
330:   fortranmultaijperm_(&m,x,ii,aj,aa,y);
331: #else

333:   for (igroup=0; igroup<ngroup; igroup++) {
334:     jstart = xgroup[igroup];
335:     jend   = xgroup[igroup+1] - 1;
336:     nz     = nzgroup[igroup];

338:     /* Handle the special cases where the number of nonzeros per row
339:      * in the group is either 0 or 1. */
340:     if (nz == 0) {
341:       for (i=jstart; i<=jend; i++) {
342:         y[iperm[i]] = 0.0;
343:       }
344:     } else if (nz == 1) {
345:       for (i=jstart; i<=jend; i++) {
346:         iold    = iperm[i];
347:         ipos    = ai[iold];
348:         y[iold] = aa[ipos] * x[aj[ipos]];
349:       }
350:     } else {

352:       /* We work our way through the current group in chunks of NDIM rows
353:        * at a time. */

355:       for (istart=jstart; istart<=jend; istart+=NDIM) {
356:         /* Figure out where the chunk of 'isize' rows ends in iperm.
357:          * 'isize may of course be less than NDIM for the last chunk. */
358:         iend = istart + (NDIM - 1);

360:         if (iend > jend) iend = jend;

362:         isize = iend - istart + 1;

364:         /* Initialize the yp[] array that will be used to hold part of
365:          * the permuted results vector, and figure out where in aa each
366:          * row of the chunk will begin. */
367:         for (i=0; i<isize; i++) {
368:           iold = iperm[istart + i];
369:           /* iold is a row number from the matrix A *before* reordering. */
370:           ip[i] = ai[iold];
371:           /* ip[i] tells us where the ith row of the chunk begins in aa. */
372:           yp[i] = (PetscScalar) 0.0;
373:         }

375:         /* If the number of zeros per row exceeds the number of rows in
376:          * the chunk, we should vectorize along nz, that is, perform the
377:          * mat-vec one row at a time as in the usual CSR case. */
378:         if (nz > isize) {
379: #if defined(PETSC_HAVE_CRAY_VECTOR)
380: #pragma _CRI preferstream
381: #endif
382:           for (i=0; i<isize; i++) {
383: #if defined(PETSC_HAVE_CRAY_VECTOR)
384: #pragma _CRI prefervector
385: #endif
386:             for (j=0; j<nz; j++) {
387:               ipos   = ip[i] + j;
388:               yp[i] += aa[ipos] * x[aj[ipos]];
389:             }
390:           }
391:         } else {
392:           /* Otherwise, there are enough rows in the chunk to make it
393:            * worthwhile to vectorize across the rows, that is, to do the
394:            * matvec by operating with "columns" of the chunk. */
395:           for (j=0; j<nz; j++) {
396:             for (i=0; i<isize; i++) {
397:               ipos   = ip[i] + j;
398:               yp[i] += aa[ipos] * x[aj[ipos]];
399:             }
400:           }
401:         }

403: #if defined(PETSC_HAVE_CRAY_VECTOR)
404: #pragma _CRI ivdep
405: #endif
406:         /* Put results from yp[] into non-permuted result vector y. */
407:         for (i=0; i<isize; i++) {
408:           y[iperm[istart+i]] = yp[i];
409:         }
410:       } /* End processing chunk of isize rows of a group. */
411:     } /* End handling matvec for chunk with nz > 1. */
412:   } /* End loop over igroup. */
413: #endif
414:   PetscLogFlops(PetscMax(2.0*a->nz - A->rmap->n,0));
415:   VecRestoreArrayRead(xx,&x);
416:   VecRestoreArray(yy,&y);
417:   return(0);
418: }


421: /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx.
422:  * Note that the names I used to designate the vectors differs from that
423:  * used in MatMultAdd_SeqAIJ().  I did this to keep my notation consistent
424:  * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */
425: /*
426:     I hate having virtually identical code for the mult and the multadd!!!
427: */
428: PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A,Vec xx,Vec ww,Vec yy)
429: {
430:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
431:   const PetscScalar *x;
432:   PetscScalar       *y,*w;
433:   const MatScalar   *aa;
434:   PetscErrorCode    ierr;
435:   const PetscInt    *aj,*ai;
436: #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
437:   PetscInt i,j;
438: #endif

440:   /* Variables that don't appear in MatMultAdd_SeqAIJ. */
441:   Mat_SeqAIJPERM * aijperm;
442:   PetscInt       *iperm;    /* Points to the permutation vector. */
443:   PetscInt       *xgroup;
444:   /* Denotes where groups of rows with same number of nonzeros
445:    * begin and end in iperm. */
446:   PetscInt *nzgroup;
447:   PetscInt ngroup;
448:   PetscInt igroup;
449:   PetscInt jstart,jend;
450:   /* jstart is used in loops to denote the position in iperm where a
451:    * group starts; jend denotes the position where it ends.
452:    * (jend + 1 is where the next group starts.) */
453:   PetscInt    iold,nz;
454:   PetscInt    istart,iend,isize;
455:   PetscInt    ipos;
456:   PetscScalar yp[NDIM];
457:   PetscInt    ip[NDIM];
458:   /* yp[] and ip[] are treated as vector "registers" for performing
459:    * the mat-vec. */

461: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
462: #pragma disjoint(*x,*y,*aa)
463: #endif

466:   VecGetArrayRead(xx,&x);
467:   VecGetArrayPair(yy,ww,&y,&w);

469:   aj = a->j;   /* aj[k] gives column index for element aa[k]. */
470:   aa = a->a;   /* Nonzero elements stored row-by-row. */
471:   ai = a->i;   /* ai[k] is the position in aa and aj where row k starts. */

473:   /* Get the info we need about the permutations and groupings. */
474:   aijperm = (Mat_SeqAIJPERM*) A->spptr;
475:   iperm   = aijperm->iperm;
476:   ngroup  = aijperm->ngroup;
477:   xgroup  = aijperm->xgroup;
478:   nzgroup = aijperm->nzgroup;

480: #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
481:   fortranmultaddaijperm_(&m,x,ii,aj,aa,y,w);
482: #else

484:   for (igroup=0; igroup<ngroup; igroup++) {
485:     jstart = xgroup[igroup];
486:     jend   = xgroup[igroup+1] - 1;

488:     nz = nzgroup[igroup];

490:     /* Handle the special cases where the number of nonzeros per row
491:      * in the group is either 0 or 1. */
492:     if (nz == 0) {
493:       for (i=jstart; i<=jend; i++) {
494:         iold    = iperm[i];
495:         y[iold] = w[iold];
496:       }
497:     }
498:     else if (nz == 1) {
499:       for (i=jstart; i<=jend; i++) {
500:         iold    = iperm[i];
501:         ipos    = ai[iold];
502:         y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
503:       }
504:     }
505:     /* For the general case: */
506:     else {

508:       /* We work our way through the current group in chunks of NDIM rows
509:        * at a time. */

511:       for (istart=jstart; istart<=jend; istart+=NDIM) {
512:         /* Figure out where the chunk of 'isize' rows ends in iperm.
513:          * 'isize may of course be less than NDIM for the last chunk. */
514:         iend = istart + (NDIM - 1);
515:         if (iend > jend) iend = jend;
516:         isize = iend - istart + 1;

518:         /* Initialize the yp[] array that will be used to hold part of
519:          * the permuted results vector, and figure out where in aa each
520:          * row of the chunk will begin. */
521:         for (i=0; i<isize; i++) {
522:           iold = iperm[istart + i];
523:           /* iold is a row number from the matrix A *before* reordering. */
524:           ip[i] = ai[iold];
525:           /* ip[i] tells us where the ith row of the chunk begins in aa. */
526:           yp[i] = w[iold];
527:         }

529:         /* If the number of zeros per row exceeds the number of rows in
530:          * the chunk, we should vectorize along nz, that is, perform the
531:          * mat-vec one row at a time as in the usual CSR case. */
532:         if (nz > isize) {
533: #if defined(PETSC_HAVE_CRAY_VECTOR)
534: #pragma _CRI preferstream
535: #endif
536:           for (i=0; i<isize; i++) {
537: #if defined(PETSC_HAVE_CRAY_VECTOR)
538: #pragma _CRI prefervector
539: #endif
540:             for (j=0; j<nz; j++) {
541:               ipos   = ip[i] + j;
542:               yp[i] += aa[ipos] * x[aj[ipos]];
543:             }
544:           }
545:         }
546:         /* Otherwise, there are enough rows in the chunk to make it
547:          * worthwhile to vectorize across the rows, that is, to do the
548:          * matvec by operating with "columns" of the chunk. */
549:         else {
550:           for (j=0; j<nz; j++) {
551:             for (i=0; i<isize; i++) {
552:               ipos   = ip[i] + j;
553:               yp[i] += aa[ipos] * x[aj[ipos]];
554:             }
555:           }
556:         }

558: #if defined(PETSC_HAVE_CRAY_VECTOR)
559: #pragma _CRI ivdep
560: #endif
561:         /* Put results from yp[] into non-permuted result vector y. */
562:         for (i=0; i<isize; i++) {
563:           y[iperm[istart+i]] = yp[i];
564:         }
565:       } /* End processing chunk of isize rows of a group. */

567:     } /* End handling matvec for chunk with nz > 1. */
568:   } /* End loop over igroup. */

570: #endif
571:   PetscLogFlops(2.0*a->nz);
572:   VecRestoreArrayRead(xx,&x);
573:   VecRestoreArrayPair(yy,ww,&y,&w);
574:   return(0);
575: }


578: /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a
579:  * SeqAIJPERM matrix.  This routine is called by the MatCreate_SeqAIJPERM()
580:  * routine, but can also be used to convert an assembled SeqAIJ matrix
581:  * into a SeqAIJPERM one. */
582: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A,MatType type,MatReuse reuse,Mat *newmat)
583: {
585:   Mat            B = *newmat;
586:   Mat_SeqAIJPERM *aijperm;
587:   PetscBool      sametype;

590:   if (reuse == MAT_INITIAL_MATRIX) {
591:     MatDuplicate(A,MAT_COPY_VALUES,&B);
592:   }
593:   PetscObjectTypeCompare((PetscObject)A,type,&sametype);
594:   if (sametype) return(0);

596:   PetscNewLog(B,&aijperm);
597:   B->spptr = (void*) aijperm;

599:   /* Set function pointers for methods that we inherit from AIJ but override. */
600:   B->ops->duplicate   = MatDuplicate_SeqAIJPERM;
601:   B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM;
602:   B->ops->destroy     = MatDestroy_SeqAIJPERM;
603:   B->ops->mult        = MatMult_SeqAIJPERM;
604:   B->ops->multadd     = MatMultAdd_SeqAIJPERM;

606:   aijperm->nonzerostate = -1;  /* this will trigger the generation of the permutation information the first time through MatAssembly()*/
607:   /* If A has already been assembled, compute the permutation. */
608:   if (A->assembled) {
609:     MatSeqAIJPERM_create_perm(B);
610:   }

612:   PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);
613:   PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijperm_C",MatMatMult_SeqDense_SeqAIJ);
614:   PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijperm_C",MatMatMultSymbolic_SeqDense_SeqAIJ);
615:   PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijperm_C",MatMatMultNumeric_SeqDense_SeqAIJ);

617:   PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJPERM);
618:   *newmat = B;
619:   return(0);
620: }

622: /*@C
623:    MatCreateSeqAIJPERM - Creates a sparse matrix of type SEQAIJPERM.
624:    This type inherits from AIJ, but calculates some additional permutation
625:    information that is used to allow better vectorization of some
626:    operations.  At the cost of increased storage, the AIJ formatted
627:    matrix can be copied to a format in which pieces of the matrix are
628:    stored in ELLPACK format, allowing the vectorized matrix multiply
629:    routine to use stride-1 memory accesses.  As with the AIJ type, it is
630:    important to preallocate matrix storage in order to get good assembly
631:    performance.

633:    Collective on MPI_Comm

635:    Input Parameters:
636: +  comm - MPI communicator, set to PETSC_COMM_SELF
637: .  m - number of rows
638: .  n - number of columns
639: .  nz - number of nonzeros per row (same for all rows)
640: -  nnz - array containing the number of nonzeros in the various rows
641:          (possibly different for each row) or NULL

643:    Output Parameter:
644: .  A - the matrix

646:    Notes:
647:    If nnz is given then nz is ignored

649:    Level: intermediate

651: .keywords: matrix, cray, sparse, parallel

653: .seealso: MatCreate(), MatCreateMPIAIJPERM(), MatSetValues()
654: @*/
655: PetscErrorCode  MatCreateSeqAIJPERM(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
656: {

660:   MatCreate(comm,A);
661:   MatSetSizes(*A,m,n,m,n);
662:   MatSetType(*A,MATSEQAIJPERM);
663:   MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);
664:   return(0);
665: }

667: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A)
668: {

672:   MatSetType(A,MATSEQAIJ);
673:   MatConvert_SeqAIJ_SeqAIJPERM(A,MATSEQAIJPERM,MAT_INPLACE_MATRIX,&A);
674:   return(0);
675: }