Actual source code: aijperm.c
2: /*
3: Defines basic operations for the MATSEQAIJPERM matrix class.
4: This class is derived from the MATSEQAIJ class and retains the
5: compressed row storage (aka Yale sparse matrix format) but augments
6: it with some permutation information that enables some operations
7: to be more vectorizable. A physically rearranged copy of the matrix
8: may be stored if the user desires.
10: Eventually a variety of permutations may be supported.
11: */
13: #include <../src/mat/impls/aij/seq/aij.h>
15: #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
16: #include <immintrin.h>
18: #if !defined(_MM_SCALE_8)
19: #define _MM_SCALE_8 8
20: #endif
21: #if !defined(_MM_SCALE_4)
22: #define _MM_SCALE_4 4
23: #endif
24: #endif
26: #define NDIM 512
27: /* NDIM specifies how many rows at a time we should work with when
28: * performing the vectorized mat-vec. This depends on various factors
29: * such as vector register length, etc., and I really need to add a
30: * way for the user (or the library) to tune this. I'm setting it to
31: * 512 for now since that is what Ed D'Azevedo was using in his Fortran
32: * routines. */
34: typedef struct {
35: PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */
37: PetscInt ngroup;
38: PetscInt *xgroup;
39: /* Denotes where groups of rows with same number of nonzeros
40: * begin and end, i.e., xgroup[i] gives us the position in iperm[]
41: * where the ith group begins. */
43: PetscInt *nzgroup; /* how many nonzeros each row that is a member of group i has. */
44: PetscInt *iperm; /* The permutation vector. */
46: /* Some of this stuff is for Ed's recursive triangular solve.
47: * I'm not sure what I need yet. */
48: PetscInt blocksize;
49: PetscInt nstep;
50: PetscInt *jstart_list;
51: PetscInt *jend_list;
52: PetscInt *action_list;
53: PetscInt *ngroup_list;
54: PetscInt **ipointer_list;
55: PetscInt **xgroup_list;
56: PetscInt **nzgroup_list;
57: PetscInt **iperm_list;
58: } Mat_SeqAIJPERM;
60: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
61: {
62: /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */
63: /* so we will ignore 'MatType type'. */
64: Mat B = *newmat;
65: Mat_SeqAIJPERM *aijperm=(Mat_SeqAIJPERM*)A->spptr;
67: if (reuse == MAT_INITIAL_MATRIX) {
68: MatDuplicate(A,MAT_COPY_VALUES,&B);
69: aijperm=(Mat_SeqAIJPERM*)B->spptr;
70: }
72: /* Reset the original function pointers. */
73: B->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
74: B->ops->destroy = MatDestroy_SeqAIJ;
75: B->ops->duplicate = MatDuplicate_SeqAIJ;
76: B->ops->mult = MatMult_SeqAIJ;
77: B->ops->multadd = MatMultAdd_SeqAIJ;
79: PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",NULL);
81: /* Free everything in the Mat_SeqAIJPERM data structure.*/
82: PetscFree(aijperm->xgroup);
83: PetscFree(aijperm->nzgroup);
84: PetscFree(aijperm->iperm);
85: PetscFree(B->spptr);
87: /* Change the type of B to MATSEQAIJ. */
88: PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);
90: *newmat = B;
91: return 0;
92: }
94: PetscErrorCode MatDestroy_SeqAIJPERM(Mat A)
95: {
96: Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
98: if (aijperm) {
99: /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */
100: PetscFree(aijperm->xgroup);
101: PetscFree(aijperm->nzgroup);
102: PetscFree(aijperm->iperm);
103: PetscFree(A->spptr);
104: }
105: /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
106: * to destroy everything that remains. */
107: PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);
108: /* Note that I don't call MatSetType(). I believe this is because that
109: * is only to be called when *building* a matrix. I could be wrong, but
110: * that is how things work for the SuperLU matrix class. */
111: MatDestroy_SeqAIJ(A);
112: return 0;
113: }
115: PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M)
116: {
117: Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
118: Mat_SeqAIJPERM *aijperm_dest;
119: PetscBool perm;
121: MatDuplicate_SeqAIJ(A,op,M);
122: PetscObjectTypeCompare((PetscObject)*M,MATSEQAIJPERM,&perm);
123: if (perm) {
124: aijperm_dest = (Mat_SeqAIJPERM *) (*M)->spptr;
125: PetscFree(aijperm_dest->xgroup);
126: PetscFree(aijperm_dest->nzgroup);
127: PetscFree(aijperm_dest->iperm);
128: } else {
129: PetscNewLog(*M,&aijperm_dest);
130: (*M)->spptr = (void*) aijperm_dest;
131: PetscObjectChangeTypeName((PetscObject)*M,MATSEQAIJPERM);
132: PetscObjectComposeFunction((PetscObject)*M,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);
133: }
134: PetscArraycpy(aijperm_dest,aijperm,1);
135: /* Allocate space for, and copy the grouping and permutation info.
136: * I note that when the groups are initially determined in
137: * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than
138: * necessary. But at this point, we know how large they need to be, and
139: * allocate only the necessary amount of memory. So the duplicated matrix
140: * may actually use slightly less storage than the original! */
141: PetscMalloc1(A->rmap->n, &aijperm_dest->iperm);
142: PetscMalloc1(aijperm->ngroup+1, &aijperm_dest->xgroup);
143: PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup);
144: PetscArraycpy(aijperm_dest->iperm,aijperm->iperm,A->rmap->n);
145: PetscArraycpy(aijperm_dest->xgroup,aijperm->xgroup,aijperm->ngroup+1);
146: PetscArraycpy(aijperm_dest->nzgroup,aijperm->nzgroup,aijperm->ngroup);
147: return 0;
148: }
150: PetscErrorCode MatSeqAIJPERM_create_perm(Mat A)
151: {
152: Mat_SeqAIJ *a = (Mat_SeqAIJ*)(A)->data;
153: Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
154: PetscInt m; /* Number of rows in the matrix. */
155: PetscInt *ia; /* From the CSR representation; points to the beginning of each row. */
156: PetscInt maxnz; /* Maximum number of nonzeros in any row. */
157: PetscInt *rows_in_bucket;
158: /* To construct the permutation, we sort each row into one of maxnz
159: * buckets based on how many nonzeros are in the row. */
160: PetscInt nz;
161: PetscInt *nz_in_row; /* the number of nonzero elements in row k. */
162: PetscInt *ipnz;
163: /* When constructing the iperm permutation vector,
164: * ipnz[nz] is used to point to the next place in the permutation vector
165: * that a row with nz nonzero elements should be placed.*/
166: PetscInt i, ngroup, istart, ipos;
168: if (aijperm->nonzerostate == A->nonzerostate) return 0; /* permutation exists and matches current nonzero structure */
169: aijperm->nonzerostate = A->nonzerostate;
170: /* Free anything previously put in the Mat_SeqAIJPERM data structure. */
171: PetscFree(aijperm->xgroup);
172: PetscFree(aijperm->nzgroup);
173: PetscFree(aijperm->iperm);
175: m = A->rmap->n;
176: ia = a->i;
178: /* Allocate the arrays that will hold the permutation vector. */
179: PetscMalloc1(m, &aijperm->iperm);
181: /* Allocate some temporary work arrays that will be used in
182: * calculating the permutation vector and groupings. */
183: PetscMalloc1(m, &nz_in_row);
185: /* Now actually figure out the permutation and grouping. */
187: /* First pass: Determine number of nonzeros in each row, maximum
188: * number of nonzeros in any row, and how many rows fall into each
189: * "bucket" of rows with same number of nonzeros. */
190: maxnz = 0;
191: for (i=0; i<m; i++) {
192: nz_in_row[i] = ia[i+1]-ia[i];
193: if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
194: }
195: PetscMalloc1(PetscMax(maxnz,m)+1, &rows_in_bucket);
196: PetscMalloc1(PetscMax(maxnz,m)+1, &ipnz);
198: for (i=0; i<=maxnz; i++) {
199: rows_in_bucket[i] = 0;
200: }
201: for (i=0; i<m; i++) {
202: nz = nz_in_row[i];
203: rows_in_bucket[nz]++;
204: }
206: /* Allocate space for the grouping info. There will be at most (maxnz + 1)
207: * groups. (It is maxnz + 1 instead of simply maxnz because there may be
208: * rows with no nonzero elements.) If there are (maxnz + 1) groups,
209: * then xgroup[] must consist of (maxnz + 2) elements, since the last
210: * element of xgroup will tell us where the (maxnz + 1)th group ends.
211: * We allocate space for the maximum number of groups;
212: * that is potentially a little wasteful, but not too much so.
213: * Perhaps I should fix it later. */
214: PetscMalloc1(maxnz+2, &aijperm->xgroup);
215: PetscMalloc1(maxnz+1, &aijperm->nzgroup);
217: /* Second pass. Look at what is in the buckets and create the groupings.
218: * Note that it is OK to have a group of rows with no non-zero values. */
219: ngroup = 0;
220: istart = 0;
221: for (i=0; i<=maxnz; i++) {
222: if (rows_in_bucket[i] > 0) {
223: aijperm->nzgroup[ngroup] = i;
224: aijperm->xgroup[ngroup] = istart;
225: ngroup++;
226: istart += rows_in_bucket[i];
227: }
228: }
230: aijperm->xgroup[ngroup] = istart;
231: aijperm->ngroup = ngroup;
233: /* Now fill in the permutation vector iperm. */
234: ipnz[0] = 0;
235: for (i=0; i<maxnz; i++) {
236: ipnz[i+1] = ipnz[i] + rows_in_bucket[i];
237: }
239: for (i=0; i<m; i++) {
240: nz = nz_in_row[i];
241: ipos = ipnz[nz];
242: aijperm->iperm[ipos] = i;
243: ipnz[nz]++;
244: }
246: /* Clean up temporary work arrays. */
247: PetscFree(rows_in_bucket);
248: PetscFree(ipnz);
249: PetscFree(nz_in_row);
250: return 0;
251: }
253: PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode)
254: {
255: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
257: if (mode == MAT_FLUSH_ASSEMBLY) return 0;
259: /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some
260: * extra information, call the AssemblyEnd routine for a MATSEQAIJ.
261: * I'm not sure if this is the best way to do this, but it avoids
262: * a lot of code duplication.
263: * I also note that currently MATSEQAIJPERM doesn't know anything about
264: * the Mat_CompressedRow data structure that SeqAIJ now uses when there
265: * are many zero rows. If the SeqAIJ assembly end routine decides to use
266: * this, this may break things. (Don't know... haven't looked at it.) */
267: a->inode.use = PETSC_FALSE;
268: MatAssemblyEnd_SeqAIJ(A, mode);
270: /* Now calculate the permutation and grouping information. */
271: MatSeqAIJPERM_create_perm(A);
272: return 0;
273: }
275: PetscErrorCode MatMult_SeqAIJPERM(Mat A,Vec xx,Vec yy)
276: {
277: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
278: const PetscScalar *x;
279: PetscScalar *y;
280: const MatScalar *aa;
281: const PetscInt *aj,*ai;
282: #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking))
283: PetscInt i,j;
284: #endif
285: #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
286: __m512d vec_x,vec_y,vec_vals;
287: __m256i vec_idx,vec_ipos,vec_j;
288: __mmask8 mask;
289: #endif
291: /* Variables that don't appear in MatMult_SeqAIJ. */
292: Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
293: PetscInt *iperm; /* Points to the permutation vector. */
294: PetscInt *xgroup;
295: /* Denotes where groups of rows with same number of nonzeros
296: * begin and end in iperm. */
297: PetscInt *nzgroup;
298: PetscInt ngroup;
299: PetscInt igroup;
300: PetscInt jstart,jend;
301: /* jstart is used in loops to denote the position in iperm where a
302: * group starts; jend denotes the position where it ends.
303: * (jend + 1 is where the next group starts.) */
304: PetscInt iold,nz;
305: PetscInt istart,iend,isize;
306: PetscInt ipos;
307: PetscScalar yp[NDIM];
308: PetscInt ip[NDIM]; /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */
310: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
311: #pragma disjoint(*x,*y,*aa)
312: #endif
314: VecGetArrayRead(xx,&x);
315: VecGetArray(yy,&y);
316: aj = a->j; /* aj[k] gives column index for element aa[k]. */
317: aa = a->a; /* Nonzero elements stored row-by-row. */
318: ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */
320: /* Get the info we need about the permutations and groupings. */
321: iperm = aijperm->iperm;
322: ngroup = aijperm->ngroup;
323: xgroup = aijperm->xgroup;
324: nzgroup = aijperm->nzgroup;
326: #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)
327: fortranmultaijperm_(&m,x,ii,aj,aa,y);
328: #else
330: for (igroup=0; igroup<ngroup; igroup++) {
331: jstart = xgroup[igroup];
332: jend = xgroup[igroup+1] - 1;
333: nz = nzgroup[igroup];
335: /* Handle the special cases where the number of nonzeros per row
336: * in the group is either 0 or 1. */
337: if (nz == 0) {
338: for (i=jstart; i<=jend; i++) {
339: y[iperm[i]] = 0.0;
340: }
341: } else if (nz == 1) {
342: for (i=jstart; i<=jend; i++) {
343: iold = iperm[i];
344: ipos = ai[iold];
345: y[iold] = aa[ipos] * x[aj[ipos]];
346: }
347: } else {
349: /* We work our way through the current group in chunks of NDIM rows
350: * at a time. */
352: for (istart=jstart; istart<=jend; istart+=NDIM) {
353: /* Figure out where the chunk of 'isize' rows ends in iperm.
354: * 'isize may of course be less than NDIM for the last chunk. */
355: iend = istart + (NDIM - 1);
357: if (iend > jend) iend = jend;
359: isize = iend - istart + 1;
361: /* Initialize the yp[] array that will be used to hold part of
362: * the permuted results vector, and figure out where in aa each
363: * row of the chunk will begin. */
364: for (i=0; i<isize; i++) {
365: iold = iperm[istart + i];
366: /* iold is a row number from the matrix A *before* reordering. */
367: ip[i] = ai[iold];
368: /* ip[i] tells us where the ith row of the chunk begins in aa. */
369: yp[i] = (PetscScalar) 0.0;
370: }
372: /* If the number of zeros per row exceeds the number of rows in
373: * the chunk, we should vectorize along nz, that is, perform the
374: * mat-vec one row at a time as in the usual CSR case. */
375: if (nz > isize) {
376: #if defined(PETSC_HAVE_CRAY_VECTOR)
377: #pragma _CRI preferstream
378: #endif
379: for (i=0; i<isize; i++) {
380: #if defined(PETSC_HAVE_CRAY_VECTOR)
381: #pragma _CRI prefervector
382: #endif
384: #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
385: vec_y = _mm512_setzero_pd();
386: ipos = ip[i];
387: for (j=0; j<(nz>>3); j++) {
388: vec_idx = _mm256_loadu_si256((__m256i const*)&aj[ipos]);
389: vec_vals = _mm512_loadu_pd(&aa[ipos]);
390: vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8);
391: vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y);
392: ipos += 8;
393: }
394: if ((nz&0x07)>2) {
395: mask = (__mmask8)(0xff >> (8-(nz&0x07)));
396: vec_idx = _mm256_loadu_si256((__m256i const*)&aj[ipos]);
397: vec_vals = _mm512_loadu_pd(&aa[ipos]);
398: vec_x = _mm512_mask_i32gather_pd(vec_x,mask,vec_idx,x,_MM_SCALE_8);
399: vec_y = _mm512_mask3_fmadd_pd(vec_x,vec_vals,vec_y,mask);
400: } else if ((nz&0x07)==2) {
401: yp[i] += aa[ipos]*x[aj[ipos]];
402: yp[i] += aa[ipos+1]*x[aj[ipos+1]];
403: } else if ((nz&0x07)==1) {
404: yp[i] += aa[ipos]*x[aj[ipos]];
405: }
406: yp[i] += _mm512_reduce_add_pd(vec_y);
407: #else
408: for (j=0; j<nz; j++) {
409: ipos = ip[i] + j;
410: yp[i] += aa[ipos] * x[aj[ipos]];
411: }
412: #endif
413: }
414: } else {
415: /* Otherwise, there are enough rows in the chunk to make it
416: * worthwhile to vectorize across the rows, that is, to do the
417: * matvec by operating with "columns" of the chunk. */
418: for (j=0; j<nz; j++) {
419: #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
420: vec_j = _mm256_set1_epi32(j);
421: for (i=0; i<((isize>>3)<<3); i+=8) {
422: vec_y = _mm512_loadu_pd(&yp[i]);
423: vec_ipos = _mm256_loadu_si256((__m256i const*)&ip[i]);
424: vec_ipos = _mm256_add_epi32(vec_ipos,vec_j);
425: vec_idx = _mm256_i32gather_epi32(aj,vec_ipos,_MM_SCALE_4);
426: vec_vals = _mm512_i32gather_pd(vec_ipos,aa,_MM_SCALE_8);
427: vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8);
428: vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y);
429: _mm512_storeu_pd(&yp[i],vec_y);
430: }
431: for (i=isize-(isize&0x07); i<isize; i++) {
432: ipos = ip[i]+j;
433: yp[i] += aa[ipos]*x[aj[ipos]];
434: }
435: #else
436: for (i=0; i<isize; i++) {
437: ipos = ip[i] + j;
438: yp[i] += aa[ipos] * x[aj[ipos]];
439: }
440: #endif
441: }
442: }
444: #if defined(PETSC_HAVE_CRAY_VECTOR)
445: #pragma _CRI ivdep
446: #endif
447: /* Put results from yp[] into non-permuted result vector y. */
448: for (i=0; i<isize; i++) {
449: y[iperm[istart+i]] = yp[i];
450: }
451: } /* End processing chunk of isize rows of a group. */
452: } /* End handling matvec for chunk with nz > 1. */
453: } /* End loop over igroup. */
454: #endif
455: PetscLogFlops(PetscMax(2.0*a->nz - A->rmap->n,0));
456: VecRestoreArrayRead(xx,&x);
457: VecRestoreArray(yy,&y);
458: return 0;
459: }
461: /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx.
462: * Note that the names I used to designate the vectors differs from that
463: * used in MatMultAdd_SeqAIJ(). I did this to keep my notation consistent
464: * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */
465: /*
466: I hate having virtually identical code for the mult and the multadd!!!
467: */
468: PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A,Vec xx,Vec ww,Vec yy)
469: {
470: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
471: const PetscScalar *x;
472: PetscScalar *y,*w;
473: const MatScalar *aa;
474: const PetscInt *aj,*ai;
475: #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
476: PetscInt i,j;
477: #endif
479: /* Variables that don't appear in MatMultAdd_SeqAIJ. */
480: Mat_SeqAIJPERM * aijperm;
481: PetscInt *iperm; /* Points to the permutation vector. */
482: PetscInt *xgroup;
483: /* Denotes where groups of rows with same number of nonzeros
484: * begin and end in iperm. */
485: PetscInt *nzgroup;
486: PetscInt ngroup;
487: PetscInt igroup;
488: PetscInt jstart,jend;
489: /* jstart is used in loops to denote the position in iperm where a
490: * group starts; jend denotes the position where it ends.
491: * (jend + 1 is where the next group starts.) */
492: PetscInt iold,nz;
493: PetscInt istart,iend,isize;
494: PetscInt ipos;
495: PetscScalar yp[NDIM];
496: PetscInt ip[NDIM];
497: /* yp[] and ip[] are treated as vector "registers" for performing
498: * the mat-vec. */
500: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
501: #pragma disjoint(*x,*y,*aa)
502: #endif
504: VecGetArrayRead(xx,&x);
505: VecGetArrayPair(yy,ww,&y,&w);
507: aj = a->j; /* aj[k] gives column index for element aa[k]. */
508: aa = a->a; /* Nonzero elements stored row-by-row. */
509: ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */
511: /* Get the info we need about the permutations and groupings. */
512: aijperm = (Mat_SeqAIJPERM*) A->spptr;
513: iperm = aijperm->iperm;
514: ngroup = aijperm->ngroup;
515: xgroup = aijperm->xgroup;
516: nzgroup = aijperm->nzgroup;
518: #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
519: fortranmultaddaijperm_(&m,x,ii,aj,aa,y,w);
520: #else
522: for (igroup=0; igroup<ngroup; igroup++) {
523: jstart = xgroup[igroup];
524: jend = xgroup[igroup+1] - 1;
526: nz = nzgroup[igroup];
528: /* Handle the special cases where the number of nonzeros per row
529: * in the group is either 0 or 1. */
530: if (nz == 0) {
531: for (i=jstart; i<=jend; i++) {
532: iold = iperm[i];
533: y[iold] = w[iold];
534: }
535: }
536: else if (nz == 1) {
537: for (i=jstart; i<=jend; i++) {
538: iold = iperm[i];
539: ipos = ai[iold];
540: y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
541: }
542: }
543: /* For the general case: */
544: else {
546: /* We work our way through the current group in chunks of NDIM rows
547: * at a time. */
549: for (istart=jstart; istart<=jend; istart+=NDIM) {
550: /* Figure out where the chunk of 'isize' rows ends in iperm.
551: * 'isize may of course be less than NDIM for the last chunk. */
552: iend = istart + (NDIM - 1);
553: if (iend > jend) iend = jend;
554: isize = iend - istart + 1;
556: /* Initialize the yp[] array that will be used to hold part of
557: * the permuted results vector, and figure out where in aa each
558: * row of the chunk will begin. */
559: for (i=0; i<isize; i++) {
560: iold = iperm[istart + i];
561: /* iold is a row number from the matrix A *before* reordering. */
562: ip[i] = ai[iold];
563: /* ip[i] tells us where the ith row of the chunk begins in aa. */
564: yp[i] = w[iold];
565: }
567: /* If the number of zeros per row exceeds the number of rows in
568: * the chunk, we should vectorize along nz, that is, perform the
569: * mat-vec one row at a time as in the usual CSR case. */
570: if (nz > isize) {
571: #if defined(PETSC_HAVE_CRAY_VECTOR)
572: #pragma _CRI preferstream
573: #endif
574: for (i=0; i<isize; i++) {
575: #if defined(PETSC_HAVE_CRAY_VECTOR)
576: #pragma _CRI prefervector
577: #endif
578: for (j=0; j<nz; j++) {
579: ipos = ip[i] + j;
580: yp[i] += aa[ipos] * x[aj[ipos]];
581: }
582: }
583: }
584: /* Otherwise, there are enough rows in the chunk to make it
585: * worthwhile to vectorize across the rows, that is, to do the
586: * matvec by operating with "columns" of the chunk. */
587: else {
588: for (j=0; j<nz; j++) {
589: for (i=0; i<isize; i++) {
590: ipos = ip[i] + j;
591: yp[i] += aa[ipos] * x[aj[ipos]];
592: }
593: }
594: }
596: #if defined(PETSC_HAVE_CRAY_VECTOR)
597: #pragma _CRI ivdep
598: #endif
599: /* Put results from yp[] into non-permuted result vector y. */
600: for (i=0; i<isize; i++) {
601: y[iperm[istart+i]] = yp[i];
602: }
603: } /* End processing chunk of isize rows of a group. */
605: } /* End handling matvec for chunk with nz > 1. */
606: } /* End loop over igroup. */
608: #endif
609: PetscLogFlops(2.0*a->nz);
610: VecRestoreArrayRead(xx,&x);
611: VecRestoreArrayPair(yy,ww,&y,&w);
612: return 0;
613: }
615: /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a
616: * SeqAIJPERM matrix. This routine is called by the MatCreate_SeqAIJPERM()
617: * routine, but can also be used to convert an assembled SeqAIJ matrix
618: * into a SeqAIJPERM one. */
619: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A,MatType type,MatReuse reuse,Mat *newmat)
620: {
621: Mat B = *newmat;
622: Mat_SeqAIJPERM *aijperm;
623: PetscBool sametype;
625: if (reuse == MAT_INITIAL_MATRIX) {
626: MatDuplicate(A,MAT_COPY_VALUES,&B);
627: }
628: PetscObjectTypeCompare((PetscObject)A,type,&sametype);
629: if (sametype) return 0;
631: PetscNewLog(B,&aijperm);
632: B->spptr = (void*) aijperm;
634: /* Set function pointers for methods that we inherit from AIJ but override. */
635: B->ops->duplicate = MatDuplicate_SeqAIJPERM;
636: B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM;
637: B->ops->destroy = MatDestroy_SeqAIJPERM;
638: B->ops->mult = MatMult_SeqAIJPERM;
639: B->ops->multadd = MatMultAdd_SeqAIJPERM;
641: aijperm->nonzerostate = -1; /* this will trigger the generation of the permutation information the first time through MatAssembly()*/
642: /* If A has already been assembled, compute the permutation. */
643: if (A->assembled) {
644: MatSeqAIJPERM_create_perm(B);
645: }
647: PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);
649: PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJPERM);
650: *newmat = B;
651: return 0;
652: }
654: /*@C
655: MatCreateSeqAIJPERM - Creates a sparse matrix of type SEQAIJPERM.
656: This type inherits from AIJ, but calculates some additional permutation
657: information that is used to allow better vectorization of some
658: operations. At the cost of increased storage, the AIJ formatted
659: matrix can be copied to a format in which pieces of the matrix are
660: stored in ELLPACK format, allowing the vectorized matrix multiply
661: routine to use stride-1 memory accesses. As with the AIJ type, it is
662: important to preallocate matrix storage in order to get good assembly
663: performance.
665: Collective
667: Input Parameters:
668: + comm - MPI communicator, set to PETSC_COMM_SELF
669: . m - number of rows
670: . n - number of columns
671: . nz - number of nonzeros per row (same for all rows)
672: - nnz - array containing the number of nonzeros in the various rows
673: (possibly different for each row) or NULL
675: Output Parameter:
676: . A - the matrix
678: Notes:
679: If nnz is given then nz is ignored
681: Level: intermediate
683: .seealso: MatCreate(), MatCreateMPIAIJPERM(), MatSetValues()
684: @*/
685: PetscErrorCode MatCreateSeqAIJPERM(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
686: {
687: MatCreate(comm,A);
688: MatSetSizes(*A,m,n,m,n);
689: MatSetType(*A,MATSEQAIJPERM);
690: MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);
691: return 0;
692: }
694: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A)
695: {
696: MatSetType(A,MATSEQAIJ);
697: MatConvert_SeqAIJ_SeqAIJPERM(A,MATSEQAIJPERM,MAT_INPLACE_MATRIX,&A);
698: return 0;
699: }