2: /*
3: Defines basic operations for the MATSEQAIJPERM matrix class.
4: This class is derived from the MATSEQAIJ class and retains the
5: compressed row storage (aka Yale sparse matrix format) but augments
6: it with some permutation information that enables some operations
7: to be more vectorizable. A physically rearranged copy of the matrix
8: may be stored if the user desires.
10: Eventually a variety of permutations may be supported.
11: */
13: #include <../src/mat/impls/aij/seq/aij.h> 15: #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
16: #include <immintrin.h>
18: #if !defined(_MM_SCALE_8)
19: #define _MM_SCALE_8 8 20: #endif
21: #if !defined(_MM_SCALE_4)
22: #define _MM_SCALE_4 4 23: #endif
24: #endif
26: #define NDIM 512 27: /* NDIM specifies how many rows at a time we should work with when
28: * performing the vectorized mat-vec. This depends on various factors
29: * such as vector register length, etc., and I really need to add a
30: * way for the user (or the library) to tune this. I'm setting it to
31: * 512 for now since that is what Ed D'Azevedo was using in his Fortran
32: * routines. */
34: typedef struct {
35: PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */
37: PetscInt ngroup;
38: PetscInt *xgroup;
39: /* Denotes where groups of rows with same number of nonzeros
40: * begin and end, i.e., xgroup[i] gives us the position in iperm[]
41: * where the ith group begins. */
43: PetscInt *nzgroup; /* how many nonzeros each row that is a member of group i has. */
44: PetscInt *iperm; /* The permutation vector. */
46: /* Some of this stuff is for Ed's recursive triangular solve.
47: * I'm not sure what I need yet. */
48: PetscInt blocksize;
49: PetscInt nstep;
50: PetscInt *jstart_list;
51: PetscInt *jend_list;
52: PetscInt *action_list;
53: PetscInt *ngroup_list;
54: PetscInt **ipointer_list;
55: PetscInt **xgroup_list;
56: PetscInt **nzgroup_list;
57: PetscInt **iperm_list;
58: } Mat_SeqAIJPERM;
60: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat) 61: {
62: /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */
63: /* so we will ignore 'MatType type'. */
65: Mat B = *newmat;
66: Mat_SeqAIJPERM *aijperm=(Mat_SeqAIJPERM*)A->spptr;
69: if (reuse == MAT_INITIAL_MATRIX) {
70: MatDuplicate(A,MAT_COPY_VALUES,&B);
71: aijperm=(Mat_SeqAIJPERM*)B->spptr;
72: }
74: /* Reset the original function pointers. */
75: B->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
76: B->ops->destroy = MatDestroy_SeqAIJ;
77: B->ops->duplicate = MatDuplicate_SeqAIJ;
78: B->ops->mult = MatMult_SeqAIJ;
79: B->ops->multadd = MatMultAdd_SeqAIJ;
81: PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",NULL);
82: PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijperm_C",NULL);
83: PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijperm_C",NULL);
85: /* Free everything in the Mat_SeqAIJPERM data structure.*/
86: PetscFree(aijperm->xgroup);
87: PetscFree(aijperm->nzgroup);
88: PetscFree(aijperm->iperm);
89: PetscFree(B->spptr);
91: /* Change the type of B to MATSEQAIJ. */
92: PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);
94: *newmat = B;
95: return(0);
96: }
98: PetscErrorCode MatDestroy_SeqAIJPERM(Mat A) 99: {
101: Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
104: if (aijperm) {
105: /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */
106: PetscFree(aijperm->xgroup);
107: PetscFree(aijperm->nzgroup);
108: PetscFree(aijperm->iperm);
109: PetscFree(A->spptr);
110: }
111: /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
112: * to destroy everything that remains. */
113: PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);
114: /* Note that I don't call MatSetType(). I believe this is because that
115: * is only to be called when *building* a matrix. I could be wrong, but
116: * that is how things work for the SuperLU matrix class. */
117: MatDestroy_SeqAIJ(A);
118: return(0);
119: }
121: PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M)122: {
124: Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
125: Mat_SeqAIJPERM *aijperm_dest;
126: PetscBool perm;
129: MatDuplicate_SeqAIJ(A,op,M);
130: PetscObjectTypeCompare((PetscObject)*M,MATSEQAIJPERM,&perm);
131: if (perm) {
132: aijperm_dest = (Mat_SeqAIJPERM *) (*M)->spptr;
133: PetscFree(aijperm_dest->xgroup);
134: PetscFree(aijperm_dest->nzgroup);
135: PetscFree(aijperm_dest->iperm);
136: } else {
137: PetscNewLog(*M,&aijperm_dest);
138: (*M)->spptr = (void*) aijperm_dest;
139: PetscObjectChangeTypeName((PetscObject)*M,MATSEQAIJPERM);
140: PetscObjectComposeFunction((PetscObject)*M,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);
141: PetscObjectComposeFunction((PetscObject)*M,"MatMatMultSymbolic_seqdense_seqaijperm_C",MatMatMultSymbolic_SeqDense_SeqAIJ);
142: PetscObjectComposeFunction((PetscObject)*M,"MatMatMultNumeric_seqdense_seqaijperm_C",MatMatMultNumeric_SeqDense_SeqAIJ);
143: }
144: PetscArraycpy(aijperm_dest,aijperm,1);
145: /* Allocate space for, and copy the grouping and permutation info.
146: * I note that when the groups are initially determined in
147: * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than
148: * necessary. But at this point, we know how large they need to be, and
149: * allocate only the necessary amount of memory. So the duplicated matrix
150: * may actually use slightly less storage than the original! */
151: PetscMalloc1(A->rmap->n, &aijperm_dest->iperm);
152: PetscMalloc1(aijperm->ngroup+1, &aijperm_dest->xgroup);
153: PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup);
154: PetscArraycpy(aijperm_dest->iperm,aijperm->iperm,A->rmap->n);
155: PetscArraycpy(aijperm_dest->xgroup,aijperm->xgroup,aijperm->ngroup+1);
156: PetscArraycpy(aijperm_dest->nzgroup,aijperm->nzgroup,aijperm->ngroup);
157: return(0);
158: }
160: PetscErrorCode MatSeqAIJPERM_create_perm(Mat A)161: {
163: Mat_SeqAIJ *a = (Mat_SeqAIJ*)(A)->data;
164: Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
165: PetscInt m; /* Number of rows in the matrix. */
166: PetscInt *ia; /* From the CSR representation; points to the beginning of each row. */
167: PetscInt maxnz; /* Maximum number of nonzeros in any row. */
168: PetscInt *rows_in_bucket;
169: /* To construct the permutation, we sort each row into one of maxnz
170: * buckets based on how many nonzeros are in the row. */
171: PetscInt nz;
172: PetscInt *nz_in_row; /* the number of nonzero elements in row k. */
173: PetscInt *ipnz;
174: /* When constructing the iperm permutation vector,
175: * ipnz[nz] is used to point to the next place in the permutation vector
176: * that a row with nz nonzero elements should be placed.*/
177: PetscInt i, ngroup, istart, ipos;
180: if (aijperm->nonzerostate == A->nonzerostate) return(0); /* permutation exists and matches current nonzero structure */
181: aijperm->nonzerostate = A->nonzerostate;
182: /* Free anything previously put in the Mat_SeqAIJPERM data structure. */
183: PetscFree(aijperm->xgroup);
184: PetscFree(aijperm->nzgroup);
185: PetscFree(aijperm->iperm);
187: m = A->rmap->n;
188: ia = a->i;
190: /* Allocate the arrays that will hold the permutation vector. */
191: PetscMalloc1(m, &aijperm->iperm);
193: /* Allocate some temporary work arrays that will be used in
194: * calculating the permuation vector and groupings. */
195: PetscMalloc1(m, &nz_in_row);
197: /* Now actually figure out the permutation and grouping. */
199: /* First pass: Determine number of nonzeros in each row, maximum
200: * number of nonzeros in any row, and how many rows fall into each
201: * "bucket" of rows with same number of nonzeros. */
202: maxnz = 0;
203: for (i=0; i<m; i++) {
204: nz_in_row[i] = ia[i+1]-ia[i];
205: if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
206: }
207: PetscMalloc1(PetscMax(maxnz,m)+1, &rows_in_bucket);
208: PetscMalloc1(PetscMax(maxnz,m)+1, &ipnz);
210: for (i=0; i<=maxnz; i++) {
211: rows_in_bucket[i] = 0;
212: }
213: for (i=0; i<m; i++) {
214: nz = nz_in_row[i];
215: rows_in_bucket[nz]++;
216: }
218: /* Allocate space for the grouping info. There will be at most (maxnz + 1)
219: * groups. (It is maxnz + 1 instead of simply maxnz because there may be
220: * rows with no nonzero elements.) If there are (maxnz + 1) groups,
221: * then xgroup[] must consist of (maxnz + 2) elements, since the last
222: * element of xgroup will tell us where the (maxnz + 1)th group ends.
223: * We allocate space for the maximum number of groups;
224: * that is potentially a little wasteful, but not too much so.
225: * Perhaps I should fix it later. */
226: PetscMalloc1(maxnz+2, &aijperm->xgroup);
227: PetscMalloc1(maxnz+1, &aijperm->nzgroup);
229: /* Second pass. Look at what is in the buckets and create the groupings.
230: * Note that it is OK to have a group of rows with no non-zero values. */
231: ngroup = 0;
232: istart = 0;
233: for (i=0; i<=maxnz; i++) {
234: if (rows_in_bucket[i] > 0) {
235: aijperm->nzgroup[ngroup] = i;
236: aijperm->xgroup[ngroup] = istart;
237: ngroup++;
238: istart += rows_in_bucket[i];
239: }
240: }
242: aijperm->xgroup[ngroup] = istart;
243: aijperm->ngroup = ngroup;
245: /* Now fill in the permutation vector iperm. */
246: ipnz[0] = 0;
247: for (i=0; i<maxnz; i++) {
248: ipnz[i+1] = ipnz[i] + rows_in_bucket[i];
249: }
251: for (i=0; i<m; i++) {
252: nz = nz_in_row[i];
253: ipos = ipnz[nz];
254: aijperm->iperm[ipos] = i;
255: ipnz[nz]++;
256: }
258: /* Clean up temporary work arrays. */
259: PetscFree(rows_in_bucket);
260: PetscFree(ipnz);
261: PetscFree(nz_in_row);
262: return(0);
263: }
266: PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode)267: {
269: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
272: if (mode == MAT_FLUSH_ASSEMBLY) return(0);
274: /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some
275: * extra information, call the AssemblyEnd routine for a MATSEQAIJ.
276: * I'm not sure if this is the best way to do this, but it avoids
277: * a lot of code duplication.
278: * I also note that currently MATSEQAIJPERM doesn't know anything about
279: * the Mat_CompressedRow data structure that SeqAIJ now uses when there
280: * are many zero rows. If the SeqAIJ assembly end routine decides to use
281: * this, this may break things. (Don't know... haven't looked at it.) */
282: a->inode.use = PETSC_FALSE;
283: MatAssemblyEnd_SeqAIJ(A, mode);
285: /* Now calculate the permutation and grouping information. */
286: MatSeqAIJPERM_create_perm(A);
287: return(0);
288: }
290: PetscErrorCode MatMult_SeqAIJPERM(Mat A,Vec xx,Vec yy)291: {
292: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
293: const PetscScalar *x;
294: PetscScalar *y;
295: const MatScalar *aa;
296: PetscErrorCode ierr;
297: const PetscInt *aj,*ai;
298: #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking))
299: PetscInt i,j;
300: #endif
301: #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
302: __m512d vec_x,vec_y,vec_vals;
303: __m256i vec_idx,vec_ipos,vec_j;
304: __mmask8 mask;
305: #endif
307: /* Variables that don't appear in MatMult_SeqAIJ. */
308: Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
309: PetscInt *iperm; /* Points to the permutation vector. */
310: PetscInt *xgroup;
311: /* Denotes where groups of rows with same number of nonzeros
312: * begin and end in iperm. */
313: PetscInt *nzgroup;
314: PetscInt ngroup;
315: PetscInt igroup;
316: PetscInt jstart,jend;
317: /* jstart is used in loops to denote the position in iperm where a
318: * group starts; jend denotes the position where it ends.
319: * (jend + 1 is where the next group starts.) */
320: PetscInt iold,nz;
321: PetscInt istart,iend,isize;
322: PetscInt ipos;
323: PetscScalar yp[NDIM];
324: PetscInt ip[NDIM]; /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */
326: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
327: #pragma disjoint(*x,*y,*aa)
328: #endif
331: VecGetArrayRead(xx,&x);
332: VecGetArray(yy,&y);
333: aj = a->j; /* aj[k] gives column index for element aa[k]. */
334: aa = a->a; /* Nonzero elements stored row-by-row. */
335: ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */
337: /* Get the info we need about the permutations and groupings. */
338: iperm = aijperm->iperm;
339: ngroup = aijperm->ngroup;
340: xgroup = aijperm->xgroup;
341: nzgroup = aijperm->nzgroup;
343: #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)
344: fortranmultaijperm_(&m,x,ii,aj,aa,y);
345: #else
347: for (igroup=0; igroup<ngroup; igroup++) {
348: jstart = xgroup[igroup];
349: jend = xgroup[igroup+1] - 1;
350: nz = nzgroup[igroup];
352: /* Handle the special cases where the number of nonzeros per row
353: * in the group is either 0 or 1. */
354: if (nz == 0) {
355: for (i=jstart; i<=jend; i++) {
356: y[iperm[i]] = 0.0;
357: }
358: } else if (nz == 1) {
359: for (i=jstart; i<=jend; i++) {
360: iold = iperm[i];
361: ipos = ai[iold];
362: y[iold] = aa[ipos] * x[aj[ipos]];
363: }
364: } else {
366: /* We work our way through the current group in chunks of NDIM rows
367: * at a time. */
369: for (istart=jstart; istart<=jend; istart+=NDIM) {
370: /* Figure out where the chunk of 'isize' rows ends in iperm.
371: * 'isize may of course be less than NDIM for the last chunk. */
372: iend = istart + (NDIM - 1);
374: if (iend > jend) iend = jend;
376: isize = iend - istart + 1;
378: /* Initialize the yp[] array that will be used to hold part of
379: * the permuted results vector, and figure out where in aa each
380: * row of the chunk will begin. */
381: for (i=0; i<isize; i++) {
382: iold = iperm[istart + i];
383: /* iold is a row number from the matrix A *before* reordering. */
384: ip[i] = ai[iold];
385: /* ip[i] tells us where the ith row of the chunk begins in aa. */
386: yp[i] = (PetscScalar) 0.0;
387: }
389: /* If the number of zeros per row exceeds the number of rows in
390: * the chunk, we should vectorize along nz, that is, perform the
391: * mat-vec one row at a time as in the usual CSR case. */
392: if (nz > isize) {
393: #if defined(PETSC_HAVE_CRAY_VECTOR)
394: #pragma _CRI preferstream
395: #endif
396: for (i=0; i<isize; i++) {
397: #if defined(PETSC_HAVE_CRAY_VECTOR)
398: #pragma _CRI prefervector
399: #endif
401: #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
402: vec_y = _mm512_setzero_pd();
403: ipos = ip[i];
404: for (j=0; j<(nz>>3); j++) {
405: vec_idx = _mm256_loadu_si256((__m256i const*)&aj[ipos]);
406: vec_vals = _mm512_loadu_pd(&aa[ipos]);
407: vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8);
408: vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y);
409: ipos += 8;
410: }
411: if ((nz&0x07)>2) {
412: mask = (__mmask8)(0xff >> (8-(nz&0x07)));
413: vec_idx = _mm256_loadu_si256((__m256i const*)&aj[ipos]);
414: vec_vals = _mm512_loadu_pd(&aa[ipos]);
415: vec_x = _mm512_mask_i32gather_pd(vec_x,mask,vec_idx,x,_MM_SCALE_8);
416: vec_y = _mm512_mask3_fmadd_pd(vec_x,vec_vals,vec_y,mask);
417: } else if ((nz&0x07)==2) {
418: yp[i] += aa[ipos]*x[aj[ipos]];
419: yp[i] += aa[ipos+1]*x[aj[ipos+1]];
420: } else if ((nz&0x07)==1) {
421: yp[i] += aa[ipos]*x[aj[ipos]];
422: }
423: yp[i] += _mm512_reduce_add_pd(vec_y);
424: #else
425: for (j=0; j<nz; j++) {
426: ipos = ip[i] + j;
427: yp[i] += aa[ipos] * x[aj[ipos]];
428: }
429: #endif
430: }
431: } else {
432: /* Otherwise, there are enough rows in the chunk to make it
433: * worthwhile to vectorize across the rows, that is, to do the
434: * matvec by operating with "columns" of the chunk. */
435: for (j=0; j<nz; j++) {
436: #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
437: vec_j = _mm256_set1_epi32(j);
438: for (i=0; i<((isize>>3)<<3); i+=8) {
439: vec_y = _mm512_loadu_pd(&yp[i]);
440: vec_ipos = _mm256_loadu_si256((__m256i const*)&ip[i]);
441: vec_ipos = _mm256_add_epi32(vec_ipos,vec_j);
442: vec_idx = _mm256_i32gather_epi32(aj,vec_ipos,_MM_SCALE_4);
443: vec_vals = _mm512_i32gather_pd(vec_ipos,aa,_MM_SCALE_8);
444: vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8);
445: vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y);
446: _mm512_storeu_pd(&yp[i],vec_y);
447: }
448: for (i=isize-(isize&0x07); i<isize; i++) {
449: ipos = ip[i]+j;
450: yp[i] += aa[ipos]*x[aj[ipos]];
451: }
452: #else
453: for (i=0; i<isize; i++) {
454: ipos = ip[i] + j;
455: yp[i] += aa[ipos] * x[aj[ipos]];
456: }
457: #endif
458: }
459: }
461: #if defined(PETSC_HAVE_CRAY_VECTOR)
462: #pragma _CRI ivdep
463: #endif
464: /* Put results from yp[] into non-permuted result vector y. */
465: for (i=0; i<isize; i++) {
466: y[iperm[istart+i]] = yp[i];
467: }
468: } /* End processing chunk of isize rows of a group. */
469: } /* End handling matvec for chunk with nz > 1. */
470: } /* End loop over igroup. */
471: #endif
472: PetscLogFlops(PetscMax(2.0*a->nz - A->rmap->n,0));
473: VecRestoreArrayRead(xx,&x);
474: VecRestoreArray(yy,&y);
475: return(0);
476: }
479: /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx.
480: * Note that the names I used to designate the vectors differs from that
481: * used in MatMultAdd_SeqAIJ(). I did this to keep my notation consistent
482: * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */
483: /*
484: I hate having virtually identical code for the mult and the multadd!!!
485: */
486: PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A,Vec xx,Vec ww,Vec yy)487: {
488: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
489: const PetscScalar *x;
490: PetscScalar *y,*w;
491: const MatScalar *aa;
492: PetscErrorCode ierr;
493: const PetscInt *aj,*ai;
494: #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
495: PetscInt i,j;
496: #endif
498: /* Variables that don't appear in MatMultAdd_SeqAIJ. */
499: Mat_SeqAIJPERM * aijperm;
500: PetscInt *iperm; /* Points to the permutation vector. */
501: PetscInt *xgroup;
502: /* Denotes where groups of rows with same number of nonzeros
503: * begin and end in iperm. */
504: PetscInt *nzgroup;
505: PetscInt ngroup;
506: PetscInt igroup;
507: PetscInt jstart,jend;
508: /* jstart is used in loops to denote the position in iperm where a
509: * group starts; jend denotes the position where it ends.
510: * (jend + 1 is where the next group starts.) */
511: PetscInt iold,nz;
512: PetscInt istart,iend,isize;
513: PetscInt ipos;
514: PetscScalar yp[NDIM];
515: PetscInt ip[NDIM];
516: /* yp[] and ip[] are treated as vector "registers" for performing
517: * the mat-vec. */
519: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
520: #pragma disjoint(*x,*y,*aa)
521: #endif
524: VecGetArrayRead(xx,&x);
525: VecGetArrayPair(yy,ww,&y,&w);
527: aj = a->j; /* aj[k] gives column index for element aa[k]. */
528: aa = a->a; /* Nonzero elements stored row-by-row. */
529: ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */
531: /* Get the info we need about the permutations and groupings. */
532: aijperm = (Mat_SeqAIJPERM*) A->spptr;
533: iperm = aijperm->iperm;
534: ngroup = aijperm->ngroup;
535: xgroup = aijperm->xgroup;
536: nzgroup = aijperm->nzgroup;
538: #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
539: fortranmultaddaijperm_(&m,x,ii,aj,aa,y,w);
540: #else
542: for (igroup=0; igroup<ngroup; igroup++) {
543: jstart = xgroup[igroup];
544: jend = xgroup[igroup+1] - 1;
546: nz = nzgroup[igroup];
548: /* Handle the special cases where the number of nonzeros per row
549: * in the group is either 0 or 1. */
550: if (nz == 0) {
551: for (i=jstart; i<=jend; i++) {
552: iold = iperm[i];
553: y[iold] = w[iold];
554: }
555: }
556: else if (nz == 1) {
557: for (i=jstart; i<=jend; i++) {
558: iold = iperm[i];
559: ipos = ai[iold];
560: y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
561: }
562: }
563: /* For the general case: */
564: else {
566: /* We work our way through the current group in chunks of NDIM rows
567: * at a time. */
569: for (istart=jstart; istart<=jend; istart+=NDIM) {
570: /* Figure out where the chunk of 'isize' rows ends in iperm.
571: * 'isize may of course be less than NDIM for the last chunk. */
572: iend = istart + (NDIM - 1);
573: if (iend > jend) iend = jend;
574: isize = iend - istart + 1;
576: /* Initialize the yp[] array that will be used to hold part of
577: * the permuted results vector, and figure out where in aa each
578: * row of the chunk will begin. */
579: for (i=0; i<isize; i++) {
580: iold = iperm[istart + i];
581: /* iold is a row number from the matrix A *before* reordering. */
582: ip[i] = ai[iold];
583: /* ip[i] tells us where the ith row of the chunk begins in aa. */
584: yp[i] = w[iold];
585: }
587: /* If the number of zeros per row exceeds the number of rows in
588: * the chunk, we should vectorize along nz, that is, perform the
589: * mat-vec one row at a time as in the usual CSR case. */
590: if (nz > isize) {
591: #if defined(PETSC_HAVE_CRAY_VECTOR)
592: #pragma _CRI preferstream
593: #endif
594: for (i=0; i<isize; i++) {
595: #if defined(PETSC_HAVE_CRAY_VECTOR)
596: #pragma _CRI prefervector
597: #endif
598: for (j=0; j<nz; j++) {
599: ipos = ip[i] + j;
600: yp[i] += aa[ipos] * x[aj[ipos]];
601: }
602: }
603: }
604: /* Otherwise, there are enough rows in the chunk to make it
605: * worthwhile to vectorize across the rows, that is, to do the
606: * matvec by operating with "columns" of the chunk. */
607: else {
608: for (j=0; j<nz; j++) {
609: for (i=0; i<isize; i++) {
610: ipos = ip[i] + j;
611: yp[i] += aa[ipos] * x[aj[ipos]];
612: }
613: }
614: }
616: #if defined(PETSC_HAVE_CRAY_VECTOR)
617: #pragma _CRI ivdep
618: #endif
619: /* Put results from yp[] into non-permuted result vector y. */
620: for (i=0; i<isize; i++) {
621: y[iperm[istart+i]] = yp[i];
622: }
623: } /* End processing chunk of isize rows of a group. */
625: } /* End handling matvec for chunk with nz > 1. */
626: } /* End loop over igroup. */
628: #endif
629: PetscLogFlops(2.0*a->nz);
630: VecRestoreArrayRead(xx,&x);
631: VecRestoreArrayPair(yy,ww,&y,&w);
632: return(0);
633: }
635: /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a
636: * SeqAIJPERM matrix. This routine is called by the MatCreate_SeqAIJPERM()
637: * routine, but can also be used to convert an assembled SeqAIJ matrix
638: * into a SeqAIJPERM one. */
639: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A,MatType type,MatReuse reuse,Mat *newmat)640: {
642: Mat B = *newmat;
643: Mat_SeqAIJPERM *aijperm;
644: PetscBool sametype;
647: if (reuse == MAT_INITIAL_MATRIX) {
648: MatDuplicate(A,MAT_COPY_VALUES,&B);
649: }
650: PetscObjectTypeCompare((PetscObject)A,type,&sametype);
651: if (sametype) return(0);
653: PetscNewLog(B,&aijperm);
654: B->spptr = (void*) aijperm;
656: /* Set function pointers for methods that we inherit from AIJ but override. */
657: B->ops->duplicate = MatDuplicate_SeqAIJPERM;
658: B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM;
659: B->ops->destroy = MatDestroy_SeqAIJPERM;
660: B->ops->mult = MatMult_SeqAIJPERM;
661: B->ops->multadd = MatMultAdd_SeqAIJPERM;
663: aijperm->nonzerostate = -1; /* this will trigger the generation of the permutation information the first time through MatAssembly()*/
664: /* If A has already been assembled, compute the permutation. */
665: if (A->assembled) {
666: MatSeqAIJPERM_create_perm(B);
667: }
669: PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);
670: PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijperm_C",MatMatMultSymbolic_SeqDense_SeqAIJ);
671: PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijperm_C",MatMatMultNumeric_SeqDense_SeqAIJ);
673: PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJPERM);
674: *newmat = B;
675: return(0);
676: }
678: /*@C
679: MatCreateSeqAIJPERM - Creates a sparse matrix of type SEQAIJPERM.
680: This type inherits from AIJ, but calculates some additional permutation
681: information that is used to allow better vectorization of some
682: operations. At the cost of increased storage, the AIJ formatted
683: matrix can be copied to a format in which pieces of the matrix are
684: stored in ELLPACK format, allowing the vectorized matrix multiply
685: routine to use stride-1 memory accesses. As with the AIJ type, it is
686: important to preallocate matrix storage in order to get good assembly
687: performance.
689: Collective
691: Input Parameters:
692: + comm - MPI communicator, set to PETSC_COMM_SELF693: . m - number of rows
694: . n - number of columns
695: . nz - number of nonzeros per row (same for all rows)
696: - nnz - array containing the number of nonzeros in the various rows
697: (possibly different for each row) or NULL
699: Output Parameter:
700: . A - the matrix
702: Notes:
703: If nnz is given then nz is ignored
705: Level: intermediate
707: .seealso: MatCreate(), MatCreateMPIAIJPERM(), MatSetValues()
708: @*/
709: PetscErrorCodeMatCreateSeqAIJPERM(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)710: {
714: MatCreate(comm,A);
715: MatSetSizes(*A,m,n,m,n);
716: MatSetType(*A,MATSEQAIJPERM);
717: MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);
718: return(0);
719: }
721: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A)722: {
726: MatSetType(A,MATSEQAIJ);
727: MatConvert_SeqAIJ_SeqAIJPERM(A,MATSEQAIJPERM,MAT_INPLACE_MATRIX,&A);
728: return(0);
729: }