2: /*
3: Defines basic operations for the MATSEQAIJPERM matrix class.
4: This class is derived from the MATSEQAIJ class and retains the
5: compressed row storage (aka Yale sparse matrix format) but augments
6: it with some permutation information that enables some operations
7: to be more vectorizable. A physically rearranged copy of the matrix
8: may be stored if the user desires.
10: Eventually a variety of permutations may be supported.
11: */
13: #include <../src/mat/impls/aij/seq/aij.h> 15: #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
16: #include <immintrin.h>
18: #if !defined(_MM_SCALE_8)
19: #define _MM_SCALE_8 8 20: #endif
21: #if !defined(_MM_SCALE_4)
22: #define _MM_SCALE_4 4 23: #endif
24: #endif
26: #define NDIM 512 27: /* NDIM specifies how many rows at a time we should work with when
28: * performing the vectorized mat-vec. This depends on various factors
29: * such as vector register length, etc., and I really need to add a
30: * way for the user (or the library) to tune this. I'm setting it to
31: * 512 for now since that is what Ed D'Azevedo was using in his Fortran
32: * routines. */
34: typedef struct {
35: PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */
37: PetscInt ngroup;
38: PetscInt *xgroup;
39: /* Denotes where groups of rows with same number of nonzeros
40: * begin and end, i.e., xgroup[i] gives us the position in iperm[]
41: * where the ith group begins. */
43: PetscInt *nzgroup; /* how many nonzeros each row that is a member of group i has. */
44: PetscInt *iperm; /* The permutation vector. */
46: /* Some of this stuff is for Ed's recursive triangular solve.
47: * I'm not sure what I need yet. */
48: PetscInt blocksize;
49: PetscInt nstep;
50: PetscInt *jstart_list;
51: PetscInt *jend_list;
52: PetscInt *action_list;
53: PetscInt *ngroup_list;
54: PetscInt **ipointer_list;
55: PetscInt **xgroup_list;
56: PetscInt **nzgroup_list;
57: PetscInt **iperm_list;
58: } Mat_SeqAIJPERM;
60: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat) 61: {
62: /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */
63: /* so we will ignore 'MatType type'. */
65: Mat B = *newmat;
66: Mat_SeqAIJPERM *aijperm=(Mat_SeqAIJPERM*)A->spptr;
69: if (reuse == MAT_INITIAL_MATRIX) {
70: MatDuplicate(A,MAT_COPY_VALUES,&B);
71: aijperm=(Mat_SeqAIJPERM*)B->spptr;
72: }
74: /* Reset the original function pointers. */
75: B->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
76: B->ops->destroy = MatDestroy_SeqAIJ;
77: B->ops->duplicate = MatDuplicate_SeqAIJ;
78: B->ops->mult = MatMult_SeqAIJ;
79: B->ops->multadd = MatMultAdd_SeqAIJ;
81: PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",NULL);
82: PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijperm_C",NULL);
83: PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijperm_C",NULL);
84: PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijperm_C",NULL);
85: PetscObjectComposeFunction((PetscObject)B,"MatPtAP_is_seqaijperm_C",NULL);
87: /* Free everything in the Mat_SeqAIJPERM data structure.*/
88: PetscFree(aijperm->xgroup);
89: PetscFree(aijperm->nzgroup);
90: PetscFree(aijperm->iperm);
91: PetscFree(B->spptr);
93: /* Change the type of B to MATSEQAIJ. */
94: PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);
96: *newmat = B;
97: return(0);
98: }
100: PetscErrorCode MatDestroy_SeqAIJPERM(Mat A)101: {
103: Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
106: if (aijperm) {
107: /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */
108: PetscFree(aijperm->xgroup);
109: PetscFree(aijperm->nzgroup);
110: PetscFree(aijperm->iperm);
111: PetscFree(A->spptr);
112: }
113: /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
114: * to destroy everything that remains. */
115: PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);
116: /* Note that I don't call MatSetType(). I believe this is because that
117: * is only to be called when *building* a matrix. I could be wrong, but
118: * that is how things work for the SuperLU matrix class. */
119: MatDestroy_SeqAIJ(A);
120: return(0);
121: }
123: PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M)124: {
126: Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
127: Mat_SeqAIJPERM *aijperm_dest;
128: PetscBool perm;
131: MatDuplicate_SeqAIJ(A,op,M);
132: PetscObjectTypeCompare((PetscObject)*M,MATSEQAIJPERM,&perm);
133: if (perm) {
134: aijperm_dest = (Mat_SeqAIJPERM *) (*M)->spptr;
135: PetscFree(aijperm_dest->xgroup);
136: PetscFree(aijperm_dest->nzgroup);
137: PetscFree(aijperm_dest->iperm);
138: } else {
139: PetscNewLog(*M,&aijperm_dest);
140: (*M)->spptr = (void*) aijperm_dest;
141: PetscObjectChangeTypeName((PetscObject)*M,MATSEQAIJPERM);
142: PetscObjectComposeFunction((PetscObject)*M,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);
143: PetscObjectComposeFunction((PetscObject)*M,"MatMatMult_seqdense_seqaijperm_C",MatMatMult_SeqDense_SeqAIJ);
144: PetscObjectComposeFunction((PetscObject)*M,"MatMatMultSymbolic_seqdense_seqaijperm_C",MatMatMultSymbolic_SeqDense_SeqAIJ);
145: PetscObjectComposeFunction((PetscObject)*M,"MatMatMultNumeric_seqdense_seqaijperm_C",MatMatMultNumeric_SeqDense_SeqAIJ);
146: }
147: PetscArraycpy(aijperm_dest,aijperm,1);
148: /* Allocate space for, and copy the grouping and permutation info.
149: * I note that when the groups are initially determined in
150: * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than
151: * necessary. But at this point, we know how large they need to be, and
152: * allocate only the necessary amount of memory. So the duplicated matrix
153: * may actually use slightly less storage than the original! */
154: PetscMalloc1(A->rmap->n, &aijperm_dest->iperm);
155: PetscMalloc1(aijperm->ngroup+1, &aijperm_dest->xgroup);
156: PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup);
157: PetscArraycpy(aijperm_dest->iperm,aijperm->iperm,A->rmap->n);
158: PetscArraycpy(aijperm_dest->xgroup,aijperm->xgroup,aijperm->ngroup+1);
159: PetscArraycpy(aijperm_dest->nzgroup,aijperm->nzgroup,aijperm->ngroup);
160: return(0);
161: }
163: PetscErrorCode MatSeqAIJPERM_create_perm(Mat A)164: {
166: Mat_SeqAIJ *a = (Mat_SeqAIJ*)(A)->data;
167: Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
168: PetscInt m; /* Number of rows in the matrix. */
169: PetscInt *ia; /* From the CSR representation; points to the beginning of each row. */
170: PetscInt maxnz; /* Maximum number of nonzeros in any row. */
171: PetscInt *rows_in_bucket;
172: /* To construct the permutation, we sort each row into one of maxnz
173: * buckets based on how many nonzeros are in the row. */
174: PetscInt nz;
175: PetscInt *nz_in_row; /* the number of nonzero elements in row k. */
176: PetscInt *ipnz;
177: /* When constructing the iperm permutation vector,
178: * ipnz[nz] is used to point to the next place in the permutation vector
179: * that a row with nz nonzero elements should be placed.*/
180: PetscInt i, ngroup, istart, ipos;
183: if (aijperm->nonzerostate == A->nonzerostate) return(0); /* permutation exists and matches current nonzero structure */
184: aijperm->nonzerostate = A->nonzerostate;
185: /* Free anything previously put in the Mat_SeqAIJPERM data structure. */
186: PetscFree(aijperm->xgroup);
187: PetscFree(aijperm->nzgroup);
188: PetscFree(aijperm->iperm);
190: m = A->rmap->n;
191: ia = a->i;
193: /* Allocate the arrays that will hold the permutation vector. */
194: PetscMalloc1(m, &aijperm->iperm);
196: /* Allocate some temporary work arrays that will be used in
197: * calculating the permuation vector and groupings. */
198: PetscMalloc1(m, &nz_in_row);
200: /* Now actually figure out the permutation and grouping. */
202: /* First pass: Determine number of nonzeros in each row, maximum
203: * number of nonzeros in any row, and how many rows fall into each
204: * "bucket" of rows with same number of nonzeros. */
205: maxnz = 0;
206: for (i=0; i<m; i++) {
207: nz_in_row[i] = ia[i+1]-ia[i];
208: if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
209: }
210: PetscMalloc1(PetscMax(maxnz,m)+1, &rows_in_bucket);
211: PetscMalloc1(PetscMax(maxnz,m)+1, &ipnz);
213: for (i=0; i<=maxnz; i++) {
214: rows_in_bucket[i] = 0;
215: }
216: for (i=0; i<m; i++) {
217: nz = nz_in_row[i];
218: rows_in_bucket[nz]++;
219: }
221: /* Allocate space for the grouping info. There will be at most (maxnz + 1)
222: * groups. (It is maxnz + 1 instead of simply maxnz because there may be
223: * rows with no nonzero elements.) If there are (maxnz + 1) groups,
224: * then xgroup[] must consist of (maxnz + 2) elements, since the last
225: * element of xgroup will tell us where the (maxnz + 1)th group ends.
226: * We allocate space for the maximum number of groups;
227: * that is potentially a little wasteful, but not too much so.
228: * Perhaps I should fix it later. */
229: PetscMalloc1(maxnz+2, &aijperm->xgroup);
230: PetscMalloc1(maxnz+1, &aijperm->nzgroup);
232: /* Second pass. Look at what is in the buckets and create the groupings.
233: * Note that it is OK to have a group of rows with no non-zero values. */
234: ngroup = 0;
235: istart = 0;
236: for (i=0; i<=maxnz; i++) {
237: if (rows_in_bucket[i] > 0) {
238: aijperm->nzgroup[ngroup] = i;
239: aijperm->xgroup[ngroup] = istart;
240: ngroup++;
241: istart += rows_in_bucket[i];
242: }
243: }
245: aijperm->xgroup[ngroup] = istart;
246: aijperm->ngroup = ngroup;
248: /* Now fill in the permutation vector iperm. */
249: ipnz[0] = 0;
250: for (i=0; i<maxnz; i++) {
251: ipnz[i+1] = ipnz[i] + rows_in_bucket[i];
252: }
254: for (i=0; i<m; i++) {
255: nz = nz_in_row[i];
256: ipos = ipnz[nz];
257: aijperm->iperm[ipos] = i;
258: ipnz[nz]++;
259: }
261: /* Clean up temporary work arrays. */
262: PetscFree(rows_in_bucket);
263: PetscFree(ipnz);
264: PetscFree(nz_in_row);
265: return(0);
266: }
269: PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode)270: {
272: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
275: if (mode == MAT_FLUSH_ASSEMBLY) return(0);
277: /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some
278: * extra information, call the AssemblyEnd routine for a MATSEQAIJ.
279: * I'm not sure if this is the best way to do this, but it avoids
280: * a lot of code duplication.
281: * I also note that currently MATSEQAIJPERM doesn't know anything about
282: * the Mat_CompressedRow data structure that SeqAIJ now uses when there
283: * are many zero rows. If the SeqAIJ assembly end routine decides to use
284: * this, this may break things. (Don't know... haven't looked at it.) */
285: a->inode.use = PETSC_FALSE;
286: MatAssemblyEnd_SeqAIJ(A, mode);
288: /* Now calculate the permutation and grouping information. */
289: MatSeqAIJPERM_create_perm(A);
290: return(0);
291: }
293: PetscErrorCode MatMult_SeqAIJPERM(Mat A,Vec xx,Vec yy)294: {
295: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
296: const PetscScalar *x;
297: PetscScalar *y;
298: const MatScalar *aa;
299: PetscErrorCode ierr;
300: const PetscInt *aj,*ai;
301: #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking))
302: PetscInt i,j;
303: #endif
304: #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
305: __m512d vec_x,vec_y,vec_vals;
306: __m256i vec_idx,vec_ipos,vec_j;
307: __mmask8 mask;
308: #endif
310: /* Variables that don't appear in MatMult_SeqAIJ. */
311: Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
312: PetscInt *iperm; /* Points to the permutation vector. */
313: PetscInt *xgroup;
314: /* Denotes where groups of rows with same number of nonzeros
315: * begin and end in iperm. */
316: PetscInt *nzgroup;
317: PetscInt ngroup;
318: PetscInt igroup;
319: PetscInt jstart,jend;
320: /* jstart is used in loops to denote the position in iperm where a
321: * group starts; jend denotes the position where it ends.
322: * (jend + 1 is where the next group starts.) */
323: PetscInt iold,nz;
324: PetscInt istart,iend,isize;
325: PetscInt ipos;
326: PetscScalar yp[NDIM];
327: PetscInt ip[NDIM]; /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */
329: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
330: #pragma disjoint(*x,*y,*aa)
331: #endif
334: VecGetArrayRead(xx,&x);
335: VecGetArray(yy,&y);
336: aj = a->j; /* aj[k] gives column index for element aa[k]. */
337: aa = a->a; /* Nonzero elements stored row-by-row. */
338: ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */
340: /* Get the info we need about the permutations and groupings. */
341: iperm = aijperm->iperm;
342: ngroup = aijperm->ngroup;
343: xgroup = aijperm->xgroup;
344: nzgroup = aijperm->nzgroup;
346: #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)
347: fortranmultaijperm_(&m,x,ii,aj,aa,y);
348: #else
350: for (igroup=0; igroup<ngroup; igroup++) {
351: jstart = xgroup[igroup];
352: jend = xgroup[igroup+1] - 1;
353: nz = nzgroup[igroup];
355: /* Handle the special cases where the number of nonzeros per row
356: * in the group is either 0 or 1. */
357: if (nz == 0) {
358: for (i=jstart; i<=jend; i++) {
359: y[iperm[i]] = 0.0;
360: }
361: } else if (nz == 1) {
362: for (i=jstart; i<=jend; i++) {
363: iold = iperm[i];
364: ipos = ai[iold];
365: y[iold] = aa[ipos] * x[aj[ipos]];
366: }
367: } else {
369: /* We work our way through the current group in chunks of NDIM rows
370: * at a time. */
372: for (istart=jstart; istart<=jend; istart+=NDIM) {
373: /* Figure out where the chunk of 'isize' rows ends in iperm.
374: * 'isize may of course be less than NDIM for the last chunk. */
375: iend = istart + (NDIM - 1);
377: if (iend > jend) iend = jend;
379: isize = iend - istart + 1;
381: /* Initialize the yp[] array that will be used to hold part of
382: * the permuted results vector, and figure out where in aa each
383: * row of the chunk will begin. */
384: for (i=0; i<isize; i++) {
385: iold = iperm[istart + i];
386: /* iold is a row number from the matrix A *before* reordering. */
387: ip[i] = ai[iold];
388: /* ip[i] tells us where the ith row of the chunk begins in aa. */
389: yp[i] = (PetscScalar) 0.0;
390: }
392: /* If the number of zeros per row exceeds the number of rows in
393: * the chunk, we should vectorize along nz, that is, perform the
394: * mat-vec one row at a time as in the usual CSR case. */
395: if (nz > isize) {
396: #if defined(PETSC_HAVE_CRAY_VECTOR)
397: #pragma _CRI preferstream
398: #endif
399: for (i=0; i<isize; i++) {
400: #if defined(PETSC_HAVE_CRAY_VECTOR)
401: #pragma _CRI prefervector
402: #endif
404: #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
405: vec_y = _mm512_setzero_pd();
406: ipos = ip[i];
407: for (j=0; j<(nz>>3); j++) {
408: vec_idx = _mm256_loadu_si256((__m256i const*)&aj[ipos]);
409: vec_vals = _mm512_loadu_pd(&aa[ipos]);
410: vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8);
411: vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y);
412: ipos += 8;
413: }
414: if ((nz&0x07)>2) {
415: mask = (__mmask8)(0xff >> (8-(nz&0x07)));
416: vec_idx = _mm256_loadu_si256((__m256i const*)&aj[ipos]);
417: vec_vals = _mm512_loadu_pd(&aa[ipos]);
418: vec_x = _mm512_mask_i32gather_pd(vec_x,mask,vec_idx,x,_MM_SCALE_8);
419: vec_y = _mm512_mask3_fmadd_pd(vec_x,vec_vals,vec_y,mask);
420: } else if ((nz&0x07)==2) {
421: yp[i] += aa[ipos]*x[aj[ipos]];
422: yp[i] += aa[ipos+1]*x[aj[ipos+1]];
423: } else if ((nz&0x07)==1) {
424: yp[i] += aa[ipos]*x[aj[ipos]];
425: }
426: yp[i] += _mm512_reduce_add_pd(vec_y);
427: #else
428: for (j=0; j<nz; j++) {
429: ipos = ip[i] + j;
430: yp[i] += aa[ipos] * x[aj[ipos]];
431: }
432: #endif
433: }
434: } else {
435: /* Otherwise, there are enough rows in the chunk to make it
436: * worthwhile to vectorize across the rows, that is, to do the
437: * matvec by operating with "columns" of the chunk. */
438: for (j=0; j<nz; j++) {
439: #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
440: vec_j = _mm256_set1_epi32(j);
441: for (i=0; i<((isize>>3)<<3); i+=8) {
442: vec_y = _mm512_loadu_pd(&yp[i]);
443: vec_ipos = _mm256_loadu_si256((__m256i const*)&ip[i]);
444: vec_ipos = _mm256_add_epi32(vec_ipos,vec_j);
445: vec_idx = _mm256_i32gather_epi32(aj,vec_ipos,_MM_SCALE_4);
446: vec_vals = _mm512_i32gather_pd(vec_ipos,aa,_MM_SCALE_8);
447: vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8);
448: vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y);
449: _mm512_storeu_pd(&yp[i],vec_y);
450: }
451: for (i=isize-(isize&0x07); i<isize; i++) {
452: ipos = ip[i]+j;
453: yp[i] += aa[ipos]*x[aj[ipos]];
454: }
455: #else
456: for (i=0; i<isize; i++) {
457: ipos = ip[i] + j;
458: yp[i] += aa[ipos] * x[aj[ipos]];
459: }
460: #endif
461: }
462: }
464: #if defined(PETSC_HAVE_CRAY_VECTOR)
465: #pragma _CRI ivdep
466: #endif
467: /* Put results from yp[] into non-permuted result vector y. */
468: for (i=0; i<isize; i++) {
469: y[iperm[istart+i]] = yp[i];
470: }
471: } /* End processing chunk of isize rows of a group. */
472: } /* End handling matvec for chunk with nz > 1. */
473: } /* End loop over igroup. */
474: #endif
475: PetscLogFlops(PetscMax(2.0*a->nz - A->rmap->n,0));
476: VecRestoreArrayRead(xx,&x);
477: VecRestoreArray(yy,&y);
478: return(0);
479: }
482: /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx.
483: * Note that the names I used to designate the vectors differs from that
484: * used in MatMultAdd_SeqAIJ(). I did this to keep my notation consistent
485: * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */
486: /*
487: I hate having virtually identical code for the mult and the multadd!!!
488: */
489: PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A,Vec xx,Vec ww,Vec yy)490: {
491: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
492: const PetscScalar *x;
493: PetscScalar *y,*w;
494: const MatScalar *aa;
495: PetscErrorCode ierr;
496: const PetscInt *aj,*ai;
497: #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
498: PetscInt i,j;
499: #endif
501: /* Variables that don't appear in MatMultAdd_SeqAIJ. */
502: Mat_SeqAIJPERM * aijperm;
503: PetscInt *iperm; /* Points to the permutation vector. */
504: PetscInt *xgroup;
505: /* Denotes where groups of rows with same number of nonzeros
506: * begin and end in iperm. */
507: PetscInt *nzgroup;
508: PetscInt ngroup;
509: PetscInt igroup;
510: PetscInt jstart,jend;
511: /* jstart is used in loops to denote the position in iperm where a
512: * group starts; jend denotes the position where it ends.
513: * (jend + 1 is where the next group starts.) */
514: PetscInt iold,nz;
515: PetscInt istart,iend,isize;
516: PetscInt ipos;
517: PetscScalar yp[NDIM];
518: PetscInt ip[NDIM];
519: /* yp[] and ip[] are treated as vector "registers" for performing
520: * the mat-vec. */
522: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
523: #pragma disjoint(*x,*y,*aa)
524: #endif
527: VecGetArrayRead(xx,&x);
528: VecGetArrayPair(yy,ww,&y,&w);
530: aj = a->j; /* aj[k] gives column index for element aa[k]. */
531: aa = a->a; /* Nonzero elements stored row-by-row. */
532: ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */
534: /* Get the info we need about the permutations and groupings. */
535: aijperm = (Mat_SeqAIJPERM*) A->spptr;
536: iperm = aijperm->iperm;
537: ngroup = aijperm->ngroup;
538: xgroup = aijperm->xgroup;
539: nzgroup = aijperm->nzgroup;
541: #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
542: fortranmultaddaijperm_(&m,x,ii,aj,aa,y,w);
543: #else
545: for (igroup=0; igroup<ngroup; igroup++) {
546: jstart = xgroup[igroup];
547: jend = xgroup[igroup+1] - 1;
549: nz = nzgroup[igroup];
551: /* Handle the special cases where the number of nonzeros per row
552: * in the group is either 0 or 1. */
553: if (nz == 0) {
554: for (i=jstart; i<=jend; i++) {
555: iold = iperm[i];
556: y[iold] = w[iold];
557: }
558: }
559: else if (nz == 1) {
560: for (i=jstart; i<=jend; i++) {
561: iold = iperm[i];
562: ipos = ai[iold];
563: y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
564: }
565: }
566: /* For the general case: */
567: else {
569: /* We work our way through the current group in chunks of NDIM rows
570: * at a time. */
572: for (istart=jstart; istart<=jend; istart+=NDIM) {
573: /* Figure out where the chunk of 'isize' rows ends in iperm.
574: * 'isize may of course be less than NDIM for the last chunk. */
575: iend = istart + (NDIM - 1);
576: if (iend > jend) iend = jend;
577: isize = iend - istart + 1;
579: /* Initialize the yp[] array that will be used to hold part of
580: * the permuted results vector, and figure out where in aa each
581: * row of the chunk will begin. */
582: for (i=0; i<isize; i++) {
583: iold = iperm[istart + i];
584: /* iold is a row number from the matrix A *before* reordering. */
585: ip[i] = ai[iold];
586: /* ip[i] tells us where the ith row of the chunk begins in aa. */
587: yp[i] = w[iold];
588: }
590: /* If the number of zeros per row exceeds the number of rows in
591: * the chunk, we should vectorize along nz, that is, perform the
592: * mat-vec one row at a time as in the usual CSR case. */
593: if (nz > isize) {
594: #if defined(PETSC_HAVE_CRAY_VECTOR)
595: #pragma _CRI preferstream
596: #endif
597: for (i=0; i<isize; i++) {
598: #if defined(PETSC_HAVE_CRAY_VECTOR)
599: #pragma _CRI prefervector
600: #endif
601: for (j=0; j<nz; j++) {
602: ipos = ip[i] + j;
603: yp[i] += aa[ipos] * x[aj[ipos]];
604: }
605: }
606: }
607: /* Otherwise, there are enough rows in the chunk to make it
608: * worthwhile to vectorize across the rows, that is, to do the
609: * matvec by operating with "columns" of the chunk. */
610: else {
611: for (j=0; j<nz; j++) {
612: for (i=0; i<isize; i++) {
613: ipos = ip[i] + j;
614: yp[i] += aa[ipos] * x[aj[ipos]];
615: }
616: }
617: }
619: #if defined(PETSC_HAVE_CRAY_VECTOR)
620: #pragma _CRI ivdep
621: #endif
622: /* Put results from yp[] into non-permuted result vector y. */
623: for (i=0; i<isize; i++) {
624: y[iperm[istart+i]] = yp[i];
625: }
626: } /* End processing chunk of isize rows of a group. */
628: } /* End handling matvec for chunk with nz > 1. */
629: } /* End loop over igroup. */
631: #endif
632: PetscLogFlops(2.0*a->nz);
633: VecRestoreArrayRead(xx,&x);
634: VecRestoreArrayPair(yy,ww,&y,&w);
635: return(0);
636: }
638: /* This function prototype is needed in MatConvert_SeqAIJ_SeqAIJPERM(), below. */
639: PETSC_INTERN PetscErrorCode MatPtAP_IS_XAIJ(Mat,Mat,MatReuse,PetscReal,Mat*);
641: /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a
642: * SeqAIJPERM matrix. This routine is called by the MatCreate_SeqAIJPERM()
643: * routine, but can also be used to convert an assembled SeqAIJ matrix
644: * into a SeqAIJPERM one. */
645: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A,MatType type,MatReuse reuse,Mat *newmat)646: {
648: Mat B = *newmat;
649: Mat_SeqAIJPERM *aijperm;
650: PetscBool sametype;
653: if (reuse == MAT_INITIAL_MATRIX) {
654: MatDuplicate(A,MAT_COPY_VALUES,&B);
655: }
656: PetscObjectTypeCompare((PetscObject)A,type,&sametype);
657: if (sametype) return(0);
659: PetscNewLog(B,&aijperm);
660: B->spptr = (void*) aijperm;
662: /* Set function pointers for methods that we inherit from AIJ but override. */
663: B->ops->duplicate = MatDuplicate_SeqAIJPERM;
664: B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM;
665: B->ops->destroy = MatDestroy_SeqAIJPERM;
666: B->ops->mult = MatMult_SeqAIJPERM;
667: B->ops->multadd = MatMultAdd_SeqAIJPERM;
669: aijperm->nonzerostate = -1; /* this will trigger the generation of the permutation information the first time through MatAssembly()*/
670: /* If A has already been assembled, compute the permutation. */
671: if (A->assembled) {
672: MatSeqAIJPERM_create_perm(B);
673: }
675: PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);
676: PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijperm_C",MatMatMult_SeqDense_SeqAIJ);
677: PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijperm_C",MatMatMultSymbolic_SeqDense_SeqAIJ);
678: PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijperm_C",MatMatMultNumeric_SeqDense_SeqAIJ);
679: PetscObjectComposeFunction((PetscObject)B,"MatPtAP_is_seqaijperm_C",MatPtAP_IS_XAIJ);
681: PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJPERM);
682: *newmat = B;
683: return(0);
684: }
686: /*@C
687: MatCreateSeqAIJPERM - Creates a sparse matrix of type SEQAIJPERM.
688: This type inherits from AIJ, but calculates some additional permutation
689: information that is used to allow better vectorization of some
690: operations. At the cost of increased storage, the AIJ formatted
691: matrix can be copied to a format in which pieces of the matrix are
692: stored in ELLPACK format, allowing the vectorized matrix multiply
693: routine to use stride-1 memory accesses. As with the AIJ type, it is
694: important to preallocate matrix storage in order to get good assembly
695: performance.
697: Collective
699: Input Parameters:
700: + comm - MPI communicator, set to PETSC_COMM_SELF701: . m - number of rows
702: . n - number of columns
703: . nz - number of nonzeros per row (same for all rows)
704: - nnz - array containing the number of nonzeros in the various rows
705: (possibly different for each row) or NULL
707: Output Parameter:
708: . A - the matrix
710: Notes:
711: If nnz is given then nz is ignored
713: Level: intermediate
715: .seealso: MatCreate(), MatCreateMPIAIJPERM(), MatSetValues()
716: @*/
717: PetscErrorCodeMatCreateSeqAIJPERM(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)718: {
722: MatCreate(comm,A);
723: MatSetSizes(*A,m,n,m,n);
724: MatSetType(*A,MATSEQAIJPERM);
725: MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);
726: return(0);
727: }
729: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A)730: {
734: MatSetType(A,MATSEQAIJ);
735: MatConvert_SeqAIJ_SeqAIJPERM(A,MATSEQAIJPERM,MAT_INPLACE_MATRIX,&A);
736: return(0);
737: }