Actual source code: aijperm.c
1: /*
2: Defines basic operations for the MATSEQAIJPERM matrix class.
3: This class is derived from the MATSEQAIJ class and retains the
4: compressed row storage (aka Yale sparse matrix format) but augments
5: it with some permutation information that enables some operations
6: to be more vectorizable. A physically rearranged copy of the matrix
7: may be stored if the user desires.
9: Eventually a variety of permutations may be supported.
10: */
12: #include <../src/mat/impls/aij/seq/aij.h>
14: #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
15: #include <immintrin.h>
17: #if !defined(_MM_SCALE_8)
18: #define _MM_SCALE_8 8
19: #endif
20: #if !defined(_MM_SCALE_4)
21: #define _MM_SCALE_4 4
22: #endif
23: #endif
25: #define NDIM 512
26: /* NDIM specifies how many rows at a time we should work with when
27: * performing the vectorized mat-vec. This depends on various factors
28: * such as vector register length, etc., and I really need to add a
29: * way for the user (or the library) to tune this. I'm setting it to
30: * 512 for now since that is what Ed D'Azevedo was using in his Fortran
31: * routines. */
33: typedef struct {
34: PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */
36: PetscInt ngroup;
37: PetscInt *xgroup;
38: /* Denotes where groups of rows with same number of nonzeros
39: * begin and end, i.e., xgroup[i] gives us the position in iperm[]
40: * where the ith group begins. */
42: PetscInt *nzgroup; /* how many nonzeros each row that is a member of group i has. */
43: PetscInt *iperm; /* The permutation vector. */
45: /* Some of this stuff is for Ed's recursive triangular solve.
46: * I'm not sure what I need yet. */
47: PetscInt blocksize;
48: PetscInt nstep;
49: PetscInt *jstart_list;
50: PetscInt *jend_list;
51: PetscInt *action_list;
52: PetscInt *ngroup_list;
53: PetscInt **ipointer_list;
54: PetscInt **xgroup_list;
55: PetscInt **nzgroup_list;
56: PetscInt **iperm_list;
57: } Mat_SeqAIJPERM;
59: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A, MatType type, MatReuse reuse, Mat *newmat)
60: {
61: /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */
62: /* so we will ignore 'MatType type'. */
63: Mat B = *newmat;
64: Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
66: PetscFunctionBegin;
67: if (reuse == MAT_INITIAL_MATRIX) {
68: PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B));
69: aijperm = (Mat_SeqAIJPERM *)B->spptr;
70: }
72: /* Reset the original function pointers. */
73: B->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
74: B->ops->destroy = MatDestroy_SeqAIJ;
75: B->ops->duplicate = MatDuplicate_SeqAIJ;
76: B->ops->mult = MatMult_SeqAIJ;
77: B->ops->multadd = MatMultAdd_SeqAIJ;
79: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", NULL));
81: /* Free everything in the Mat_SeqAIJPERM data structure.*/
82: PetscCall(PetscFree(aijperm->xgroup));
83: PetscCall(PetscFree(aijperm->nzgroup));
84: PetscCall(PetscFree(aijperm->iperm));
85: PetscCall(PetscFree(B->spptr));
87: /* Change the type of B to MATSEQAIJ. */
88: PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ));
90: *newmat = B;
91: PetscFunctionReturn(PETSC_SUCCESS);
92: }
94: static PetscErrorCode MatDestroy_SeqAIJPERM(Mat A)
95: {
96: Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
98: PetscFunctionBegin;
99: if (aijperm) {
100: /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */
101: PetscCall(PetscFree(aijperm->xgroup));
102: PetscCall(PetscFree(aijperm->nzgroup));
103: PetscCall(PetscFree(aijperm->iperm));
104: PetscCall(PetscFree(A->spptr));
105: }
106: /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
107: * to destroy everything that remains. */
108: PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ));
109: /* Note that I don't call MatSetType(). I believe this is because that
110: * is only to be called when *building* a matrix. I could be wrong, but
111: * that is how things work for the SuperLU matrix class. */
112: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijperm_seqaij_C", NULL));
113: PetscCall(MatDestroy_SeqAIJ(A));
114: PetscFunctionReturn(PETSC_SUCCESS);
115: }
117: static PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M)
118: {
119: Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
120: Mat_SeqAIJPERM *aijperm_dest;
121: PetscBool perm;
123: PetscFunctionBegin;
124: PetscCall(MatDuplicate_SeqAIJ(A, op, M));
125: PetscCall(PetscObjectTypeCompare((PetscObject)*M, MATSEQAIJPERM, &perm));
126: if (perm) {
127: aijperm_dest = (Mat_SeqAIJPERM *)(*M)->spptr;
128: PetscCall(PetscFree(aijperm_dest->xgroup));
129: PetscCall(PetscFree(aijperm_dest->nzgroup));
130: PetscCall(PetscFree(aijperm_dest->iperm));
131: } else {
132: PetscCall(PetscNew(&aijperm_dest));
133: (*M)->spptr = (void *)aijperm_dest;
134: PetscCall(PetscObjectChangeTypeName((PetscObject)*M, MATSEQAIJPERM));
135: PetscCall(PetscObjectComposeFunction((PetscObject)*M, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ));
136: }
137: PetscCall(PetscArraycpy(aijperm_dest, aijperm, 1));
138: /* Allocate space for, and copy the grouping and permutation info.
139: * I note that when the groups are initially determined in
140: * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than
141: * necessary. But at this point, we know how large they need to be, and
142: * allocate only the necessary amount of memory. So the duplicated matrix
143: * may actually use slightly less storage than the original! */
144: PetscCall(PetscMalloc1(A->rmap->n, &aijperm_dest->iperm));
145: PetscCall(PetscMalloc1(aijperm->ngroup + 1, &aijperm_dest->xgroup));
146: PetscCall(PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup));
147: PetscCall(PetscArraycpy(aijperm_dest->iperm, aijperm->iperm, A->rmap->n));
148: PetscCall(PetscArraycpy(aijperm_dest->xgroup, aijperm->xgroup, aijperm->ngroup + 1));
149: PetscCall(PetscArraycpy(aijperm_dest->nzgroup, aijperm->nzgroup, aijperm->ngroup));
150: PetscFunctionReturn(PETSC_SUCCESS);
151: }
153: static PetscErrorCode MatSeqAIJPERM_create_perm(Mat A)
154: {
155: Mat_SeqAIJ *a = (Mat_SeqAIJ *)(A)->data;
156: Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
157: PetscInt m; /* Number of rows in the matrix. */
158: PetscInt *ia; /* From the CSR representation; points to the beginning of each row. */
159: PetscInt maxnz; /* Maximum number of nonzeros in any row. */
160: PetscInt *rows_in_bucket;
161: /* To construct the permutation, we sort each row into one of maxnz
162: * buckets based on how many nonzeros are in the row. */
163: PetscInt nz;
164: PetscInt *nz_in_row; /* the number of nonzero elements in row k. */
165: PetscInt *ipnz;
166: /* When constructing the iperm permutation vector,
167: * ipnz[nz] is used to point to the next place in the permutation vector
168: * that a row with nz nonzero elements should be placed.*/
169: PetscInt i, ngroup, istart, ipos;
171: PetscFunctionBegin;
172: if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(PETSC_SUCCESS); /* permutation exists and matches current nonzero structure */
173: aijperm->nonzerostate = A->nonzerostate;
174: /* Free anything previously put in the Mat_SeqAIJPERM data structure. */
175: PetscCall(PetscFree(aijperm->xgroup));
176: PetscCall(PetscFree(aijperm->nzgroup));
177: PetscCall(PetscFree(aijperm->iperm));
179: m = A->rmap->n;
180: ia = a->i;
182: /* Allocate the arrays that will hold the permutation vector. */
183: PetscCall(PetscMalloc1(m, &aijperm->iperm));
185: /* Allocate some temporary work arrays that will be used in
186: * calculating the permutation vector and groupings. */
187: PetscCall(PetscMalloc1(m, &nz_in_row));
189: /* Now actually figure out the permutation and grouping. */
191: /* First pass: Determine number of nonzeros in each row, maximum
192: * number of nonzeros in any row, and how many rows fall into each
193: * "bucket" of rows with same number of nonzeros. */
194: maxnz = 0;
195: for (i = 0; i < m; i++) {
196: nz_in_row[i] = ia[i + 1] - ia[i];
197: if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
198: }
199: PetscCall(PetscMalloc1(PetscMax(maxnz, m) + 1, &rows_in_bucket));
200: PetscCall(PetscMalloc1(PetscMax(maxnz, m) + 1, &ipnz));
202: for (i = 0; i <= maxnz; i++) rows_in_bucket[i] = 0;
203: for (i = 0; i < m; i++) {
204: nz = nz_in_row[i];
205: rows_in_bucket[nz]++;
206: }
208: /* Allocate space for the grouping info. There will be at most (maxnz + 1)
209: * groups. (It is maxnz + 1 instead of simply maxnz because there may be
210: * rows with no nonzero elements.) If there are (maxnz + 1) groups,
211: * then xgroup[] must consist of (maxnz + 2) elements, since the last
212: * element of xgroup will tell us where the (maxnz + 1)th group ends.
213: * We allocate space for the maximum number of groups;
214: * that is potentially a little wasteful, but not too much so.
215: * Perhaps I should fix it later. */
216: PetscCall(PetscMalloc1(maxnz + 2, &aijperm->xgroup));
217: PetscCall(PetscMalloc1(maxnz + 1, &aijperm->nzgroup));
219: /* Second pass. Look at what is in the buckets and create the groupings.
220: * Note that it is OK to have a group of rows with no non-zero values. */
221: ngroup = 0;
222: istart = 0;
223: for (i = 0; i <= maxnz; i++) {
224: if (rows_in_bucket[i] > 0) {
225: aijperm->nzgroup[ngroup] = i;
226: aijperm->xgroup[ngroup] = istart;
227: ngroup++;
228: istart += rows_in_bucket[i];
229: }
230: }
232: aijperm->xgroup[ngroup] = istart;
233: aijperm->ngroup = ngroup;
235: /* Now fill in the permutation vector iperm. */
236: ipnz[0] = 0;
237: for (i = 0; i < maxnz; i++) ipnz[i + 1] = ipnz[i] + rows_in_bucket[i];
239: for (i = 0; i < m; i++) {
240: nz = nz_in_row[i];
241: ipos = ipnz[nz];
242: aijperm->iperm[ipos] = i;
243: ipnz[nz]++;
244: }
246: /* Clean up temporary work arrays. */
247: PetscCall(PetscFree(rows_in_bucket));
248: PetscCall(PetscFree(ipnz));
249: PetscCall(PetscFree(nz_in_row));
250: PetscFunctionReturn(PETSC_SUCCESS);
251: }
253: static PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode)
254: {
255: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
257: PetscFunctionBegin;
258: if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
260: /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some
261: * extra information, call the AssemblyEnd routine for a MATSEQAIJ.
262: * I'm not sure if this is the best way to do this, but it avoids
263: * a lot of code duplication.
264: * I also note that currently MATSEQAIJPERM doesn't know anything about
265: * the Mat_CompressedRow data structure that SeqAIJ now uses when there
266: * are many zero rows. If the SeqAIJ assembly end routine decides to use
267: * this, this may break things. (Don't know... haven't looked at it.) */
268: a->inode.use = PETSC_FALSE;
269: PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
271: /* Now calculate the permutation and grouping information. */
272: PetscCall(MatSeqAIJPERM_create_perm(A));
273: PetscFunctionReturn(PETSC_SUCCESS);
274: }
276: static PetscErrorCode MatMult_SeqAIJPERM(Mat A, Vec xx, Vec yy)
277: {
278: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
279: const PetscScalar *x;
280: PetscScalar *y;
281: const MatScalar *aa;
282: const PetscInt *aj, *ai;
283: #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking))
284: PetscInt i, j;
285: #endif
286: #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
287: __m512d vec_x, vec_y, vec_vals;
288: __m256i vec_idx, vec_ipos, vec_j;
289: __mmask8 mask;
290: #endif
292: /* Variables that don't appear in MatMult_SeqAIJ. */
293: Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
294: PetscInt *iperm; /* Points to the permutation vector. */
295: PetscInt *xgroup;
296: /* Denotes where groups of rows with same number of nonzeros
297: * begin and end in iperm. */
298: PetscInt *nzgroup;
299: PetscInt ngroup;
300: PetscInt igroup;
301: PetscInt jstart, jend;
302: /* jstart is used in loops to denote the position in iperm where a
303: * group starts; jend denotes the position where it ends.
304: * (jend + 1 is where the next group starts.) */
305: PetscInt iold, nz;
306: PetscInt istart, iend, isize;
307: PetscInt ipos;
308: PetscScalar yp[NDIM];
309: PetscInt ip[NDIM]; /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */
311: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
312: #pragma disjoint(*x, *y, *aa)
313: #endif
315: PetscFunctionBegin;
316: PetscCall(VecGetArrayRead(xx, &x));
317: PetscCall(VecGetArray(yy, &y));
318: aj = a->j; /* aj[k] gives column index for element aa[k]. */
319: aa = a->a; /* Nonzero elements stored row-by-row. */
320: ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */
322: /* Get the info we need about the permutations and groupings. */
323: iperm = aijperm->iperm;
324: ngroup = aijperm->ngroup;
325: xgroup = aijperm->xgroup;
326: nzgroup = aijperm->nzgroup;
328: #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)
329: fortranmultaijperm_(&m, x, ii, aj, aa, y);
330: #else
332: for (igroup = 0; igroup < ngroup; igroup++) {
333: jstart = xgroup[igroup];
334: jend = xgroup[igroup + 1] - 1;
335: nz = nzgroup[igroup];
337: /* Handle the special cases where the number of nonzeros per row
338: * in the group is either 0 or 1. */
339: if (nz == 0) {
340: for (i = jstart; i <= jend; i++) y[iperm[i]] = 0.0;
341: } else if (nz == 1) {
342: for (i = jstart; i <= jend; i++) {
343: iold = iperm[i];
344: ipos = ai[iold];
345: y[iold] = aa[ipos] * x[aj[ipos]];
346: }
347: } else {
348: /* We work our way through the current group in chunks of NDIM rows
349: * at a time. */
351: for (istart = jstart; istart <= jend; istart += NDIM) {
352: /* Figure out where the chunk of 'isize' rows ends in iperm.
353: * 'isize may of course be less than NDIM for the last chunk. */
354: iend = istart + (NDIM - 1);
356: if (iend > jend) iend = jend;
358: isize = iend - istart + 1;
360: /* Initialize the yp[] array that will be used to hold part of
361: * the permuted results vector, and figure out where in aa each
362: * row of the chunk will begin. */
363: for (i = 0; i < isize; i++) {
364: iold = iperm[istart + i];
365: /* iold is a row number from the matrix A *before* reordering. */
366: ip[i] = ai[iold];
367: /* ip[i] tells us where the ith row of the chunk begins in aa. */
368: yp[i] = (PetscScalar)0.0;
369: }
371: /* If the number of zeros per row exceeds the number of rows in
372: * the chunk, we should vectorize along nz, that is, perform the
373: * mat-vec one row at a time as in the usual CSR case. */
374: if (nz > isize) {
375: #if defined(PETSC_HAVE_CRAY_VECTOR)
376: #pragma _CRI preferstream
377: #endif
378: for (i = 0; i < isize; i++) {
379: #if defined(PETSC_HAVE_CRAY_VECTOR)
380: #pragma _CRI prefervector
381: #endif
383: #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
384: vec_y = _mm512_setzero_pd();
385: ipos = ip[i];
386: for (j = 0; j < (nz >> 3); j++) {
387: vec_idx = _mm256_loadu_si256((__m256i const *)&aj[ipos]);
388: vec_vals = _mm512_loadu_pd(&aa[ipos]);
389: vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8);
390: vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y);
391: ipos += 8;
392: }
393: if ((nz & 0x07) > 2) {
394: mask = (__mmask8)(0xff >> (8 - (nz & 0x07)));
395: vec_idx = _mm256_loadu_si256((__m256i const *)&aj[ipos]);
396: vec_vals = _mm512_loadu_pd(&aa[ipos]);
397: vec_x = _mm512_mask_i32gather_pd(vec_x, mask, vec_idx, x, _MM_SCALE_8);
398: vec_y = _mm512_mask3_fmadd_pd(vec_x, vec_vals, vec_y, mask);
399: } else if ((nz & 0x07) == 2) {
400: yp[i] += aa[ipos] * x[aj[ipos]];
401: yp[i] += aa[ipos + 1] * x[aj[ipos + 1]];
402: } else if ((nz & 0x07) == 1) {
403: yp[i] += aa[ipos] * x[aj[ipos]];
404: }
405: yp[i] += _mm512_reduce_add_pd(vec_y);
406: #else
407: for (j = 0; j < nz; j++) {
408: ipos = ip[i] + j;
409: yp[i] += aa[ipos] * x[aj[ipos]];
410: }
411: #endif
412: }
413: } else {
414: /* Otherwise, there are enough rows in the chunk to make it
415: * worthwhile to vectorize across the rows, that is, to do the
416: * matvec by operating with "columns" of the chunk. */
417: for (j = 0; j < nz; j++) {
418: #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
419: vec_j = _mm256_set1_epi32(j);
420: for (i = 0; i < ((isize >> 3) << 3); i += 8) {
421: vec_y = _mm512_loadu_pd(&yp[i]);
422: vec_ipos = _mm256_loadu_si256((__m256i const *)&ip[i]);
423: vec_ipos = _mm256_add_epi32(vec_ipos, vec_j);
424: vec_idx = _mm256_i32gather_epi32(aj, vec_ipos, _MM_SCALE_4);
425: vec_vals = _mm512_i32gather_pd(vec_ipos, aa, _MM_SCALE_8);
426: vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8);
427: vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y);
428: _mm512_storeu_pd(&yp[i], vec_y);
429: }
430: for (i = isize - (isize & 0x07); i < isize; i++) {
431: ipos = ip[i] + j;
432: yp[i] += aa[ipos] * x[aj[ipos]];
433: }
434: #else
435: for (i = 0; i < isize; i++) {
436: ipos = ip[i] + j;
437: yp[i] += aa[ipos] * x[aj[ipos]];
438: }
439: #endif
440: }
441: }
443: #if defined(PETSC_HAVE_CRAY_VECTOR)
444: #pragma _CRI ivdep
445: #endif
446: /* Put results from yp[] into non-permuted result vector y. */
447: for (i = 0; i < isize; i++) y[iperm[istart + i]] = yp[i];
448: } /* End processing chunk of isize rows of a group. */
449: } /* End handling matvec for chunk with nz > 1. */
450: } /* End loop over igroup. */
451: #endif
452: PetscCall(PetscLogFlops(PetscMax(2.0 * a->nz - A->rmap->n, 0)));
453: PetscCall(VecRestoreArrayRead(xx, &x));
454: PetscCall(VecRestoreArray(yy, &y));
455: PetscFunctionReturn(PETSC_SUCCESS);
456: }
458: /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx.
459: * Note that the names I used to designate the vectors differs from that
460: * used in MatMultAdd_SeqAIJ(). I did this to keep my notation consistent
461: * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */
462: /*
463: I hate having virtually identical code for the mult and the multadd!!!
464: */
465: static PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A, Vec xx, Vec ww, Vec yy)
466: {
467: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
468: const PetscScalar *x;
469: PetscScalar *y, *w;
470: const MatScalar *aa;
471: const PetscInt *aj, *ai;
472: #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
473: PetscInt i, j;
474: #endif
476: /* Variables that don't appear in MatMultAdd_SeqAIJ. */
477: Mat_SeqAIJPERM *aijperm;
478: PetscInt *iperm; /* Points to the permutation vector. */
479: PetscInt *xgroup;
480: /* Denotes where groups of rows with same number of nonzeros
481: * begin and end in iperm. */
482: PetscInt *nzgroup;
483: PetscInt ngroup;
484: PetscInt igroup;
485: PetscInt jstart, jend;
486: /* jstart is used in loops to denote the position in iperm where a
487: * group starts; jend denotes the position where it ends.
488: * (jend + 1 is where the next group starts.) */
489: PetscInt iold, nz;
490: PetscInt istart, iend, isize;
491: PetscInt ipos;
492: PetscScalar yp[NDIM];
493: PetscInt ip[NDIM];
494: /* yp[] and ip[] are treated as vector "registers" for performing
495: * the mat-vec. */
497: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
498: #pragma disjoint(*x, *y, *aa)
499: #endif
501: PetscFunctionBegin;
502: PetscCall(VecGetArrayRead(xx, &x));
503: PetscCall(VecGetArrayPair(yy, ww, &y, &w));
505: aj = a->j; /* aj[k] gives column index for element aa[k]. */
506: aa = a->a; /* Nonzero elements stored row-by-row. */
507: ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */
509: /* Get the info we need about the permutations and groupings. */
510: aijperm = (Mat_SeqAIJPERM *)A->spptr;
511: iperm = aijperm->iperm;
512: ngroup = aijperm->ngroup;
513: xgroup = aijperm->xgroup;
514: nzgroup = aijperm->nzgroup;
516: #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
517: fortranmultaddaijperm_(&m, x, ii, aj, aa, y, w);
518: #else
520: for (igroup = 0; igroup < ngroup; igroup++) {
521: jstart = xgroup[igroup];
522: jend = xgroup[igroup + 1] - 1;
524: nz = nzgroup[igroup];
526: /* Handle the special cases where the number of nonzeros per row
527: * in the group is either 0 or 1. */
528: if (nz == 0) {
529: for (i = jstart; i <= jend; i++) {
530: iold = iperm[i];
531: y[iold] = w[iold];
532: }
533: } else if (nz == 1) {
534: for (i = jstart; i <= jend; i++) {
535: iold = iperm[i];
536: ipos = ai[iold];
537: y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
538: }
539: }
540: /* For the general case: */
541: else {
542: /* We work our way through the current group in chunks of NDIM rows
543: * at a time. */
545: for (istart = jstart; istart <= jend; istart += NDIM) {
546: /* Figure out where the chunk of 'isize' rows ends in iperm.
547: * 'isize may of course be less than NDIM for the last chunk. */
548: iend = istart + (NDIM - 1);
549: if (iend > jend) iend = jend;
550: isize = iend - istart + 1;
552: /* Initialize the yp[] array that will be used to hold part of
553: * the permuted results vector, and figure out where in aa each
554: * row of the chunk will begin. */
555: for (i = 0; i < isize; i++) {
556: iold = iperm[istart + i];
557: /* iold is a row number from the matrix A *before* reordering. */
558: ip[i] = ai[iold];
559: /* ip[i] tells us where the ith row of the chunk begins in aa. */
560: yp[i] = w[iold];
561: }
563: /* If the number of zeros per row exceeds the number of rows in
564: * the chunk, we should vectorize along nz, that is, perform the
565: * mat-vec one row at a time as in the usual CSR case. */
566: if (nz > isize) {
567: #if defined(PETSC_HAVE_CRAY_VECTOR)
568: #pragma _CRI preferstream
569: #endif
570: for (i = 0; i < isize; i++) {
571: #if defined(PETSC_HAVE_CRAY_VECTOR)
572: #pragma _CRI prefervector
573: #endif
574: for (j = 0; j < nz; j++) {
575: ipos = ip[i] + j;
576: yp[i] += aa[ipos] * x[aj[ipos]];
577: }
578: }
579: }
580: /* Otherwise, there are enough rows in the chunk to make it
581: * worthwhile to vectorize across the rows, that is, to do the
582: * matvec by operating with "columns" of the chunk. */
583: else {
584: for (j = 0; j < nz; j++) {
585: for (i = 0; i < isize; i++) {
586: ipos = ip[i] + j;
587: yp[i] += aa[ipos] * x[aj[ipos]];
588: }
589: }
590: }
592: #if defined(PETSC_HAVE_CRAY_VECTOR)
593: #pragma _CRI ivdep
594: #endif
595: /* Put results from yp[] into non-permuted result vector y. */
596: for (i = 0; i < isize; i++) y[iperm[istart + i]] = yp[i];
597: } /* End processing chunk of isize rows of a group. */
599: } /* End handling matvec for chunk with nz > 1. */
600: } /* End loop over igroup. */
602: #endif
603: PetscCall(PetscLogFlops(2.0 * a->nz));
604: PetscCall(VecRestoreArrayRead(xx, &x));
605: PetscCall(VecRestoreArrayPair(yy, ww, &y, &w));
606: PetscFunctionReturn(PETSC_SUCCESS);
607: }
609: /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a
610: * SeqAIJPERM matrix. This routine is called by the MatCreate_SeqAIJPERM()
611: * routine, but can also be used to convert an assembled SeqAIJ matrix
612: * into a SeqAIJPERM one. */
613: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A, MatType type, MatReuse reuse, Mat *newmat)
614: {
615: Mat B = *newmat;
616: Mat_SeqAIJPERM *aijperm;
617: PetscBool sametype;
619: PetscFunctionBegin;
620: if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B));
621: PetscCall(PetscObjectTypeCompare((PetscObject)A, type, &sametype));
622: if (sametype) PetscFunctionReturn(PETSC_SUCCESS);
624: PetscCall(PetscNew(&aijperm));
625: B->spptr = (void *)aijperm;
627: /* Set function pointers for methods that we inherit from AIJ but override. */
628: B->ops->duplicate = MatDuplicate_SeqAIJPERM;
629: B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM;
630: B->ops->destroy = MatDestroy_SeqAIJPERM;
631: B->ops->mult = MatMult_SeqAIJPERM;
632: B->ops->multadd = MatMultAdd_SeqAIJPERM;
634: aijperm->nonzerostate = -1; /* this will trigger the generation of the permutation information the first time through MatAssembly()*/
635: /* If A has already been assembled, compute the permutation. */
636: if (A->assembled) PetscCall(MatSeqAIJPERM_create_perm(B));
638: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ));
640: PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJPERM));
641: *newmat = B;
642: PetscFunctionReturn(PETSC_SUCCESS);
643: }
645: /*@C
646: MatCreateSeqAIJPERM - Creates a sparse matrix of type `MATSEQAIJPERM`.
648: Collective
650: Input Parameters:
651: + comm - MPI communicator, set to `PETSC_COMM_SELF`
652: . m - number of rows
653: . n - number of columns
654: . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is given
655: - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
657: Output Parameter:
658: . A - the matrix
660: Level: intermediate
662: Notes:
663: This type inherits from `MATSEQAIJ`, but calculates some additional permutation information
664: that is used to allow better vectorization of some operations. At the cost of increased
665: storage, the `MATSEQAIJ` formatted matrix can be copied to a format in which pieces of the
666: matrix are stored in ELLPACK format, allowing the vectorized matrix multiply routine to use
667: stride-1 memory accesses.
669: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateMPIAIJPERM()`, `MatSetValues()`
670: @*/
671: PetscErrorCode MatCreateSeqAIJPERM(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
672: {
673: PetscFunctionBegin;
674: PetscCall(MatCreate(comm, A));
675: PetscCall(MatSetSizes(*A, m, n, m, n));
676: PetscCall(MatSetType(*A, MATSEQAIJPERM));
677: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, nnz));
678: PetscFunctionReturn(PETSC_SUCCESS);
679: }
681: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A)
682: {
683: PetscFunctionBegin;
684: PetscCall(MatSetType(A, MATSEQAIJ));
685: PetscCall(MatConvert_SeqAIJ_SeqAIJPERM(A, MATSEQAIJPERM, MAT_INPLACE_MATRIX, &A));
686: PetscFunctionReturn(PETSC_SUCCESS);
687: }