Actual source code: inode.c
2: /*
3: This file provides high performance routines for the Inode format (compressed sparse row)
4: by taking advantage of rows with identical nonzero structure (I-nodes).
5: */
6: #include <../src/mat/impls/aij/seq/aij.h>
8: static PetscErrorCode MatCreateColInode_Private(Mat A,PetscInt *size,PetscInt **ns)
9: {
10: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
11: PetscInt i,count,m,n,min_mn,*ns_row,*ns_col;
13: n = A->cmap->n;
14: m = A->rmap->n;
16: ns_row = a->inode.size;
18: min_mn = (m < n) ? m : n;
19: if (!ns) {
20: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
21: for (; count+1 < n; count++,i++) ;
22: if (count < n) {
23: i++;
24: }
25: *size = i;
26: return 0;
27: }
28: PetscMalloc1(n+1,&ns_col);
30: /* Use the same row structure wherever feasible. */
31: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
32: ns_col[i] = ns_row[i];
33: }
35: /* if m < n; pad up the remainder with inode_limit */
36: for (; count+1 < n; count++,i++) {
37: ns_col[i] = 1;
38: }
39: /* The last node is the odd ball. padd it up with the remaining rows; */
40: if (count < n) {
41: ns_col[i] = n - count;
42: i++;
43: } else if (count > n) {
44: /* Adjust for the over estimation */
45: ns_col[i-1] += n - count;
46: }
47: *size = i;
48: *ns = ns_col;
49: return 0;
50: }
52: /*
53: This builds symmetric version of nonzero structure,
54: */
55: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
56: {
57: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
58: PetscInt *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
59: PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
60: const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;
62: nslim_row = a->inode.node_count;
63: m = A->rmap->n;
64: n = A->cmap->n;
68: /* Use the row_inode as column_inode */
69: nslim_col = nslim_row;
70: ns_col = ns_row;
72: /* allocate space for reformated inode structure */
73: PetscMalloc2(nslim_col+1,&tns,n+1,&tvc);
74: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
76: for (i1=0,col=0; i1<nslim_col; ++i1) {
77: nsz = ns_col[i1];
78: for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
79: }
80: /* allocate space for row pointers */
81: PetscCalloc1(nslim_row+1,&ia);
82: *iia = ia;
83: PetscMalloc1(nslim_row+1,&work);
85: /* determine the number of columns in each row */
86: ia[0] = oshift;
87: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
89: j = aj + ai[row] + ishift;
90: jmax = aj + ai[row+1] + ishift;
91: if (j==jmax) continue; /* empty row */
92: col = *j++ + ishift;
93: i2 = tvc[col];
94: while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
95: ia[i1+1]++;
96: ia[i2+1]++;
97: i2++; /* Start col of next node */
98: while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
99: i2 = tvc[col];
100: }
101: if (i2 == i1) ia[i2+1]++; /* now the diagonal element */
102: }
104: /* shift ia[i] to point to next row */
105: for (i1=1; i1<nslim_row+1; i1++) {
106: row = ia[i1-1];
107: ia[i1] += row;
108: work[i1-1] = row - oshift;
109: }
111: /* allocate space for column pointers */
112: nz = ia[nslim_row] + (!ishift);
113: PetscMalloc1(nz,&ja);
114: *jja = ja;
116: /* loop over lower triangular part putting into ja */
117: for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
118: j = aj + ai[row] + ishift;
119: jmax = aj + ai[row+1] + ishift;
120: if (j==jmax) continue; /* empty row */
121: col = *j++ + ishift;
122: i2 = tvc[col];
123: while (i2<i1 && j<jmax) {
124: ja[work[i2]++] = i1 + oshift;
125: ja[work[i1]++] = i2 + oshift;
126: ++i2;
127: while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
128: i2 = tvc[col];
129: }
130: if (i2 == i1) ja[work[i1]++] = i2 + oshift;
132: }
133: PetscFree(work);
134: PetscFree2(tns,tvc);
135: return 0;
136: }
138: /*
139: This builds nonsymmetric version of nonzero structure,
140: */
141: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
142: {
143: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
144: PetscInt *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
145: PetscInt *tns,*tvc,nsz,i1,i2;
146: const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;
149: nslim_row = a->inode.node_count;
150: n = A->cmap->n;
152: /* Create The column_inode for this matrix */
153: MatCreateColInode_Private(A,&nslim_col,&ns_col);
155: /* allocate space for reformated column_inode structure */
156: PetscMalloc2(nslim_col +1,&tns,n + 1,&tvc);
157: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
159: for (i1=0,col=0; i1<nslim_col; ++i1) {
160: nsz = ns_col[i1];
161: for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
162: }
163: /* allocate space for row pointers */
164: PetscCalloc1(nslim_row+1,&ia);
165: *iia = ia;
166: PetscMalloc1(nslim_row+1,&work);
168: /* determine the number of columns in each row */
169: ia[0] = oshift;
170: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
171: j = aj + ai[row] + ishift;
172: nz = ai[row+1] - ai[row];
173: if (!nz) continue; /* empty row */
174: col = *j++ + ishift;
175: i2 = tvc[col];
176: while (nz-- > 0) { /* off-diagonal elements */
177: ia[i1+1]++;
178: i2++; /* Start col of next node */
179: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
180: if (nz > 0) i2 = tvc[col];
181: }
182: }
184: /* shift ia[i] to point to next row */
185: for (i1=1; i1<nslim_row+1; i1++) {
186: row = ia[i1-1];
187: ia[i1] += row;
188: work[i1-1] = row - oshift;
189: }
191: /* allocate space for column pointers */
192: nz = ia[nslim_row] + (!ishift);
193: PetscMalloc1(nz,&ja);
194: *jja = ja;
196: /* loop over matrix putting into ja */
197: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
198: j = aj + ai[row] + ishift;
199: nz = ai[row+1] - ai[row];
200: if (!nz) continue; /* empty row */
201: col = *j++ + ishift;
202: i2 = tvc[col];
203: while (nz-- > 0) {
204: ja[work[i1]++] = i2 + oshift;
205: ++i2;
206: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
207: if (nz > 0) i2 = tvc[col];
208: }
209: }
210: PetscFree(ns_col);
211: PetscFree(work);
212: PetscFree2(tns,tvc);
213: return 0;
214: }
216: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
217: {
218: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
220: *n = a->inode.node_count;
221: if (!ia) return 0;
222: if (!blockcompressed) {
223: MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);
224: } else if (symmetric) {
225: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
226: } else {
227: MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
228: }
229: return 0;
230: }
232: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
233: {
234: if (!ia) return 0;
236: if (!blockcompressed) {
237: MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);
238: } else {
239: PetscFree(*ia);
240: PetscFree(*ja);
241: }
242: return 0;
243: }
245: /* ----------------------------------------------------------- */
247: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
248: {
249: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
250: PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
251: PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
254: nslim_row = a->inode.node_count;
255: n = A->cmap->n;
257: /* Create The column_inode for this matrix */
258: MatCreateColInode_Private(A,&nslim_col,&ns_col);
260: /* allocate space for reformated column_inode structure */
261: PetscMalloc2(nslim_col + 1,&tns,n + 1,&tvc);
262: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
264: for (i1=0,col=0; i1<nslim_col; ++i1) {
265: nsz = ns_col[i1];
266: for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
267: }
268: /* allocate space for column pointers */
269: PetscCalloc1(nslim_col+1,&ia);
270: *iia = ia;
271: PetscMalloc1(nslim_col+1,&work);
273: /* determine the number of columns in each row */
274: ia[0] = oshift;
275: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
276: j = aj + ai[row] + ishift;
277: col = *j++ + ishift;
278: i2 = tvc[col];
279: nz = ai[row+1] - ai[row];
280: while (nz-- > 0) { /* off-diagonal elements */
281: /* ia[i1+1]++; */
282: ia[i2+1]++;
283: i2++;
284: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
285: if (nz > 0) i2 = tvc[col];
286: }
287: }
289: /* shift ia[i] to point to next col */
290: for (i1=1; i1<nslim_col+1; i1++) {
291: col = ia[i1-1];
292: ia[i1] += col;
293: work[i1-1] = col - oshift;
294: }
296: /* allocate space for column pointers */
297: nz = ia[nslim_col] + (!ishift);
298: PetscMalloc1(nz,&ja);
299: *jja = ja;
301: /* loop over matrix putting into ja */
302: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
303: j = aj + ai[row] + ishift;
304: col = *j++ + ishift;
305: i2 = tvc[col];
306: nz = ai[row+1] - ai[row];
307: while (nz-- > 0) {
308: /* ja[work[i1]++] = i2 + oshift; */
309: ja[work[i2]++] = i1 + oshift;
310: i2++;
311: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
312: if (nz > 0) i2 = tvc[col];
313: }
314: }
315: PetscFree(ns_col);
316: PetscFree(work);
317: PetscFree2(tns,tvc);
318: return 0;
319: }
321: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
322: {
323: MatCreateColInode_Private(A,n,NULL);
324: if (!ia) return 0;
326: if (!blockcompressed) {
327: MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);
328: } else if (symmetric) {
329: /* Since the indices are symmetric it doesn't matter */
330: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
331: } else {
332: MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
333: }
334: return 0;
335: }
337: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
338: {
339: if (!ia) return 0;
340: if (!blockcompressed) {
341: MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);
342: } else {
343: PetscFree(*ia);
344: PetscFree(*ja);
345: }
346: return 0;
347: }
349: /* ----------------------------------------------------------- */
351: PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
352: {
353: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
354: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
355: PetscScalar *y;
356: const PetscScalar *x;
357: const MatScalar *v1,*v2,*v3,*v4,*v5;
358: PetscInt i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
359: const PetscInt *idx,*ns,*ii;
361: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
362: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
363: #endif
366: node_max = a->inode.node_count;
367: ns = a->inode.size; /* Node Size array */
368: VecGetArrayRead(xx,&x);
369: VecGetArray(yy,&y);
370: idx = a->j;
371: v1 = a->a;
372: ii = a->i;
374: for (i = 0,row = 0; i< node_max; ++i) {
375: nsz = ns[i];
376: n = ii[1] - ii[0];
377: nonzerorow += (n>0)*nsz;
378: ii += nsz;
379: PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */
380: PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */
381: sz = n; /* No of non zeros in this row */
382: /* Switch on the size of Node */
383: switch (nsz) { /* Each loop in 'case' is unrolled */
384: case 1:
385: sum1 = 0.;
387: for (n = 0; n< sz-1; n+=2) {
388: i1 = idx[0]; /* The instructions are ordered to */
389: i2 = idx[1]; /* make the compiler's job easy */
390: idx += 2;
391: tmp0 = x[i1];
392: tmp1 = x[i2];
393: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
394: }
396: if (n == sz-1) { /* Take care of the last nonzero */
397: tmp0 = x[*idx++];
398: sum1 += *v1++ *tmp0;
399: }
400: y[row++]=sum1;
401: break;
402: case 2:
403: sum1 = 0.;
404: sum2 = 0.;
405: v2 = v1 + n;
407: for (n = 0; n< sz-1; n+=2) {
408: i1 = idx[0];
409: i2 = idx[1];
410: idx += 2;
411: tmp0 = x[i1];
412: tmp1 = x[i2];
413: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
414: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
415: }
416: if (n == sz-1) {
417: tmp0 = x[*idx++];
418: sum1 += *v1++ * tmp0;
419: sum2 += *v2++ * tmp0;
420: }
421: y[row++]=sum1;
422: y[row++]=sum2;
423: v1 =v2; /* Since the next block to be processed starts there*/
424: idx +=sz;
425: break;
426: case 3:
427: sum1 = 0.;
428: sum2 = 0.;
429: sum3 = 0.;
430: v2 = v1 + n;
431: v3 = v2 + n;
433: for (n = 0; n< sz-1; n+=2) {
434: i1 = idx[0];
435: i2 = idx[1];
436: idx += 2;
437: tmp0 = x[i1];
438: tmp1 = x[i2];
439: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
440: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
441: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
442: }
443: if (n == sz-1) {
444: tmp0 = x[*idx++];
445: sum1 += *v1++ * tmp0;
446: sum2 += *v2++ * tmp0;
447: sum3 += *v3++ * tmp0;
448: }
449: y[row++]=sum1;
450: y[row++]=sum2;
451: y[row++]=sum3;
452: v1 =v3; /* Since the next block to be processed starts there*/
453: idx +=2*sz;
454: break;
455: case 4:
456: sum1 = 0.;
457: sum2 = 0.;
458: sum3 = 0.;
459: sum4 = 0.;
460: v2 = v1 + n;
461: v3 = v2 + n;
462: v4 = v3 + n;
464: for (n = 0; n< sz-1; n+=2) {
465: i1 = idx[0];
466: i2 = idx[1];
467: idx += 2;
468: tmp0 = x[i1];
469: tmp1 = x[i2];
470: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
471: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
472: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
473: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
474: }
475: if (n == sz-1) {
476: tmp0 = x[*idx++];
477: sum1 += *v1++ * tmp0;
478: sum2 += *v2++ * tmp0;
479: sum3 += *v3++ * tmp0;
480: sum4 += *v4++ * tmp0;
481: }
482: y[row++]=sum1;
483: y[row++]=sum2;
484: y[row++]=sum3;
485: y[row++]=sum4;
486: v1 =v4; /* Since the next block to be processed starts there*/
487: idx +=3*sz;
488: break;
489: case 5:
490: sum1 = 0.;
491: sum2 = 0.;
492: sum3 = 0.;
493: sum4 = 0.;
494: sum5 = 0.;
495: v2 = v1 + n;
496: v3 = v2 + n;
497: v4 = v3 + n;
498: v5 = v4 + n;
500: for (n = 0; n<sz-1; n+=2) {
501: i1 = idx[0];
502: i2 = idx[1];
503: idx += 2;
504: tmp0 = x[i1];
505: tmp1 = x[i2];
506: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
507: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
508: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
509: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
510: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
511: }
512: if (n == sz-1) {
513: tmp0 = x[*idx++];
514: sum1 += *v1++ * tmp0;
515: sum2 += *v2++ * tmp0;
516: sum3 += *v3++ * tmp0;
517: sum4 += *v4++ * tmp0;
518: sum5 += *v5++ * tmp0;
519: }
520: y[row++]=sum1;
521: y[row++]=sum2;
522: y[row++]=sum3;
523: y[row++]=sum4;
524: y[row++]=sum5;
525: v1 =v5; /* Since the next block to be processed starts there */
526: idx +=4*sz;
527: break;
528: default:
529: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
530: }
531: }
532: VecRestoreArrayRead(xx,&x);
533: VecRestoreArray(yy,&y);
534: PetscLogFlops(2.0*a->nz - nonzerorow);
535: return 0;
536: }
537: /* ----------------------------------------------------------- */
538: /* Almost same code as the MatMult_SeqAIJ_Inode() */
539: PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
540: {
541: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
542: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
543: const MatScalar *v1,*v2,*v3,*v4,*v5;
544: const PetscScalar *x;
545: PetscScalar *y,*z,*zt;
546: PetscInt i1,i2,n,i,row,node_max,nsz,sz;
547: const PetscInt *idx,*ns,*ii;
550: node_max = a->inode.node_count;
551: ns = a->inode.size; /* Node Size array */
553: VecGetArrayRead(xx,&x);
554: VecGetArrayPair(zz,yy,&z,&y);
555: zt = z;
557: idx = a->j;
558: v1 = a->a;
559: ii = a->i;
561: for (i = 0,row = 0; i< node_max; ++i) {
562: nsz = ns[i];
563: n = ii[1] - ii[0];
564: ii += nsz;
565: sz = n; /* No of non zeros in this row */
566: /* Switch on the size of Node */
567: switch (nsz) { /* Each loop in 'case' is unrolled */
568: case 1:
569: sum1 = *zt++;
571: for (n = 0; n< sz-1; n+=2) {
572: i1 = idx[0]; /* The instructions are ordered to */
573: i2 = idx[1]; /* make the compiler's job easy */
574: idx += 2;
575: tmp0 = x[i1];
576: tmp1 = x[i2];
577: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
578: }
580: if (n == sz-1) { /* Take care of the last nonzero */
581: tmp0 = x[*idx++];
582: sum1 += *v1++ * tmp0;
583: }
584: y[row++]=sum1;
585: break;
586: case 2:
587: sum1 = *zt++;
588: sum2 = *zt++;
589: v2 = v1 + n;
591: for (n = 0; n< sz-1; n+=2) {
592: i1 = idx[0];
593: i2 = idx[1];
594: idx += 2;
595: tmp0 = x[i1];
596: tmp1 = x[i2];
597: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
598: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
599: }
600: if (n == sz-1) {
601: tmp0 = x[*idx++];
602: sum1 += *v1++ * tmp0;
603: sum2 += *v2++ * tmp0;
604: }
605: y[row++]=sum1;
606: y[row++]=sum2;
607: v1 =v2; /* Since the next block to be processed starts there*/
608: idx +=sz;
609: break;
610: case 3:
611: sum1 = *zt++;
612: sum2 = *zt++;
613: sum3 = *zt++;
614: v2 = v1 + n;
615: v3 = v2 + n;
617: for (n = 0; n< sz-1; n+=2) {
618: i1 = idx[0];
619: i2 = idx[1];
620: idx += 2;
621: tmp0 = x[i1];
622: tmp1 = x[i2];
623: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
624: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
625: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
626: }
627: if (n == sz-1) {
628: tmp0 = x[*idx++];
629: sum1 += *v1++ * tmp0;
630: sum2 += *v2++ * tmp0;
631: sum3 += *v3++ * tmp0;
632: }
633: y[row++]=sum1;
634: y[row++]=sum2;
635: y[row++]=sum3;
636: v1 =v3; /* Since the next block to be processed starts there*/
637: idx +=2*sz;
638: break;
639: case 4:
640: sum1 = *zt++;
641: sum2 = *zt++;
642: sum3 = *zt++;
643: sum4 = *zt++;
644: v2 = v1 + n;
645: v3 = v2 + n;
646: v4 = v3 + n;
648: for (n = 0; n< sz-1; n+=2) {
649: i1 = idx[0];
650: i2 = idx[1];
651: idx += 2;
652: tmp0 = x[i1];
653: tmp1 = x[i2];
654: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
655: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
656: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
657: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
658: }
659: if (n == sz-1) {
660: tmp0 = x[*idx++];
661: sum1 += *v1++ * tmp0;
662: sum2 += *v2++ * tmp0;
663: sum3 += *v3++ * tmp0;
664: sum4 += *v4++ * tmp0;
665: }
666: y[row++]=sum1;
667: y[row++]=sum2;
668: y[row++]=sum3;
669: y[row++]=sum4;
670: v1 =v4; /* Since the next block to be processed starts there*/
671: idx +=3*sz;
672: break;
673: case 5:
674: sum1 = *zt++;
675: sum2 = *zt++;
676: sum3 = *zt++;
677: sum4 = *zt++;
678: sum5 = *zt++;
679: v2 = v1 + n;
680: v3 = v2 + n;
681: v4 = v3 + n;
682: v5 = v4 + n;
684: for (n = 0; n<sz-1; n+=2) {
685: i1 = idx[0];
686: i2 = idx[1];
687: idx += 2;
688: tmp0 = x[i1];
689: tmp1 = x[i2];
690: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
691: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
692: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
693: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
694: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
695: }
696: if (n == sz-1) {
697: tmp0 = x[*idx++];
698: sum1 += *v1++ * tmp0;
699: sum2 += *v2++ * tmp0;
700: sum3 += *v3++ * tmp0;
701: sum4 += *v4++ * tmp0;
702: sum5 += *v5++ * tmp0;
703: }
704: y[row++]=sum1;
705: y[row++]=sum2;
706: y[row++]=sum3;
707: y[row++]=sum4;
708: y[row++]=sum5;
709: v1 =v5; /* Since the next block to be processed starts there */
710: idx +=4*sz;
711: break;
712: default:
713: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
714: }
715: }
716: VecRestoreArrayRead(xx,&x);
717: VecRestoreArrayPair(zz,yy,&z,&y);
718: PetscLogFlops(2.0*a->nz);
719: return 0;
720: }
722: /* ----------------------------------------------------------- */
723: PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
724: {
725: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
726: IS iscol = a->col,isrow = a->row;
727: const PetscInt *r,*c,*rout,*cout;
728: PetscInt i,j,n = A->rmap->n,nz;
729: PetscInt node_max,*ns,row,nsz,aii,i0,i1;
730: const PetscInt *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
731: PetscScalar *x,*tmp,*tmps,tmp0,tmp1;
732: PetscScalar sum1,sum2,sum3,sum4,sum5;
733: const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
734: const PetscScalar *b;
737: node_max = a->inode.node_count;
738: ns = a->inode.size; /* Node Size array */
740: VecGetArrayRead(bb,&b);
741: VecGetArrayWrite(xx,&x);
742: tmp = a->solve_work;
744: ISGetIndices(isrow,&rout); r = rout;
745: ISGetIndices(iscol,&cout)); c = cout + (n-1;
747: /* forward solve the lower triangular */
748: tmps = tmp;
749: aa = a_a;
750: aj = a_j;
751: ad = a->diag;
753: for (i = 0,row = 0; i< node_max; ++i) {
754: nsz = ns[i];
755: aii = ai[row];
756: v1 = aa + aii;
757: vi = aj + aii;
758: nz = ad[row]- aii;
759: if (i < node_max-1) {
760: /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
761: * but our indexing to determine it's size could. */
762: PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
763: /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
764: PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
765: /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
766: }
768: switch (nsz) { /* Each loop in 'case' is unrolled */
769: case 1:
770: sum1 = b[*r++];
771: for (j=0; j<nz-1; j+=2) {
772: i0 = vi[0];
773: i1 = vi[1];
774: vi +=2;
775: tmp0 = tmps[i0];
776: tmp1 = tmps[i1];
777: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
778: }
779: if (j == nz-1) {
780: tmp0 = tmps[*vi++];
781: sum1 -= *v1++ *tmp0;
782: }
783: tmp[row++]=sum1;
784: break;
785: case 2:
786: sum1 = b[*r++];
787: sum2 = b[*r++];
788: v2 = aa + ai[row+1];
790: for (j=0; j<nz-1; j+=2) {
791: i0 = vi[0];
792: i1 = vi[1];
793: vi +=2;
794: tmp0 = tmps[i0];
795: tmp1 = tmps[i1];
796: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
797: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
798: }
799: if (j == nz-1) {
800: tmp0 = tmps[*vi++];
801: sum1 -= *v1++ *tmp0;
802: sum2 -= *v2++ *tmp0;
803: }
804: sum2 -= *v2++ *sum1;
805: tmp[row++]=sum1;
806: tmp[row++]=sum2;
807: break;
808: case 3:
809: sum1 = b[*r++];
810: sum2 = b[*r++];
811: sum3 = b[*r++];
812: v2 = aa + ai[row+1];
813: v3 = aa + ai[row+2];
815: for (j=0; j<nz-1; j+=2) {
816: i0 = vi[0];
817: i1 = vi[1];
818: vi +=2;
819: tmp0 = tmps[i0];
820: tmp1 = tmps[i1];
821: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
822: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
823: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
824: }
825: if (j == nz-1) {
826: tmp0 = tmps[*vi++];
827: sum1 -= *v1++ *tmp0;
828: sum2 -= *v2++ *tmp0;
829: sum3 -= *v3++ *tmp0;
830: }
831: sum2 -= *v2++ * sum1;
832: sum3 -= *v3++ * sum1;
833: sum3 -= *v3++ * sum2;
835: tmp[row++]=sum1;
836: tmp[row++]=sum2;
837: tmp[row++]=sum3;
838: break;
840: case 4:
841: sum1 = b[*r++];
842: sum2 = b[*r++];
843: sum3 = b[*r++];
844: sum4 = b[*r++];
845: v2 = aa + ai[row+1];
846: v3 = aa + ai[row+2];
847: v4 = aa + ai[row+3];
849: for (j=0; j<nz-1; j+=2) {
850: i0 = vi[0];
851: i1 = vi[1];
852: vi +=2;
853: tmp0 = tmps[i0];
854: tmp1 = tmps[i1];
855: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
856: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
857: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
858: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
859: }
860: if (j == nz-1) {
861: tmp0 = tmps[*vi++];
862: sum1 -= *v1++ *tmp0;
863: sum2 -= *v2++ *tmp0;
864: sum3 -= *v3++ *tmp0;
865: sum4 -= *v4++ *tmp0;
866: }
867: sum2 -= *v2++ * sum1;
868: sum3 -= *v3++ * sum1;
869: sum4 -= *v4++ * sum1;
870: sum3 -= *v3++ * sum2;
871: sum4 -= *v4++ * sum2;
872: sum4 -= *v4++ * sum3;
874: tmp[row++]=sum1;
875: tmp[row++]=sum2;
876: tmp[row++]=sum3;
877: tmp[row++]=sum4;
878: break;
879: case 5:
880: sum1 = b[*r++];
881: sum2 = b[*r++];
882: sum3 = b[*r++];
883: sum4 = b[*r++];
884: sum5 = b[*r++];
885: v2 = aa + ai[row+1];
886: v3 = aa + ai[row+2];
887: v4 = aa + ai[row+3];
888: v5 = aa + ai[row+4];
890: for (j=0; j<nz-1; j+=2) {
891: i0 = vi[0];
892: i1 = vi[1];
893: vi +=2;
894: tmp0 = tmps[i0];
895: tmp1 = tmps[i1];
896: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
897: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
898: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
899: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
900: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
901: }
902: if (j == nz-1) {
903: tmp0 = tmps[*vi++];
904: sum1 -= *v1++ *tmp0;
905: sum2 -= *v2++ *tmp0;
906: sum3 -= *v3++ *tmp0;
907: sum4 -= *v4++ *tmp0;
908: sum5 -= *v5++ *tmp0;
909: }
911: sum2 -= *v2++ * sum1;
912: sum3 -= *v3++ * sum1;
913: sum4 -= *v4++ * sum1;
914: sum5 -= *v5++ * sum1;
915: sum3 -= *v3++ * sum2;
916: sum4 -= *v4++ * sum2;
917: sum5 -= *v5++ * sum2;
918: sum4 -= *v4++ * sum3;
919: sum5 -= *v5++ * sum3;
920: sum5 -= *v5++ * sum4;
922: tmp[row++]=sum1;
923: tmp[row++]=sum2;
924: tmp[row++]=sum3;
925: tmp[row++]=sum4;
926: tmp[row++]=sum5;
927: break;
928: default:
929: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported ");
930: }
931: }
932: /* backward solve the upper triangular */
933: for (i=node_max -1,row = n-1; i>=0; i--) {
934: nsz = ns[i];
935: aii = ai[row+1] -1;
936: v1 = aa + aii;
937: vi = aj + aii;
938: nz = aii- ad[row];
939: switch (nsz) { /* Each loop in 'case' is unrolled */
940: case 1:
941: sum1 = tmp[row];
943: for (j=nz; j>1; j-=2) {
944: vi -=2;
945: i0 = vi[2];
946: i1 = vi[1];
947: tmp0 = tmps[i0];
948: tmp1 = tmps[i1];
949: v1 -= 2;
950: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
951: }
952: if (j==1) {
953: tmp0 = tmps[*vi--];
954: sum1 -= *v1-- * tmp0;
955: }
956: x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
957: break;
958: case 2:
959: sum1 = tmp[row];
960: sum2 = tmp[row -1];
961: v2 = aa + ai[row]-1;
962: for (j=nz; j>1; j-=2) {
963: vi -=2;
964: i0 = vi[2];
965: i1 = vi[1];
966: tmp0 = tmps[i0];
967: tmp1 = tmps[i1];
968: v1 -= 2;
969: v2 -= 2;
970: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
971: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
972: }
973: if (j==1) {
974: tmp0 = tmps[*vi--];
975: sum1 -= *v1-- * tmp0;
976: sum2 -= *v2-- * tmp0;
977: }
979: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
980: sum2 -= *v2-- * tmp0;
981: x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
982: break;
983: case 3:
984: sum1 = tmp[row];
985: sum2 = tmp[row -1];
986: sum3 = tmp[row -2];
987: v2 = aa + ai[row]-1;
988: v3 = aa + ai[row -1]-1;
989: for (j=nz; j>1; j-=2) {
990: vi -=2;
991: i0 = vi[2];
992: i1 = vi[1];
993: tmp0 = tmps[i0];
994: tmp1 = tmps[i1];
995: v1 -= 2;
996: v2 -= 2;
997: v3 -= 2;
998: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
999: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1000: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1001: }
1002: if (j==1) {
1003: tmp0 = tmps[*vi--];
1004: sum1 -= *v1-- * tmp0;
1005: sum2 -= *v2-- * tmp0;
1006: sum3 -= *v3-- * tmp0;
1007: }
1008: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1009: sum2 -= *v2-- * tmp0;
1010: sum3 -= *v3-- * tmp0;
1011: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1012: sum3 -= *v3-- * tmp0;
1013: x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1015: break;
1016: case 4:
1017: sum1 = tmp[row];
1018: sum2 = tmp[row -1];
1019: sum3 = tmp[row -2];
1020: sum4 = tmp[row -3];
1021: v2 = aa + ai[row]-1;
1022: v3 = aa + ai[row -1]-1;
1023: v4 = aa + ai[row -2]-1;
1025: for (j=nz; j>1; j-=2) {
1026: vi -=2;
1027: i0 = vi[2];
1028: i1 = vi[1];
1029: tmp0 = tmps[i0];
1030: tmp1 = tmps[i1];
1031: v1 -= 2;
1032: v2 -= 2;
1033: v3 -= 2;
1034: v4 -= 2;
1035: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1036: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1037: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1038: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1039: }
1040: if (j==1) {
1041: tmp0 = tmps[*vi--];
1042: sum1 -= *v1-- * tmp0;
1043: sum2 -= *v2-- * tmp0;
1044: sum3 -= *v3-- * tmp0;
1045: sum4 -= *v4-- * tmp0;
1046: }
1048: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1049: sum2 -= *v2-- * tmp0;
1050: sum3 -= *v3-- * tmp0;
1051: sum4 -= *v4-- * tmp0;
1052: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1053: sum3 -= *v3-- * tmp0;
1054: sum4 -= *v4-- * tmp0;
1055: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1056: sum4 -= *v4-- * tmp0;
1057: x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1058: break;
1059: case 5:
1060: sum1 = tmp[row];
1061: sum2 = tmp[row -1];
1062: sum3 = tmp[row -2];
1063: sum4 = tmp[row -3];
1064: sum5 = tmp[row -4];
1065: v2 = aa + ai[row]-1;
1066: v3 = aa + ai[row -1]-1;
1067: v4 = aa + ai[row -2]-1;
1068: v5 = aa + ai[row -3]-1;
1069: for (j=nz; j>1; j-=2) {
1070: vi -= 2;
1071: i0 = vi[2];
1072: i1 = vi[1];
1073: tmp0 = tmps[i0];
1074: tmp1 = tmps[i1];
1075: v1 -= 2;
1076: v2 -= 2;
1077: v3 -= 2;
1078: v4 -= 2;
1079: v5 -= 2;
1080: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1081: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1082: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1083: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1084: sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1085: }
1086: if (j==1) {
1087: tmp0 = tmps[*vi--];
1088: sum1 -= *v1-- * tmp0;
1089: sum2 -= *v2-- * tmp0;
1090: sum3 -= *v3-- * tmp0;
1091: sum4 -= *v4-- * tmp0;
1092: sum5 -= *v5-- * tmp0;
1093: }
1095: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1096: sum2 -= *v2-- * tmp0;
1097: sum3 -= *v3-- * tmp0;
1098: sum4 -= *v4-- * tmp0;
1099: sum5 -= *v5-- * tmp0;
1100: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1101: sum3 -= *v3-- * tmp0;
1102: sum4 -= *v4-- * tmp0;
1103: sum5 -= *v5-- * tmp0;
1104: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1105: sum4 -= *v4-- * tmp0;
1106: sum5 -= *v5-- * tmp0;
1107: tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1108: sum5 -= *v5-- * tmp0;
1109: x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1110: break;
1111: default:
1112: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported ");
1113: }
1114: }
1115: ISRestoreIndices(isrow,&rout);
1116: ISRestoreIndices(iscol,&cout);
1117: VecRestoreArrayRead(bb,&b);
1118: VecRestoreArrayWrite(xx,&x);
1119: PetscLogFlops(2.0*a->nz - A->cmap->n);
1120: return 0;
1121: }
1123: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1124: {
1125: Mat C =B;
1126: Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1127: IS isrow = b->row,isicol = b->icol;
1128: const PetscInt *r,*ic,*ics;
1129: const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1130: PetscInt i,j,k,nz,nzL,row,*pj;
1131: const PetscInt *ajtmp,*bjtmp;
1132: MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1133: const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1134: FactorShiftCtx sctx;
1135: const PetscInt *ddiag;
1136: PetscReal rs;
1137: MatScalar d;
1138: PetscInt inod,nodesz,node_max,col;
1139: const PetscInt *ns;
1140: PetscInt *tmp_vec1,*tmp_vec2,*nsmap;
1142: /* MatPivotSetUp(): initialize shift context sctx */
1143: PetscMemzero(&sctx,sizeof(FactorShiftCtx));
1145: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1146: ddiag = a->diag;
1147: sctx.shift_top = info->zeropivot;
1148: for (i=0; i<n; i++) {
1149: /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1150: d = (aa)[ddiag[i]];
1151: rs = -PetscAbsScalar(d) - PetscRealPart(d);
1152: v = aa+ai[i];
1153: nz = ai[i+1] - ai[i];
1154: for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1155: if (rs>sctx.shift_top) sctx.shift_top = rs;
1156: }
1157: sctx.shift_top *= 1.1;
1158: sctx.nshift_max = 5;
1159: sctx.shift_lo = 0.;
1160: sctx.shift_hi = 1.;
1161: }
1163: ISGetIndices(isrow,&r);
1164: ISGetIndices(isicol,&ic);
1166: PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);
1167: ics = ic;
1169: node_max = a->inode.node_count;
1170: ns = a->inode.size;
1173: /* If max inode size > 4, split it into two inodes.*/
1174: /* also map the inode sizes according to the ordering */
1175: PetscMalloc1(n+1,&tmp_vec1);
1176: for (i=0,j=0; i<node_max; ++i,++j) {
1177: if (ns[i] > 4) {
1178: tmp_vec1[j] = 4;
1179: ++j;
1180: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1181: } else {
1182: tmp_vec1[j] = ns[i];
1183: }
1184: }
1185: /* Use the correct node_max */
1186: node_max = j;
1188: /* Now reorder the inode info based on mat re-ordering info */
1189: /* First create a row -> inode_size_array_index map */
1190: PetscMalloc1(n+1,&nsmap);
1191: PetscMalloc1(node_max+1,&tmp_vec2);
1192: for (i=0,row=0; i<node_max; i++) {
1193: nodesz = tmp_vec1[i];
1194: for (j=0; j<nodesz; j++,row++) {
1195: nsmap[row] = i;
1196: }
1197: }
1198: /* Using nsmap, create a reordered ns structure */
1199: for (i=0,j=0; i< node_max; i++) {
1200: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1201: tmp_vec2[i] = nodesz;
1202: j += nodesz;
1203: }
1204: PetscFree(nsmap);
1205: PetscFree(tmp_vec1);
1207: /* Now use the correct ns */
1208: ns = tmp_vec2;
1210: do {
1211: sctx.newshift = PETSC_FALSE;
1212: /* Now loop over each block-row, and do the factorization */
1213: for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1214: nodesz = ns[inod];
1216: switch (nodesz) {
1217: case 1:
1218: /*----------*/
1219: /* zero rtmp1 */
1220: /* L part */
1221: nz = bi[i+1] - bi[i];
1222: bjtmp = bj + bi[i];
1223: for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1225: /* U part */
1226: nz = bdiag[i]-bdiag[i+1];
1227: bjtmp = bj + bdiag[i+1]+1;
1228: for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1230: /* load in initial (unfactored row) */
1231: nz = ai[r[i]+1] - ai[r[i]];
1232: ajtmp = aj + ai[r[i]];
1233: v = aa + ai[r[i]];
1234: for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1236: /* ZeropivotApply() */
1237: rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1239: /* elimination */
1240: bjtmp = bj + bi[i];
1241: row = *bjtmp++;
1242: nzL = bi[i+1] - bi[i];
1243: for (k=0; k < nzL; k++) {
1244: pc = rtmp1 + row;
1245: if (*pc != 0.0) {
1246: pv = b->a + bdiag[row];
1247: mul1 = *pc * (*pv);
1248: *pc = mul1;
1249: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1250: pv = b->a + bdiag[row+1]+1;
1251: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1252: for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1253: PetscLogFlops(1+2.0*nz);
1254: }
1255: row = *bjtmp++;
1256: }
1258: /* finished row so stick it into b->a */
1259: rs = 0.0;
1260: /* L part */
1261: pv = b->a + bi[i];
1262: pj = b->j + bi[i];
1263: nz = bi[i+1] - bi[i];
1264: for (j=0; j<nz; j++) {
1265: pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1266: }
1268: /* U part */
1269: pv = b->a + bdiag[i+1]+1;
1270: pj = b->j + bdiag[i+1]+1;
1271: nz = bdiag[i] - bdiag[i+1]-1;
1272: for (j=0; j<nz; j++) {
1273: pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1274: }
1276: /* Check zero pivot */
1277: sctx.rs = rs;
1278: sctx.pv = rtmp1[i];
1279: MatPivotCheck(B,A,info,&sctx,i);
1280: if (sctx.newshift) break;
1282: /* Mark diagonal and invert diagonal for simpler triangular solves */
1283: pv = b->a + bdiag[i];
1284: *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1285: break;
1287: case 2:
1288: /*----------*/
1289: /* zero rtmp1 and rtmp2 */
1290: /* L part */
1291: nz = bi[i+1] - bi[i];
1292: bjtmp = bj + bi[i];
1293: for (j=0; j<nz; j++) {
1294: col = bjtmp[j];
1295: rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1296: }
1298: /* U part */
1299: nz = bdiag[i]-bdiag[i+1];
1300: bjtmp = bj + bdiag[i+1]+1;
1301: for (j=0; j<nz; j++) {
1302: col = bjtmp[j];
1303: rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1304: }
1306: /* load in initial (unfactored row) */
1307: nz = ai[r[i]+1] - ai[r[i]];
1308: ajtmp = aj + ai[r[i]];
1309: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1310: for (j=0; j<nz; j++) {
1311: col = ics[ajtmp[j]];
1312: rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1313: }
1314: /* ZeropivotApply(): shift the diagonal of the matrix */
1315: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1317: /* elimination */
1318: bjtmp = bj + bi[i];
1319: row = *bjtmp++; /* pivot row */
1320: nzL = bi[i+1] - bi[i];
1321: for (k=0; k < nzL; k++) {
1322: pc1 = rtmp1 + row;
1323: pc2 = rtmp2 + row;
1324: if (*pc1 != 0.0 || *pc2 != 0.0) {
1325: pv = b->a + bdiag[row];
1326: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1327: *pc1 = mul1; *pc2 = mul2;
1329: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1330: pv = b->a + bdiag[row+1]+1;
1331: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1332: for (j=0; j<nz; j++) {
1333: col = pj[j];
1334: rtmp1[col] -= mul1 * pv[j];
1335: rtmp2[col] -= mul2 * pv[j];
1336: }
1337: PetscLogFlops(2+4.0*nz);
1338: }
1339: row = *bjtmp++;
1340: }
1342: /* finished row i; check zero pivot, then stick row i into b->a */
1343: rs = 0.0;
1344: /* L part */
1345: pc1 = b->a + bi[i];
1346: pj = b->j + bi[i];
1347: nz = bi[i+1] - bi[i];
1348: for (j=0; j<nz; j++) {
1349: col = pj[j];
1350: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1351: }
1352: /* U part */
1353: pc1 = b->a + bdiag[i+1]+1;
1354: pj = b->j + bdiag[i+1]+1;
1355: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1356: for (j=0; j<nz; j++) {
1357: col = pj[j];
1358: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1359: }
1361: sctx.rs = rs;
1362: sctx.pv = rtmp1[i];
1363: MatPivotCheck(B,A,info,&sctx,i);
1364: if (sctx.newshift) break;
1365: pc1 = b->a + bdiag[i]; /* Mark diagonal */
1366: *pc1 = 1.0/sctx.pv;
1368: /* Now take care of diagonal 2x2 block. */
1369: pc2 = rtmp2 + i;
1370: if (*pc2 != 0.0) {
1371: mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1372: *pc2 = mul1; /* insert L entry */
1373: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1374: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1375: for (j=0; j<nz; j++) {
1376: col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1377: }
1378: PetscLogFlops(1+2.0*nz);
1379: }
1381: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1382: rs = 0.0;
1383: /* L part */
1384: pc2 = b->a + bi[i+1];
1385: pj = b->j + bi[i+1];
1386: nz = bi[i+2] - bi[i+1];
1387: for (j=0; j<nz; j++) {
1388: col = pj[j];
1389: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1390: }
1391: /* U part */
1392: pc2 = b->a + bdiag[i+2]+1;
1393: pj = b->j + bdiag[i+2]+1;
1394: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1395: for (j=0; j<nz; j++) {
1396: col = pj[j];
1397: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1398: }
1400: sctx.rs = rs;
1401: sctx.pv = rtmp2[i+1];
1402: MatPivotCheck(B,A,info,&sctx,i+1);
1403: if (sctx.newshift) break;
1404: pc2 = b->a + bdiag[i+1];
1405: *pc2 = 1.0/sctx.pv;
1406: break;
1408: case 3:
1409: /*----------*/
1410: /* zero rtmp */
1411: /* L part */
1412: nz = bi[i+1] - bi[i];
1413: bjtmp = bj + bi[i];
1414: for (j=0; j<nz; j++) {
1415: col = bjtmp[j];
1416: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1417: }
1419: /* U part */
1420: nz = bdiag[i]-bdiag[i+1];
1421: bjtmp = bj + bdiag[i+1]+1;
1422: for (j=0; j<nz; j++) {
1423: col = bjtmp[j];
1424: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1425: }
1427: /* load in initial (unfactored row) */
1428: nz = ai[r[i]+1] - ai[r[i]];
1429: ajtmp = aj + ai[r[i]];
1430: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1431: for (j=0; j<nz; j++) {
1432: col = ics[ajtmp[j]];
1433: rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1434: }
1435: /* ZeropivotApply(): shift the diagonal of the matrix */
1436: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1438: /* elimination */
1439: bjtmp = bj + bi[i];
1440: row = *bjtmp++; /* pivot row */
1441: nzL = bi[i+1] - bi[i];
1442: for (k=0; k < nzL; k++) {
1443: pc1 = rtmp1 + row;
1444: pc2 = rtmp2 + row;
1445: pc3 = rtmp3 + row;
1446: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1447: pv = b->a + bdiag[row];
1448: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1449: *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1451: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1452: pv = b->a + bdiag[row+1]+1;
1453: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1454: for (j=0; j<nz; j++) {
1455: col = pj[j];
1456: rtmp1[col] -= mul1 * pv[j];
1457: rtmp2[col] -= mul2 * pv[j];
1458: rtmp3[col] -= mul3 * pv[j];
1459: }
1460: PetscLogFlops(3+6.0*nz);
1461: }
1462: row = *bjtmp++;
1463: }
1465: /* finished row i; check zero pivot, then stick row i into b->a */
1466: rs = 0.0;
1467: /* L part */
1468: pc1 = b->a + bi[i];
1469: pj = b->j + bi[i];
1470: nz = bi[i+1] - bi[i];
1471: for (j=0; j<nz; j++) {
1472: col = pj[j];
1473: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1474: }
1475: /* U part */
1476: pc1 = b->a + bdiag[i+1]+1;
1477: pj = b->j + bdiag[i+1]+1;
1478: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1479: for (j=0; j<nz; j++) {
1480: col = pj[j];
1481: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1482: }
1484: sctx.rs = rs;
1485: sctx.pv = rtmp1[i];
1486: MatPivotCheck(B,A,info,&sctx,i);
1487: if (sctx.newshift) break;
1488: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1489: *pc1 = 1.0/sctx.pv;
1491: /* Now take care of 1st column of diagonal 3x3 block. */
1492: pc2 = rtmp2 + i;
1493: pc3 = rtmp3 + i;
1494: if (*pc2 != 0.0 || *pc3 != 0.0) {
1495: mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1496: mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1497: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1498: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1499: for (j=0; j<nz; j++) {
1500: col = pj[j];
1501: rtmp2[col] -= mul2 * rtmp1[col];
1502: rtmp3[col] -= mul3 * rtmp1[col];
1503: }
1504: PetscLogFlops(2+4.0*nz);
1505: }
1507: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1508: rs = 0.0;
1509: /* L part */
1510: pc2 = b->a + bi[i+1];
1511: pj = b->j + bi[i+1];
1512: nz = bi[i+2] - bi[i+1];
1513: for (j=0; j<nz; j++) {
1514: col = pj[j];
1515: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1516: }
1517: /* U part */
1518: pc2 = b->a + bdiag[i+2]+1;
1519: pj = b->j + bdiag[i+2]+1;
1520: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1521: for (j=0; j<nz; j++) {
1522: col = pj[j];
1523: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1524: }
1526: sctx.rs = rs;
1527: sctx.pv = rtmp2[i+1];
1528: MatPivotCheck(B,A,info,&sctx,i+1);
1529: if (sctx.newshift) break;
1530: pc2 = b->a + bdiag[i+1];
1531: *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1533: /* Now take care of 2nd column of diagonal 3x3 block. */
1534: pc3 = rtmp3 + i+1;
1535: if (*pc3 != 0.0) {
1536: mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1537: pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */
1538: nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1539: for (j=0; j<nz; j++) {
1540: col = pj[j];
1541: rtmp3[col] -= mul3 * rtmp2[col];
1542: }
1543: PetscLogFlops(1+2.0*nz);
1544: }
1546: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1547: rs = 0.0;
1548: /* L part */
1549: pc3 = b->a + bi[i+2];
1550: pj = b->j + bi[i+2];
1551: nz = bi[i+3] - bi[i+2];
1552: for (j=0; j<nz; j++) {
1553: col = pj[j];
1554: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1555: }
1556: /* U part */
1557: pc3 = b->a + bdiag[i+3]+1;
1558: pj = b->j + bdiag[i+3]+1;
1559: nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1560: for (j=0; j<nz; j++) {
1561: col = pj[j];
1562: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1563: }
1565: sctx.rs = rs;
1566: sctx.pv = rtmp3[i+2];
1567: MatPivotCheck(B,A,info,&sctx,i+2);
1568: if (sctx.newshift) break;
1569: pc3 = b->a + bdiag[i+2];
1570: *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1571: break;
1572: case 4:
1573: /*----------*/
1574: /* zero rtmp */
1575: /* L part */
1576: nz = bi[i+1] - bi[i];
1577: bjtmp = bj + bi[i];
1578: for (j=0; j<nz; j++) {
1579: col = bjtmp[j];
1580: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1581: }
1583: /* U part */
1584: nz = bdiag[i]-bdiag[i+1];
1585: bjtmp = bj + bdiag[i+1]+1;
1586: for (j=0; j<nz; j++) {
1587: col = bjtmp[j];
1588: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1589: }
1591: /* load in initial (unfactored row) */
1592: nz = ai[r[i]+1] - ai[r[i]];
1593: ajtmp = aj + ai[r[i]];
1594: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1595: for (j=0; j<nz; j++) {
1596: col = ics[ajtmp[j]];
1597: rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1598: }
1599: /* ZeropivotApply(): shift the diagonal of the matrix */
1600: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1602: /* elimination */
1603: bjtmp = bj + bi[i];
1604: row = *bjtmp++; /* pivot row */
1605: nzL = bi[i+1] - bi[i];
1606: for (k=0; k < nzL; k++) {
1607: pc1 = rtmp1 + row;
1608: pc2 = rtmp2 + row;
1609: pc3 = rtmp3 + row;
1610: pc4 = rtmp4 + row;
1611: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1612: pv = b->a + bdiag[row];
1613: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1614: *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1616: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1617: pv = b->a + bdiag[row+1]+1;
1618: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1619: for (j=0; j<nz; j++) {
1620: col = pj[j];
1621: rtmp1[col] -= mul1 * pv[j];
1622: rtmp2[col] -= mul2 * pv[j];
1623: rtmp3[col] -= mul3 * pv[j];
1624: rtmp4[col] -= mul4 * pv[j];
1625: }
1626: PetscLogFlops(4+8.0*nz);
1627: }
1628: row = *bjtmp++;
1629: }
1631: /* finished row i; check zero pivot, then stick row i into b->a */
1632: rs = 0.0;
1633: /* L part */
1634: pc1 = b->a + bi[i];
1635: pj = b->j + bi[i];
1636: nz = bi[i+1] - bi[i];
1637: for (j=0; j<nz; j++) {
1638: col = pj[j];
1639: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1640: }
1641: /* U part */
1642: pc1 = b->a + bdiag[i+1]+1;
1643: pj = b->j + bdiag[i+1]+1;
1644: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1645: for (j=0; j<nz; j++) {
1646: col = pj[j];
1647: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1648: }
1650: sctx.rs = rs;
1651: sctx.pv = rtmp1[i];
1652: MatPivotCheck(B,A,info,&sctx,i);
1653: if (sctx.newshift) break;
1654: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1655: *pc1 = 1.0/sctx.pv;
1657: /* Now take care of 1st column of diagonal 4x4 block. */
1658: pc2 = rtmp2 + i;
1659: pc3 = rtmp3 + i;
1660: pc4 = rtmp4 + i;
1661: if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1662: mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1663: mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1664: mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1665: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1666: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1667: for (j=0; j<nz; j++) {
1668: col = pj[j];
1669: rtmp2[col] -= mul2 * rtmp1[col];
1670: rtmp3[col] -= mul3 * rtmp1[col];
1671: rtmp4[col] -= mul4 * rtmp1[col];
1672: }
1673: PetscLogFlops(3+6.0*nz);
1674: }
1676: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1677: rs = 0.0;
1678: /* L part */
1679: pc2 = b->a + bi[i+1];
1680: pj = b->j + bi[i+1];
1681: nz = bi[i+2] - bi[i+1];
1682: for (j=0; j<nz; j++) {
1683: col = pj[j];
1684: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1685: }
1686: /* U part */
1687: pc2 = b->a + bdiag[i+2]+1;
1688: pj = b->j + bdiag[i+2]+1;
1689: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1690: for (j=0; j<nz; j++) {
1691: col = pj[j];
1692: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1693: }
1695: sctx.rs = rs;
1696: sctx.pv = rtmp2[i+1];
1697: MatPivotCheck(B,A,info,&sctx,i+1);
1698: if (sctx.newshift) break;
1699: pc2 = b->a + bdiag[i+1];
1700: *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1702: /* Now take care of 2nd column of diagonal 4x4 block. */
1703: pc3 = rtmp3 + i+1;
1704: pc4 = rtmp4 + i+1;
1705: if (*pc3 != 0.0 || *pc4 != 0.0) {
1706: mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1707: mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1708: pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */
1709: nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1710: for (j=0; j<nz; j++) {
1711: col = pj[j];
1712: rtmp3[col] -= mul3 * rtmp2[col];
1713: rtmp4[col] -= mul4 * rtmp2[col];
1714: }
1715: PetscLogFlops(4.0*nz);
1716: }
1718: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1719: rs = 0.0;
1720: /* L part */
1721: pc3 = b->a + bi[i+2];
1722: pj = b->j + bi[i+2];
1723: nz = bi[i+3] - bi[i+2];
1724: for (j=0; j<nz; j++) {
1725: col = pj[j];
1726: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1727: }
1728: /* U part */
1729: pc3 = b->a + bdiag[i+3]+1;
1730: pj = b->j + bdiag[i+3]+1;
1731: nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1732: for (j=0; j<nz; j++) {
1733: col = pj[j];
1734: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1735: }
1737: sctx.rs = rs;
1738: sctx.pv = rtmp3[i+2];
1739: MatPivotCheck(B,A,info,&sctx,i+2);
1740: if (sctx.newshift) break;
1741: pc3 = b->a + bdiag[i+2];
1742: *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1744: /* Now take care of 3rd column of diagonal 4x4 block. */
1745: pc4 = rtmp4 + i+2;
1746: if (*pc4 != 0.0) {
1747: mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1748: pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */
1749: nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1750: for (j=0; j<nz; j++) {
1751: col = pj[j];
1752: rtmp4[col] -= mul4 * rtmp3[col];
1753: }
1754: PetscLogFlops(1+2.0*nz);
1755: }
1757: /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1758: rs = 0.0;
1759: /* L part */
1760: pc4 = b->a + bi[i+3];
1761: pj = b->j + bi[i+3];
1762: nz = bi[i+4] - bi[i+3];
1763: for (j=0; j<nz; j++) {
1764: col = pj[j];
1765: pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1766: }
1767: /* U part */
1768: pc4 = b->a + bdiag[i+4]+1;
1769: pj = b->j + bdiag[i+4]+1;
1770: nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1771: for (j=0; j<nz; j++) {
1772: col = pj[j];
1773: pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1774: }
1776: sctx.rs = rs;
1777: sctx.pv = rtmp4[i+3];
1778: MatPivotCheck(B,A,info,&sctx,i+3);
1779: if (sctx.newshift) break;
1780: pc4 = b->a + bdiag[i+3];
1781: *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1782: break;
1784: default:
1785: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported ");
1786: }
1787: if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1788: i += nodesz; /* Update the row */
1789: }
1791: /* MatPivotRefine() */
1792: if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1793: /*
1794: * if no shift in this attempt & shifting & started shifting & can refine,
1795: * then try lower shift
1796: */
1797: sctx.shift_hi = sctx.shift_fraction;
1798: sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1799: sctx.shift_amount = sctx.shift_fraction * sctx.shift_top;
1800: sctx.newshift = PETSC_TRUE;
1801: sctx.nshift++;
1802: }
1803: } while (sctx.newshift);
1805: PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);
1806: PetscFree(tmp_vec2);
1807: ISRestoreIndices(isicol,&ic);
1808: ISRestoreIndices(isrow,&r);
1810: if (b->inode.size) {
1811: C->ops->solve = MatSolve_SeqAIJ_Inode;
1812: } else {
1813: C->ops->solve = MatSolve_SeqAIJ;
1814: }
1815: C->ops->solveadd = MatSolveAdd_SeqAIJ;
1816: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ;
1817: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1818: C->ops->matsolve = MatMatSolve_SeqAIJ;
1819: C->assembled = PETSC_TRUE;
1820: C->preallocated = PETSC_TRUE;
1822: PetscLogFlops(C->cmap->n);
1824: /* MatShiftView(A,info,&sctx) */
1825: if (sctx.nshift) {
1826: if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1827: PetscInfo(A,"number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
1828: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1829: PetscInfo(A,"number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
1830: } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1831: PetscInfo(A,"number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);
1832: }
1833: }
1834: return 0;
1835: }
1837: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1838: {
1839: Mat C = B;
1840: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1841: IS iscol = b->col,isrow = b->row,isicol = b->icol;
1842: const PetscInt *r,*ic,*c,*ics;
1843: PetscInt n = A->rmap->n,*bi = b->i;
1844: PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1845: PetscInt i,j,idx,*bd = b->diag,node_max,nodesz;
1846: PetscInt *ai = a->i,*aj = a->j;
1847: PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1848: PetscScalar mul1,mul2,mul3,tmp;
1849: MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1850: const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1851: PetscReal rs=0.0;
1852: FactorShiftCtx sctx;
1854: sctx.shift_top = 0;
1855: sctx.nshift_max = 0;
1856: sctx.shift_lo = 0;
1857: sctx.shift_hi = 0;
1858: sctx.shift_fraction = 0;
1860: /* if both shift schemes are chosen by user, only use info->shiftpd */
1861: if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1862: sctx.shift_top = 0;
1863: for (i=0; i<n; i++) {
1864: /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1865: rs = 0.0;
1866: ajtmp = aj + ai[i];
1867: rtmp1 = aa + ai[i];
1868: nz = ai[i+1] - ai[i];
1869: for (j=0; j<nz; j++) {
1870: if (*ajtmp != i) {
1871: rs += PetscAbsScalar(*rtmp1++);
1872: } else {
1873: rs -= PetscRealPart(*rtmp1++);
1874: }
1875: ajtmp++;
1876: }
1877: if (rs>sctx.shift_top) sctx.shift_top = rs;
1878: }
1879: if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1880: sctx.shift_top *= 1.1;
1881: sctx.nshift_max = 5;
1882: sctx.shift_lo = 0.;
1883: sctx.shift_hi = 1.;
1884: }
1885: sctx.shift_amount = 0;
1886: sctx.nshift = 0;
1888: ISGetIndices(isrow,&r);
1889: ISGetIndices(iscol,&c);
1890: ISGetIndices(isicol,&ic);
1891: PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);
1892: ics = ic;
1894: node_max = a->inode.node_count;
1895: ns = a->inode.size;
1898: /* If max inode size > 3, split it into two inodes.*/
1899: /* also map the inode sizes according to the ordering */
1900: PetscMalloc1(n+1,&tmp_vec1);
1901: for (i=0,j=0; i<node_max; ++i,++j) {
1902: if (ns[i]>3) {
1903: tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */
1904: ++j;
1905: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1906: } else {
1907: tmp_vec1[j] = ns[i];
1908: }
1909: }
1910: /* Use the correct node_max */
1911: node_max = j;
1913: /* Now reorder the inode info based on mat re-ordering info */
1914: /* First create a row -> inode_size_array_index map */
1915: PetscMalloc2(n+1,&nsmap,node_max+1,&tmp_vec2);
1916: for (i=0,row=0; i<node_max; i++) {
1917: nodesz = tmp_vec1[i];
1918: for (j=0; j<nodesz; j++,row++) {
1919: nsmap[row] = i;
1920: }
1921: }
1922: /* Using nsmap, create a reordered ns structure */
1923: for (i=0,j=0; i< node_max; i++) {
1924: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1925: tmp_vec2[i] = nodesz;
1926: j += nodesz;
1927: }
1928: PetscFree2(nsmap,tmp_vec1);
1929: /* Now use the correct ns */
1930: ns = tmp_vec2;
1932: do {
1933: sctx.newshift = PETSC_FALSE;
1934: /* Now loop over each block-row, and do the factorization */
1935: for (i=0,row=0; i<node_max; i++) {
1936: nodesz = ns[i];
1937: nz = bi[row+1] - bi[row];
1938: bjtmp = bj + bi[row];
1940: switch (nodesz) {
1941: case 1:
1942: for (j=0; j<nz; j++) {
1943: idx = bjtmp[j];
1944: rtmp11[idx] = 0.0;
1945: }
1947: /* load in initial (unfactored row) */
1948: idx = r[row];
1949: nz_tmp = ai[idx+1] - ai[idx];
1950: ajtmp = aj + ai[idx];
1951: v1 = aa + ai[idx];
1953: for (j=0; j<nz_tmp; j++) {
1954: idx = ics[ajtmp[j]];
1955: rtmp11[idx] = v1[j];
1956: }
1957: rtmp11[ics[r[row]]] += sctx.shift_amount;
1959: prow = *bjtmp++;
1960: while (prow < row) {
1961: pc1 = rtmp11 + prow;
1962: if (*pc1 != 0.0) {
1963: pv = ba + bd[prow];
1964: pj = nbj + bd[prow];
1965: mul1 = *pc1 * *pv++;
1966: *pc1 = mul1;
1967: nz_tmp = bi[prow+1] - bd[prow] - 1;
1968: PetscLogFlops(1+2.0*nz_tmp);
1969: for (j=0; j<nz_tmp; j++) {
1970: tmp = pv[j];
1971: idx = pj[j];
1972: rtmp11[idx] -= mul1 * tmp;
1973: }
1974: }
1975: prow = *bjtmp++;
1976: }
1977: pj = bj + bi[row];
1978: pc1 = ba + bi[row];
1980: sctx.pv = rtmp11[row];
1981: rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
1982: rs = 0.0;
1983: for (j=0; j<nz; j++) {
1984: idx = pj[j];
1985: pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
1986: if (idx != row) rs += PetscAbsScalar(pc1[j]);
1987: }
1988: sctx.rs = rs;
1989: MatPivotCheck(B,A,info,&sctx,row);
1990: if (sctx.newshift) goto endofwhile;
1991: break;
1993: case 2:
1994: for (j=0; j<nz; j++) {
1995: idx = bjtmp[j];
1996: rtmp11[idx] = 0.0;
1997: rtmp22[idx] = 0.0;
1998: }
2000: /* load in initial (unfactored row) */
2001: idx = r[row];
2002: nz_tmp = ai[idx+1] - ai[idx];
2003: ajtmp = aj + ai[idx];
2004: v1 = aa + ai[idx];
2005: v2 = aa + ai[idx+1];
2006: for (j=0; j<nz_tmp; j++) {
2007: idx = ics[ajtmp[j]];
2008: rtmp11[idx] = v1[j];
2009: rtmp22[idx] = v2[j];
2010: }
2011: rtmp11[ics[r[row]]] += sctx.shift_amount;
2012: rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2014: prow = *bjtmp++;
2015: while (prow < row) {
2016: pc1 = rtmp11 + prow;
2017: pc2 = rtmp22 + prow;
2018: if (*pc1 != 0.0 || *pc2 != 0.0) {
2019: pv = ba + bd[prow];
2020: pj = nbj + bd[prow];
2021: mul1 = *pc1 * *pv;
2022: mul2 = *pc2 * *pv;
2023: ++pv;
2024: *pc1 = mul1;
2025: *pc2 = mul2;
2027: nz_tmp = bi[prow+1] - bd[prow] - 1;
2028: for (j=0; j<nz_tmp; j++) {
2029: tmp = pv[j];
2030: idx = pj[j];
2031: rtmp11[idx] -= mul1 * tmp;
2032: rtmp22[idx] -= mul2 * tmp;
2033: }
2034: PetscLogFlops(2+4.0*nz_tmp);
2035: }
2036: prow = *bjtmp++;
2037: }
2039: /* Now take care of diagonal 2x2 block. Note: prow = row here */
2040: pc1 = rtmp11 + prow;
2041: pc2 = rtmp22 + prow;
2043: sctx.pv = *pc1;
2044: pj = bj + bi[prow];
2045: rs = 0.0;
2046: for (j=0; j<nz; j++) {
2047: idx = pj[j];
2048: if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2049: }
2050: sctx.rs = rs;
2051: MatPivotCheck(B,A,info,&sctx,row);
2052: if (sctx.newshift) goto endofwhile;
2054: if (*pc2 != 0.0) {
2055: pj = nbj + bd[prow];
2056: mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2057: *pc2 = mul2;
2058: nz_tmp = bi[prow+1] - bd[prow] - 1;
2059: for (j=0; j<nz_tmp; j++) {
2060: idx = pj[j];
2061: tmp = rtmp11[idx];
2062: rtmp22[idx] -= mul2 * tmp;
2063: }
2064: PetscLogFlops(1+2.0*nz_tmp);
2065: }
2067: pj = bj + bi[row];
2068: pc1 = ba + bi[row];
2069: pc2 = ba + bi[row+1];
2071: sctx.pv = rtmp22[row+1];
2072: rs = 0.0;
2073: rtmp11[row] = 1.0/rtmp11[row];
2074: rtmp22[row+1] = 1.0/rtmp22[row+1];
2075: /* copy row entries from dense representation to sparse */
2076: for (j=0; j<nz; j++) {
2077: idx = pj[j];
2078: pc1[j] = rtmp11[idx];
2079: pc2[j] = rtmp22[idx];
2080: if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2081: }
2082: sctx.rs = rs;
2083: MatPivotCheck(B,A,info,&sctx,row+1);
2084: if (sctx.newshift) goto endofwhile;
2085: break;
2087: case 3:
2088: for (j=0; j<nz; j++) {
2089: idx = bjtmp[j];
2090: rtmp11[idx] = 0.0;
2091: rtmp22[idx] = 0.0;
2092: rtmp33[idx] = 0.0;
2093: }
2094: /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2095: idx = r[row];
2096: nz_tmp = ai[idx+1] - ai[idx];
2097: ajtmp = aj + ai[idx];
2098: v1 = aa + ai[idx];
2099: v2 = aa + ai[idx+1];
2100: v3 = aa + ai[idx+2];
2101: for (j=0; j<nz_tmp; j++) {
2102: idx = ics[ajtmp[j]];
2103: rtmp11[idx] = v1[j];
2104: rtmp22[idx] = v2[j];
2105: rtmp33[idx] = v3[j];
2106: }
2107: rtmp11[ics[r[row]]] += sctx.shift_amount;
2108: rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2109: rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2111: /* loop over all pivot row blocks above this row block */
2112: prow = *bjtmp++;
2113: while (prow < row) {
2114: pc1 = rtmp11 + prow;
2115: pc2 = rtmp22 + prow;
2116: pc3 = rtmp33 + prow;
2117: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2118: pv = ba + bd[prow];
2119: pj = nbj + bd[prow];
2120: mul1 = *pc1 * *pv;
2121: mul2 = *pc2 * *pv;
2122: mul3 = *pc3 * *pv;
2123: ++pv;
2124: *pc1 = mul1;
2125: *pc2 = mul2;
2126: *pc3 = mul3;
2128: nz_tmp = bi[prow+1] - bd[prow] - 1;
2129: /* update this row based on pivot row */
2130: for (j=0; j<nz_tmp; j++) {
2131: tmp = pv[j];
2132: idx = pj[j];
2133: rtmp11[idx] -= mul1 * tmp;
2134: rtmp22[idx] -= mul2 * tmp;
2135: rtmp33[idx] -= mul3 * tmp;
2136: }
2137: PetscLogFlops(3+6.0*nz_tmp);
2138: }
2139: prow = *bjtmp++;
2140: }
2142: /* Now take care of diagonal 3x3 block in this set of rows */
2143: /* note: prow = row here */
2144: pc1 = rtmp11 + prow;
2145: pc2 = rtmp22 + prow;
2146: pc3 = rtmp33 + prow;
2148: sctx.pv = *pc1;
2149: pj = bj + bi[prow];
2150: rs = 0.0;
2151: for (j=0; j<nz; j++) {
2152: idx = pj[j];
2153: if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2154: }
2155: sctx.rs = rs;
2156: MatPivotCheck(B,A,info,&sctx,row);
2157: if (sctx.newshift) goto endofwhile;
2159: if (*pc2 != 0.0 || *pc3 != 0.0) {
2160: mul2 = (*pc2)/(*pc1);
2161: mul3 = (*pc3)/(*pc1);
2162: *pc2 = mul2;
2163: *pc3 = mul3;
2164: nz_tmp = bi[prow+1] - bd[prow] - 1;
2165: pj = nbj + bd[prow];
2166: for (j=0; j<nz_tmp; j++) {
2167: idx = pj[j];
2168: tmp = rtmp11[idx];
2169: rtmp22[idx] -= mul2 * tmp;
2170: rtmp33[idx] -= mul3 * tmp;
2171: }
2172: PetscLogFlops(2+4.0*nz_tmp);
2173: }
2174: ++prow;
2176: pc2 = rtmp22 + prow;
2177: pc3 = rtmp33 + prow;
2178: sctx.pv = *pc2;
2179: pj = bj + bi[prow];
2180: rs = 0.0;
2181: for (j=0; j<nz; j++) {
2182: idx = pj[j];
2183: if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2184: }
2185: sctx.rs = rs;
2186: MatPivotCheck(B,A,info,&sctx,row+1);
2187: if (sctx.newshift) goto endofwhile;
2189: if (*pc3 != 0.0) {
2190: mul3 = (*pc3)/(*pc2);
2191: *pc3 = mul3;
2192: pj = nbj + bd[prow];
2193: nz_tmp = bi[prow+1] - bd[prow] - 1;
2194: for (j=0; j<nz_tmp; j++) {
2195: idx = pj[j];
2196: tmp = rtmp22[idx];
2197: rtmp33[idx] -= mul3 * tmp;
2198: }
2199: PetscLogFlops(1+2.0*nz_tmp);
2200: }
2202: pj = bj + bi[row];
2203: pc1 = ba + bi[row];
2204: pc2 = ba + bi[row+1];
2205: pc3 = ba + bi[row+2];
2207: sctx.pv = rtmp33[row+2];
2208: rs = 0.0;
2209: rtmp11[row] = 1.0/rtmp11[row];
2210: rtmp22[row+1] = 1.0/rtmp22[row+1];
2211: rtmp33[row+2] = 1.0/rtmp33[row+2];
2212: /* copy row entries from dense representation to sparse */
2213: for (j=0; j<nz; j++) {
2214: idx = pj[j];
2215: pc1[j] = rtmp11[idx];
2216: pc2[j] = rtmp22[idx];
2217: pc3[j] = rtmp33[idx];
2218: if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2219: }
2221: sctx.rs = rs;
2222: MatPivotCheck(B,A,info,&sctx,row+2);
2223: if (sctx.newshift) goto endofwhile;
2224: break;
2226: default:
2227: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported ");
2228: }
2229: row += nodesz; /* Update the row */
2230: }
2231: endofwhile:;
2232: } while (sctx.newshift);
2233: PetscFree3(rtmp11,rtmp22,rtmp33);
2234: PetscFree(tmp_vec2);
2235: ISRestoreIndices(isicol,&ic);
2236: ISRestoreIndices(isrow,&r);
2237: ISRestoreIndices(iscol,&c);
2239: (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2240: /* do not set solve add, since MatSolve_Inode + Add is faster */
2241: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace;
2242: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2243: C->assembled = PETSC_TRUE;
2244: C->preallocated = PETSC_TRUE;
2245: if (sctx.nshift) {
2246: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2247: PetscInfo(A,"number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
2248: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2249: PetscInfo(A,"number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
2250: }
2251: }
2252: PetscLogFlops(C->cmap->n);
2253: MatSeqAIJCheckInode(C);
2254: return 0;
2255: }
2257: /* ----------------------------------------------------------- */
2258: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2259: {
2260: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2261: IS iscol = a->col,isrow = a->row;
2262: const PetscInt *r,*c,*rout,*cout;
2263: PetscInt i,j,n = A->rmap->n;
2264: PetscInt node_max,row,nsz,aii,i0,i1,nz;
2265: const PetscInt *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2266: PetscScalar *x,*tmp,*tmps,tmp0,tmp1;
2267: PetscScalar sum1,sum2,sum3,sum4,sum5;
2268: const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2269: const PetscScalar *b;
2272: node_max = a->inode.node_count;
2273: ns = a->inode.size; /* Node Size array */
2275: VecGetArrayRead(bb,&b);
2276: VecGetArrayWrite(xx,&x);
2277: tmp = a->solve_work;
2279: ISGetIndices(isrow,&rout); r = rout;
2280: ISGetIndices(iscol,&cout); c = cout;
2282: /* forward solve the lower triangular */
2283: tmps = tmp;
2284: aa = a_a;
2285: aj = a_j;
2286: ad = a->diag;
2288: for (i = 0,row = 0; i< node_max; ++i) {
2289: nsz = ns[i];
2290: aii = ai[row];
2291: v1 = aa + aii;
2292: vi = aj + aii;
2293: nz = ai[row+1]- ai[row];
2295: if (i < node_max-1) {
2296: /* Prefetch the indices for the next block */
2297: PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2298: /* Prefetch the data for the next block */
2299: PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2300: }
2302: switch (nsz) { /* Each loop in 'case' is unrolled */
2303: case 1:
2304: sum1 = b[r[row]];
2305: for (j=0; j<nz-1; j+=2) {
2306: i0 = vi[j];
2307: i1 = vi[j+1];
2308: tmp0 = tmps[i0];
2309: tmp1 = tmps[i1];
2310: sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2311: }
2312: if (j == nz-1) {
2313: tmp0 = tmps[vi[j]];
2314: sum1 -= v1[j]*tmp0;
2315: }
2316: tmp[row++]=sum1;
2317: break;
2318: case 2:
2319: sum1 = b[r[row]];
2320: sum2 = b[r[row+1]];
2321: v2 = aa + ai[row+1];
2323: for (j=0; j<nz-1; j+=2) {
2324: i0 = vi[j];
2325: i1 = vi[j+1];
2326: tmp0 = tmps[i0];
2327: tmp1 = tmps[i1];
2328: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2329: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2330: }
2331: if (j == nz-1) {
2332: tmp0 = tmps[vi[j]];
2333: sum1 -= v1[j] *tmp0;
2334: sum2 -= v2[j] *tmp0;
2335: }
2336: sum2 -= v2[nz] * sum1;
2337: tmp[row++]=sum1;
2338: tmp[row++]=sum2;
2339: break;
2340: case 3:
2341: sum1 = b[r[row]];
2342: sum2 = b[r[row+1]];
2343: sum3 = b[r[row+2]];
2344: v2 = aa + ai[row+1];
2345: v3 = aa + ai[row+2];
2347: for (j=0; j<nz-1; j+=2) {
2348: i0 = vi[j];
2349: i1 = vi[j+1];
2350: tmp0 = tmps[i0];
2351: tmp1 = tmps[i1];
2352: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2353: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2354: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2355: }
2356: if (j == nz-1) {
2357: tmp0 = tmps[vi[j]];
2358: sum1 -= v1[j] *tmp0;
2359: sum2 -= v2[j] *tmp0;
2360: sum3 -= v3[j] *tmp0;
2361: }
2362: sum2 -= v2[nz] * sum1;
2363: sum3 -= v3[nz] * sum1;
2364: sum3 -= v3[nz+1] * sum2;
2365: tmp[row++]=sum1;
2366: tmp[row++]=sum2;
2367: tmp[row++]=sum3;
2368: break;
2370: case 4:
2371: sum1 = b[r[row]];
2372: sum2 = b[r[row+1]];
2373: sum3 = b[r[row+2]];
2374: sum4 = b[r[row+3]];
2375: v2 = aa + ai[row+1];
2376: v3 = aa + ai[row+2];
2377: v4 = aa + ai[row+3];
2379: for (j=0; j<nz-1; j+=2) {
2380: i0 = vi[j];
2381: i1 = vi[j+1];
2382: tmp0 = tmps[i0];
2383: tmp1 = tmps[i1];
2384: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2385: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2386: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2387: sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2388: }
2389: if (j == nz-1) {
2390: tmp0 = tmps[vi[j]];
2391: sum1 -= v1[j] *tmp0;
2392: sum2 -= v2[j] *tmp0;
2393: sum3 -= v3[j] *tmp0;
2394: sum4 -= v4[j] *tmp0;
2395: }
2396: sum2 -= v2[nz] * sum1;
2397: sum3 -= v3[nz] * sum1;
2398: sum4 -= v4[nz] * sum1;
2399: sum3 -= v3[nz+1] * sum2;
2400: sum4 -= v4[nz+1] * sum2;
2401: sum4 -= v4[nz+2] * sum3;
2403: tmp[row++]=sum1;
2404: tmp[row++]=sum2;
2405: tmp[row++]=sum3;
2406: tmp[row++]=sum4;
2407: break;
2408: case 5:
2409: sum1 = b[r[row]];
2410: sum2 = b[r[row+1]];
2411: sum3 = b[r[row+2]];
2412: sum4 = b[r[row+3]];
2413: sum5 = b[r[row+4]];
2414: v2 = aa + ai[row+1];
2415: v3 = aa + ai[row+2];
2416: v4 = aa + ai[row+3];
2417: v5 = aa + ai[row+4];
2419: for (j=0; j<nz-1; j+=2) {
2420: i0 = vi[j];
2421: i1 = vi[j+1];
2422: tmp0 = tmps[i0];
2423: tmp1 = tmps[i1];
2424: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2425: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2426: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2427: sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2428: sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2429: }
2430: if (j == nz-1) {
2431: tmp0 = tmps[vi[j]];
2432: sum1 -= v1[j] *tmp0;
2433: sum2 -= v2[j] *tmp0;
2434: sum3 -= v3[j] *tmp0;
2435: sum4 -= v4[j] *tmp0;
2436: sum5 -= v5[j] *tmp0;
2437: }
2439: sum2 -= v2[nz] * sum1;
2440: sum3 -= v3[nz] * sum1;
2441: sum4 -= v4[nz] * sum1;
2442: sum5 -= v5[nz] * sum1;
2443: sum3 -= v3[nz+1] * sum2;
2444: sum4 -= v4[nz+1] * sum2;
2445: sum5 -= v5[nz+1] * sum2;
2446: sum4 -= v4[nz+2] * sum3;
2447: sum5 -= v5[nz+2] * sum3;
2448: sum5 -= v5[nz+3] * sum4;
2450: tmp[row++]=sum1;
2451: tmp[row++]=sum2;
2452: tmp[row++]=sum3;
2453: tmp[row++]=sum4;
2454: tmp[row++]=sum5;
2455: break;
2456: default:
2457: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported ");
2458: }
2459: }
2460: /* backward solve the upper triangular */
2461: for (i=node_max -1,row = n-1; i>=0; i--) {
2462: nsz = ns[i];
2463: aii = ad[row+1] + 1;
2464: v1 = aa + aii;
2465: vi = aj + aii;
2466: nz = ad[row]- ad[row+1] - 1;
2468: if (i > 0) {
2469: /* Prefetch the indices for the next block */
2470: PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2471: /* Prefetch the data for the next block */
2472: PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2473: }
2475: switch (nsz) { /* Each loop in 'case' is unrolled */
2476: case 1:
2477: sum1 = tmp[row];
2479: for (j=0; j<nz-1; j+=2) {
2480: i0 = vi[j];
2481: i1 = vi[j+1];
2482: tmp0 = tmps[i0];
2483: tmp1 = tmps[i1];
2484: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2485: }
2486: if (j == nz-1) {
2487: tmp0 = tmps[vi[j]];
2488: sum1 -= v1[j]*tmp0;
2489: }
2490: x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2491: break;
2492: case 2:
2493: sum1 = tmp[row];
2494: sum2 = tmp[row-1];
2495: v2 = aa + ad[row] + 1;
2496: for (j=0; j<nz-1; j+=2) {
2497: i0 = vi[j];
2498: i1 = vi[j+1];
2499: tmp0 = tmps[i0];
2500: tmp1 = tmps[i1];
2501: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2502: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2503: }
2504: if (j == nz-1) {
2505: tmp0 = tmps[vi[j]];
2506: sum1 -= v1[j]* tmp0;
2507: sum2 -= v2[j+1]* tmp0;
2508: }
2510: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2511: sum2 -= v2[0] * tmp0;
2512: x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2513: break;
2514: case 3:
2515: sum1 = tmp[row];
2516: sum2 = tmp[row -1];
2517: sum3 = tmp[row -2];
2518: v2 = aa + ad[row] + 1;
2519: v3 = aa + ad[row -1] + 1;
2520: for (j=0; j<nz-1; j+=2) {
2521: i0 = vi[j];
2522: i1 = vi[j+1];
2523: tmp0 = tmps[i0];
2524: tmp1 = tmps[i1];
2525: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2526: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2527: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2528: }
2529: if (j== nz-1) {
2530: tmp0 = tmps[vi[j]];
2531: sum1 -= v1[j] * tmp0;
2532: sum2 -= v2[j+1] * tmp0;
2533: sum3 -= v3[j+2] * tmp0;
2534: }
2535: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2536: sum2 -= v2[0]* tmp0;
2537: sum3 -= v3[1] * tmp0;
2538: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2539: sum3 -= v3[0]* tmp0;
2540: x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2542: break;
2543: case 4:
2544: sum1 = tmp[row];
2545: sum2 = tmp[row -1];
2546: sum3 = tmp[row -2];
2547: sum4 = tmp[row -3];
2548: v2 = aa + ad[row]+1;
2549: v3 = aa + ad[row -1]+1;
2550: v4 = aa + ad[row -2]+1;
2552: for (j=0; j<nz-1; j+=2) {
2553: i0 = vi[j];
2554: i1 = vi[j+1];
2555: tmp0 = tmps[i0];
2556: tmp1 = tmps[i1];
2557: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2558: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2559: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2560: sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2561: }
2562: if (j== nz-1) {
2563: tmp0 = tmps[vi[j]];
2564: sum1 -= v1[j] * tmp0;
2565: sum2 -= v2[j+1] * tmp0;
2566: sum3 -= v3[j+2] * tmp0;
2567: sum4 -= v4[j+3] * tmp0;
2568: }
2570: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2571: sum2 -= v2[0] * tmp0;
2572: sum3 -= v3[1] * tmp0;
2573: sum4 -= v4[2] * tmp0;
2574: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2575: sum3 -= v3[0] * tmp0;
2576: sum4 -= v4[1] * tmp0;
2577: tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2578: sum4 -= v4[0] * tmp0;
2579: x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2580: break;
2581: case 5:
2582: sum1 = tmp[row];
2583: sum2 = tmp[row -1];
2584: sum3 = tmp[row -2];
2585: sum4 = tmp[row -3];
2586: sum5 = tmp[row -4];
2587: v2 = aa + ad[row]+1;
2588: v3 = aa + ad[row -1]+1;
2589: v4 = aa + ad[row -2]+1;
2590: v5 = aa + ad[row -3]+1;
2591: for (j=0; j<nz-1; j+=2) {
2592: i0 = vi[j];
2593: i1 = vi[j+1];
2594: tmp0 = tmps[i0];
2595: tmp1 = tmps[i1];
2596: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2597: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2598: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2599: sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2600: sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2601: }
2602: if (j==nz-1) {
2603: tmp0 = tmps[vi[j]];
2604: sum1 -= v1[j] * tmp0;
2605: sum2 -= v2[j+1] * tmp0;
2606: sum3 -= v3[j+2] * tmp0;
2607: sum4 -= v4[j+3] * tmp0;
2608: sum5 -= v5[j+4] * tmp0;
2609: }
2611: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2612: sum2 -= v2[0] * tmp0;
2613: sum3 -= v3[1] * tmp0;
2614: sum4 -= v4[2] * tmp0;
2615: sum5 -= v5[3] * tmp0;
2616: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2617: sum3 -= v3[0] * tmp0;
2618: sum4 -= v4[1] * tmp0;
2619: sum5 -= v5[2] * tmp0;
2620: tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2621: sum4 -= v4[0] * tmp0;
2622: sum5 -= v5[1] * tmp0;
2623: tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2624: sum5 -= v5[0] * tmp0;
2625: x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2626: break;
2627: default:
2628: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported ");
2629: }
2630: }
2631: ISRestoreIndices(isrow,&rout);
2632: ISRestoreIndices(iscol,&cout);
2633: VecRestoreArrayRead(bb,&b);
2634: VecRestoreArrayWrite(xx,&x);
2635: PetscLogFlops(2.0*a->nz - A->cmap->n);
2636: return 0;
2637: }
2639: /*
2640: Makes a longer coloring[] array and calls the usual code with that
2641: */
2642: PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2643: {
2644: Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data;
2645: PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2646: PetscInt *colorused,i;
2647: ISColoringValue *newcolor;
2650: PetscMalloc1(n+1,&newcolor);
2651: /* loop over inodes, marking a color for each column*/
2652: row = 0;
2653: for (i=0; i<m; i++) {
2654: for (j=0; j<ns[i]; j++) {
2655: newcolor[row++] = coloring[i] + j*ncolors;
2656: }
2657: }
2659: /* eliminate unneeded colors */
2660: PetscCalloc1(5*ncolors,&colorused);
2661: for (i=0; i<n; i++) {
2662: colorused[newcolor[i]] = 1;
2663: }
2665: for (i=1; i<5*ncolors; i++) {
2666: colorused[i] += colorused[i-1];
2667: }
2668: ncolors = colorused[5*ncolors-1];
2669: for (i=0; i<n; i++) {
2670: newcolor[i] = colorused[newcolor[i]]-1;
2671: }
2672: PetscFree(colorused);
2673: ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);
2674: PetscFree(coloring);
2675: return 0;
2676: }
2678: #include <petsc/private/kernels/blockinvert.h>
2680: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2681: {
2682: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2683: PetscScalar sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2684: MatScalar *ibdiag,*bdiag,work[25],*t;
2685: PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2686: const MatScalar *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2687: const PetscScalar *xb, *b;
2688: PetscReal zeropivot = 100.*PETSC_MACHINE_EPSILON, shift = 0.0;
2689: PetscInt n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2690: PetscInt sz,k,ipvt[5];
2691: PetscBool allowzeropivot,zeropivotdetected;
2692: const PetscInt *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;
2694: allowzeropivot = PetscNot(A->erroriffailure);
2699: if (!a->inode.ibdiagvalid) {
2700: if (!a->inode.ibdiag) {
2701: /* calculate space needed for diagonal blocks */
2702: for (i=0; i<m; i++) {
2703: cnt += sizes[i]*sizes[i];
2704: }
2705: a->inode.bdiagsize = cnt;
2707: PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);
2708: }
2710: /* copy over the diagonal blocks and invert them */
2711: ibdiag = a->inode.ibdiag;
2712: bdiag = a->inode.bdiag;
2713: cnt = 0;
2714: for (i=0, row = 0; i<m; i++) {
2715: for (j=0; j<sizes[i]; j++) {
2716: for (k=0; k<sizes[i]; k++) {
2717: bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2718: }
2719: }
2720: PetscArraycpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]);
2722: switch (sizes[i]) {
2723: case 1:
2724: /* Create matrix data structure */
2725: if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2726: if (allowzeropivot) {
2727: A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2728: A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2729: A->factorerror_zeropivot_row = row;
2730: PetscInfo(A,"Zero pivot, row %" PetscInt_FMT "\n",row);
2731: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %" PetscInt_FMT,row);
2732: }
2733: ibdiag[cnt] = 1.0/ibdiag[cnt];
2734: break;
2735: case 2:
2736: PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2737: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2738: break;
2739: case 3:
2740: PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2741: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2742: break;
2743: case 4:
2744: PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2745: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2746: break;
2747: case 5:
2748: PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);
2749: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2750: break;
2751: default:
2752: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]);
2753: }
2754: cnt += sizes[i]*sizes[i];
2755: row += sizes[i];
2756: }
2757: a->inode.ibdiagvalid = PETSC_TRUE;
2758: }
2759: ibdiag = a->inode.ibdiag;
2760: bdiag = a->inode.bdiag;
2761: t = a->inode.ssor_work;
2763: VecGetArray(xx,&x);
2764: VecGetArrayRead(bb,&b);
2765: /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2766: if (flag & SOR_ZERO_INITIAL_GUESS) {
2767: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2769: for (i=0, row=0; i<m; i++) {
2770: sz = diag[row] - ii[row];
2771: v1 = a->a + ii[row];
2772: idx = a->j + ii[row];
2774: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2775: switch (sizes[i]) {
2776: case 1:
2778: sum1 = b[row];
2779: for (n = 0; n<sz-1; n+=2) {
2780: i1 = idx[0];
2781: i2 = idx[1];
2782: idx += 2;
2783: tmp0 = x[i1];
2784: tmp1 = x[i2];
2785: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2786: }
2788: if (n == sz-1) {
2789: tmp0 = x[*idx];
2790: sum1 -= *v1 * tmp0;
2791: }
2792: t[row] = sum1;
2793: x[row++] = sum1*(*ibdiag++);
2794: break;
2795: case 2:
2796: v2 = a->a + ii[row+1];
2797: sum1 = b[row];
2798: sum2 = b[row+1];
2799: for (n = 0; n<sz-1; n+=2) {
2800: i1 = idx[0];
2801: i2 = idx[1];
2802: idx += 2;
2803: tmp0 = x[i1];
2804: tmp1 = x[i2];
2805: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2806: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2807: }
2809: if (n == sz-1) {
2810: tmp0 = x[*idx];
2811: sum1 -= v1[0] * tmp0;
2812: sum2 -= v2[0] * tmp0;
2813: }
2814: t[row] = sum1;
2815: t[row+1] = sum2;
2816: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2817: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2818: ibdiag += 4;
2819: break;
2820: case 3:
2821: v2 = a->a + ii[row+1];
2822: v3 = a->a + ii[row+2];
2823: sum1 = b[row];
2824: sum2 = b[row+1];
2825: sum3 = b[row+2];
2826: for (n = 0; n<sz-1; n+=2) {
2827: i1 = idx[0];
2828: i2 = idx[1];
2829: idx += 2;
2830: tmp0 = x[i1];
2831: tmp1 = x[i2];
2832: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2833: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2834: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2835: }
2837: if (n == sz-1) {
2838: tmp0 = x[*idx];
2839: sum1 -= v1[0] * tmp0;
2840: sum2 -= v2[0] * tmp0;
2841: sum3 -= v3[0] * tmp0;
2842: }
2843: t[row] = sum1;
2844: t[row+1] = sum2;
2845: t[row+2] = sum3;
2846: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2847: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2848: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2849: ibdiag += 9;
2850: break;
2851: case 4:
2852: v2 = a->a + ii[row+1];
2853: v3 = a->a + ii[row+2];
2854: v4 = a->a + ii[row+3];
2855: sum1 = b[row];
2856: sum2 = b[row+1];
2857: sum3 = b[row+2];
2858: sum4 = b[row+3];
2859: for (n = 0; n<sz-1; n+=2) {
2860: i1 = idx[0];
2861: i2 = idx[1];
2862: idx += 2;
2863: tmp0 = x[i1];
2864: tmp1 = x[i2];
2865: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2866: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2867: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2868: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2869: }
2871: if (n == sz-1) {
2872: tmp0 = x[*idx];
2873: sum1 -= v1[0] * tmp0;
2874: sum2 -= v2[0] * tmp0;
2875: sum3 -= v3[0] * tmp0;
2876: sum4 -= v4[0] * tmp0;
2877: }
2878: t[row] = sum1;
2879: t[row+1] = sum2;
2880: t[row+2] = sum3;
2881: t[row+3] = sum4;
2882: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2883: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2884: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2885: x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2886: ibdiag += 16;
2887: break;
2888: case 5:
2889: v2 = a->a + ii[row+1];
2890: v3 = a->a + ii[row+2];
2891: v4 = a->a + ii[row+3];
2892: v5 = a->a + ii[row+4];
2893: sum1 = b[row];
2894: sum2 = b[row+1];
2895: sum3 = b[row+2];
2896: sum4 = b[row+3];
2897: sum5 = b[row+4];
2898: for (n = 0; n<sz-1; n+=2) {
2899: i1 = idx[0];
2900: i2 = idx[1];
2901: idx += 2;
2902: tmp0 = x[i1];
2903: tmp1 = x[i2];
2904: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2905: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2906: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2907: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2908: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2909: }
2911: if (n == sz-1) {
2912: tmp0 = x[*idx];
2913: sum1 -= v1[0] * tmp0;
2914: sum2 -= v2[0] * tmp0;
2915: sum3 -= v3[0] * tmp0;
2916: sum4 -= v4[0] * tmp0;
2917: sum5 -= v5[0] * tmp0;
2918: }
2919: t[row] = sum1;
2920: t[row+1] = sum2;
2921: t[row+2] = sum3;
2922: t[row+3] = sum4;
2923: t[row+4] = sum5;
2924: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2925: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2926: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2927: x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2928: x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2929: ibdiag += 25;
2930: break;
2931: default:
2932: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]);
2933: }
2934: }
2936: xb = t;
2937: PetscLogFlops(a->nz);
2938: } else xb = b;
2939: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
2941: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2942: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2943: ibdiag -= sizes[i]*sizes[i];
2944: sz = ii[row+1] - diag[row] - 1;
2945: v1 = a->a + diag[row] + 1;
2946: idx = a->j + diag[row] + 1;
2948: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2949: switch (sizes[i]) {
2950: case 1:
2952: sum1 = xb[row];
2953: for (n = 0; n<sz-1; n+=2) {
2954: i1 = idx[0];
2955: i2 = idx[1];
2956: idx += 2;
2957: tmp0 = x[i1];
2958: tmp1 = x[i2];
2959: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2960: }
2962: if (n == sz-1) {
2963: tmp0 = x[*idx];
2964: sum1 -= *v1*tmp0;
2965: }
2966: x[row--] = sum1*(*ibdiag);
2967: break;
2969: case 2:
2971: sum1 = xb[row];
2972: sum2 = xb[row-1];
2973: /* note that sum1 is associated with the second of the two rows */
2974: v2 = a->a + diag[row-1] + 2;
2975: for (n = 0; n<sz-1; n+=2) {
2976: i1 = idx[0];
2977: i2 = idx[1];
2978: idx += 2;
2979: tmp0 = x[i1];
2980: tmp1 = x[i2];
2981: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2982: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2983: }
2985: if (n == sz-1) {
2986: tmp0 = x[*idx];
2987: sum1 -= *v1*tmp0;
2988: sum2 -= *v2*tmp0;
2989: }
2990: x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
2991: x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
2992: break;
2993: case 3:
2995: sum1 = xb[row];
2996: sum2 = xb[row-1];
2997: sum3 = xb[row-2];
2998: v2 = a->a + diag[row-1] + 2;
2999: v3 = a->a + diag[row-2] + 3;
3000: for (n = 0; n<sz-1; n+=2) {
3001: i1 = idx[0];
3002: i2 = idx[1];
3003: idx += 2;
3004: tmp0 = x[i1];
3005: tmp1 = x[i2];
3006: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3007: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3008: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3009: }
3011: if (n == sz-1) {
3012: tmp0 = x[*idx];
3013: sum1 -= *v1*tmp0;
3014: sum2 -= *v2*tmp0;
3015: sum3 -= *v3*tmp0;
3016: }
3017: x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3018: x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3019: x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3020: break;
3021: case 4:
3023: sum1 = xb[row];
3024: sum2 = xb[row-1];
3025: sum3 = xb[row-2];
3026: sum4 = xb[row-3];
3027: v2 = a->a + diag[row-1] + 2;
3028: v3 = a->a + diag[row-2] + 3;
3029: v4 = a->a + diag[row-3] + 4;
3030: for (n = 0; n<sz-1; n+=2) {
3031: i1 = idx[0];
3032: i2 = idx[1];
3033: idx += 2;
3034: tmp0 = x[i1];
3035: tmp1 = x[i2];
3036: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3037: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3038: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3039: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3040: }
3042: if (n == sz-1) {
3043: tmp0 = x[*idx];
3044: sum1 -= *v1*tmp0;
3045: sum2 -= *v2*tmp0;
3046: sum3 -= *v3*tmp0;
3047: sum4 -= *v4*tmp0;
3048: }
3049: x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3050: x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3051: x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3052: x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3053: break;
3054: case 5:
3056: sum1 = xb[row];
3057: sum2 = xb[row-1];
3058: sum3 = xb[row-2];
3059: sum4 = xb[row-3];
3060: sum5 = xb[row-4];
3061: v2 = a->a + diag[row-1] + 2;
3062: v3 = a->a + diag[row-2] + 3;
3063: v4 = a->a + diag[row-3] + 4;
3064: v5 = a->a + diag[row-4] + 5;
3065: for (n = 0; n<sz-1; n+=2) {
3066: i1 = idx[0];
3067: i2 = idx[1];
3068: idx += 2;
3069: tmp0 = x[i1];
3070: tmp1 = x[i2];
3071: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3072: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3073: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3074: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3075: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3076: }
3078: if (n == sz-1) {
3079: tmp0 = x[*idx];
3080: sum1 -= *v1*tmp0;
3081: sum2 -= *v2*tmp0;
3082: sum3 -= *v3*tmp0;
3083: sum4 -= *v4*tmp0;
3084: sum5 -= *v5*tmp0;
3085: }
3086: x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3087: x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3088: x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3089: x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3090: x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3091: break;
3092: default:
3093: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]);
3094: }
3095: }
3097: PetscLogFlops(a->nz);
3098: }
3099: its--;
3100: }
3101: while (its--) {
3103: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3104: for (i=0, row=0, ibdiag = a->inode.ibdiag;
3105: i<m;
3106: row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {
3108: sz = diag[row] - ii[row];
3109: v1 = a->a + ii[row];
3110: idx = a->j + ii[row];
3111: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3112: switch (sizes[i]) {
3113: case 1:
3114: sum1 = b[row];
3115: for (n = 0; n<sz-1; n+=2) {
3116: i1 = idx[0];
3117: i2 = idx[1];
3118: idx += 2;
3119: tmp0 = x[i1];
3120: tmp1 = x[i2];
3121: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3122: }
3123: if (n == sz-1) {
3124: tmp0 = x[*idx++];
3125: sum1 -= *v1 * tmp0;
3126: v1++;
3127: }
3128: t[row] = sum1;
3129: sz = ii[row+1] - diag[row] - 1;
3130: idx = a->j + diag[row] + 1;
3131: v1 += 1;
3132: for (n = 0; n<sz-1; n+=2) {
3133: i1 = idx[0];
3134: i2 = idx[1];
3135: idx += 2;
3136: tmp0 = x[i1];
3137: tmp1 = x[i2];
3138: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3139: }
3140: if (n == sz-1) {
3141: tmp0 = x[*idx++];
3142: sum1 -= *v1 * tmp0;
3143: }
3144: /* in MatSOR_SeqAIJ this line would be
3145: *
3146: * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3147: *
3148: * but omega == 1, so this becomes
3149: *
3150: * x[row] = sum1*(*ibdiag++);
3151: *
3152: */
3153: x[row] = sum1*(*ibdiag);
3154: break;
3155: case 2:
3156: v2 = a->a + ii[row+1];
3157: sum1 = b[row];
3158: sum2 = b[row+1];
3159: for (n = 0; n<sz-1; n+=2) {
3160: i1 = idx[0];
3161: i2 = idx[1];
3162: idx += 2;
3163: tmp0 = x[i1];
3164: tmp1 = x[i2];
3165: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3166: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3167: }
3168: if (n == sz-1) {
3169: tmp0 = x[*idx++];
3170: sum1 -= v1[0] * tmp0;
3171: sum2 -= v2[0] * tmp0;
3172: v1++; v2++;
3173: }
3174: t[row] = sum1;
3175: t[row+1] = sum2;
3176: sz = ii[row+1] - diag[row] - 2;
3177: idx = a->j + diag[row] + 2;
3178: v1 += 2;
3179: v2 += 2;
3180: for (n = 0; n<sz-1; n+=2) {
3181: i1 = idx[0];
3182: i2 = idx[1];
3183: idx += 2;
3184: tmp0 = x[i1];
3185: tmp1 = x[i2];
3186: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3187: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3188: }
3189: if (n == sz-1) {
3190: tmp0 = x[*idx];
3191: sum1 -= v1[0] * tmp0;
3192: sum2 -= v2[0] * tmp0;
3193: }
3194: x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3195: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3196: break;
3197: case 3:
3198: v2 = a->a + ii[row+1];
3199: v3 = a->a + ii[row+2];
3200: sum1 = b[row];
3201: sum2 = b[row+1];
3202: sum3 = b[row+2];
3203: for (n = 0; n<sz-1; n+=2) {
3204: i1 = idx[0];
3205: i2 = idx[1];
3206: idx += 2;
3207: tmp0 = x[i1];
3208: tmp1 = x[i2];
3209: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3210: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3211: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3212: }
3213: if (n == sz-1) {
3214: tmp0 = x[*idx++];
3215: sum1 -= v1[0] * tmp0;
3216: sum2 -= v2[0] * tmp0;
3217: sum3 -= v3[0] * tmp0;
3218: v1++; v2++; v3++;
3219: }
3220: t[row] = sum1;
3221: t[row+1] = sum2;
3222: t[row+2] = sum3;
3223: sz = ii[row+1] - diag[row] - 3;
3224: idx = a->j + diag[row] + 3;
3225: v1 += 3;
3226: v2 += 3;
3227: v3 += 3;
3228: for (n = 0; n<sz-1; n+=2) {
3229: i1 = idx[0];
3230: i2 = idx[1];
3231: idx += 2;
3232: tmp0 = x[i1];
3233: tmp1 = x[i2];
3234: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3235: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3236: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3237: }
3238: if (n == sz-1) {
3239: tmp0 = x[*idx];
3240: sum1 -= v1[0] * tmp0;
3241: sum2 -= v2[0] * tmp0;
3242: sum3 -= v3[0] * tmp0;
3243: }
3244: x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3245: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3246: x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3247: break;
3248: case 4:
3249: v2 = a->a + ii[row+1];
3250: v3 = a->a + ii[row+2];
3251: v4 = a->a + ii[row+3];
3252: sum1 = b[row];
3253: sum2 = b[row+1];
3254: sum3 = b[row+2];
3255: sum4 = b[row+3];
3256: for (n = 0; n<sz-1; n+=2) {
3257: i1 = idx[0];
3258: i2 = idx[1];
3259: idx += 2;
3260: tmp0 = x[i1];
3261: tmp1 = x[i2];
3262: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3263: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3264: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3265: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3266: }
3267: if (n == sz-1) {
3268: tmp0 = x[*idx++];
3269: sum1 -= v1[0] * tmp0;
3270: sum2 -= v2[0] * tmp0;
3271: sum3 -= v3[0] * tmp0;
3272: sum4 -= v4[0] * tmp0;
3273: v1++; v2++; v3++; v4++;
3274: }
3275: t[row] = sum1;
3276: t[row+1] = sum2;
3277: t[row+2] = sum3;
3278: t[row+3] = sum4;
3279: sz = ii[row+1] - diag[row] - 4;
3280: idx = a->j + diag[row] + 4;
3281: v1 += 4;
3282: v2 += 4;
3283: v3 += 4;
3284: v4 += 4;
3285: for (n = 0; n<sz-1; n+=2) {
3286: i1 = idx[0];
3287: i2 = idx[1];
3288: idx += 2;
3289: tmp0 = x[i1];
3290: tmp1 = x[i2];
3291: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3292: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3293: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3294: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3295: }
3296: if (n == sz-1) {
3297: tmp0 = x[*idx];
3298: sum1 -= v1[0] * tmp0;
3299: sum2 -= v2[0] * tmp0;
3300: sum3 -= v3[0] * tmp0;
3301: sum4 -= v4[0] * tmp0;
3302: }
3303: x[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3304: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3305: x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3306: x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3307: break;
3308: case 5:
3309: v2 = a->a + ii[row+1];
3310: v3 = a->a + ii[row+2];
3311: v4 = a->a + ii[row+3];
3312: v5 = a->a + ii[row+4];
3313: sum1 = b[row];
3314: sum2 = b[row+1];
3315: sum3 = b[row+2];
3316: sum4 = b[row+3];
3317: sum5 = b[row+4];
3318: for (n = 0; n<sz-1; n+=2) {
3319: i1 = idx[0];
3320: i2 = idx[1];
3321: idx += 2;
3322: tmp0 = x[i1];
3323: tmp1 = x[i2];
3324: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3325: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3326: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3327: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3328: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3329: }
3330: if (n == sz-1) {
3331: tmp0 = x[*idx++];
3332: sum1 -= v1[0] * tmp0;
3333: sum2 -= v2[0] * tmp0;
3334: sum3 -= v3[0] * tmp0;
3335: sum4 -= v4[0] * tmp0;
3336: sum5 -= v5[0] * tmp0;
3337: v1++; v2++; v3++; v4++; v5++;
3338: }
3339: t[row] = sum1;
3340: t[row+1] = sum2;
3341: t[row+2] = sum3;
3342: t[row+3] = sum4;
3343: t[row+4] = sum5;
3344: sz = ii[row+1] - diag[row] - 5;
3345: idx = a->j + diag[row] + 5;
3346: v1 += 5;
3347: v2 += 5;
3348: v3 += 5;
3349: v4 += 5;
3350: v5 += 5;
3351: for (n = 0; n<sz-1; n+=2) {
3352: i1 = idx[0];
3353: i2 = idx[1];
3354: idx += 2;
3355: tmp0 = x[i1];
3356: tmp1 = x[i2];
3357: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3358: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3359: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3360: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3361: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3362: }
3363: if (n == sz-1) {
3364: tmp0 = x[*idx];
3365: sum1 -= v1[0] * tmp0;
3366: sum2 -= v2[0] * tmp0;
3367: sum3 -= v3[0] * tmp0;
3368: sum4 -= v4[0] * tmp0;
3369: sum5 -= v5[0] * tmp0;
3370: }
3371: x[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3372: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3373: x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3374: x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3375: x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3376: break;
3377: default:
3378: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]);
3379: }
3380: }
3381: xb = t;
3382: PetscLogFlops(2.0*a->nz); /* undercounts diag inverse */
3383: } else xb = b;
3385: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3387: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3388: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3389: ibdiag -= sizes[i]*sizes[i];
3391: /* set RHS */
3392: if (xb == b) {
3393: /* whole (old way) */
3394: sz = ii[row+1] - ii[row];
3395: idx = a->j + ii[row];
3396: switch (sizes[i]) {
3397: case 5:
3398: v5 = a->a + ii[row-4];
3399: case 4: /* fall through */
3400: v4 = a->a + ii[row-3];
3401: case 3:
3402: v3 = a->a + ii[row-2];
3403: case 2:
3404: v2 = a->a + ii[row-1];
3405: case 1:
3406: v1 = a->a + ii[row];
3407: break;
3408: default:
3409: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]);
3410: }
3411: } else {
3412: /* upper, no diag */
3413: sz = ii[row+1] - diag[row] - 1;
3414: idx = a->j + diag[row] + 1;
3415: switch (sizes[i]) {
3416: case 5:
3417: v5 = a->a + diag[row-4] + 5;
3418: case 4: /* fall through */
3419: v4 = a->a + diag[row-3] + 4;
3420: case 3:
3421: v3 = a->a + diag[row-2] + 3;
3422: case 2:
3423: v2 = a->a + diag[row-1] + 2;
3424: case 1:
3425: v1 = a->a + diag[row] + 1;
3426: }
3427: }
3428: /* set sum */
3429: switch (sizes[i]) {
3430: case 5:
3431: sum5 = xb[row-4];
3432: case 4: /* fall through */
3433: sum4 = xb[row-3];
3434: case 3:
3435: sum3 = xb[row-2];
3436: case 2:
3437: sum2 = xb[row-1];
3438: case 1:
3439: /* note that sum1 is associated with the last row */
3440: sum1 = xb[row];
3441: }
3442: /* do sums */
3443: for (n = 0; n<sz-1; n+=2) {
3444: i1 = idx[0];
3445: i2 = idx[1];
3446: idx += 2;
3447: tmp0 = x[i1];
3448: tmp1 = x[i2];
3449: switch (sizes[i]) {
3450: case 5:
3451: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3452: case 4: /* fall through */
3453: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3454: case 3:
3455: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3456: case 2:
3457: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3458: case 1:
3459: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3460: }
3461: }
3462: /* ragged edge */
3463: if (n == sz-1) {
3464: tmp0 = x[*idx];
3465: switch (sizes[i]) {
3466: case 5:
3467: sum5 -= *v5*tmp0;
3468: case 4: /* fall through */
3469: sum4 -= *v4*tmp0;
3470: case 3:
3471: sum3 -= *v3*tmp0;
3472: case 2:
3473: sum2 -= *v2*tmp0;
3474: case 1:
3475: sum1 -= *v1*tmp0;
3476: }
3477: }
3478: /* update */
3479: if (xb == b) {
3480: /* whole (old way) w/ diag */
3481: switch (sizes[i]) {
3482: case 5:
3483: x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3484: x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3485: x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3486: x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3487: x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3488: break;
3489: case 4:
3490: x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3491: x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3492: x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3493: x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3494: break;
3495: case 3:
3496: x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3497: x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3498: x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3499: break;
3500: case 2:
3501: x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3502: x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3503: break;
3504: case 1:
3505: x[row--] += sum1*(*ibdiag);
3506: break;
3507: }
3508: } else {
3509: /* no diag so set = */
3510: switch (sizes[i]) {
3511: case 5:
3512: x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3513: x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3514: x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3515: x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3516: x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3517: break;
3518: case 4:
3519: x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3520: x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3521: x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3522: x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3523: break;
3524: case 3:
3525: x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3526: x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3527: x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3528: break;
3529: case 2:
3530: x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3531: x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3532: break;
3533: case 1:
3534: x[row--] = sum1*(*ibdiag);
3535: break;
3536: }
3537: }
3538: }
3539: if (xb == b) {
3540: PetscLogFlops(2.0*a->nz);
3541: } else {
3542: PetscLogFlops(a->nz); /* assumes 1/2 in upper, undercounts diag inverse */
3543: }
3544: }
3545: }
3546: if (flag & SOR_EISENSTAT) {
3547: /*
3548: Apply (U + D)^-1 where D is now the block diagonal
3549: */
3550: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3551: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3552: ibdiag -= sizes[i]*sizes[i];
3553: sz = ii[row+1] - diag[row] - 1;
3554: v1 = a->a + diag[row] + 1;
3555: idx = a->j + diag[row] + 1;
3556: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3557: switch (sizes[i]) {
3558: case 1:
3560: sum1 = b[row];
3561: for (n = 0; n<sz-1; n+=2) {
3562: i1 = idx[0];
3563: i2 = idx[1];
3564: idx += 2;
3565: tmp0 = x[i1];
3566: tmp1 = x[i2];
3567: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3568: }
3570: if (n == sz-1) {
3571: tmp0 = x[*idx];
3572: sum1 -= *v1*tmp0;
3573: }
3574: x[row] = sum1*(*ibdiag);row--;
3575: break;
3577: case 2:
3579: sum1 = b[row];
3580: sum2 = b[row-1];
3581: /* note that sum1 is associated with the second of the two rows */
3582: v2 = a->a + diag[row-1] + 2;
3583: for (n = 0; n<sz-1; n+=2) {
3584: i1 = idx[0];
3585: i2 = idx[1];
3586: idx += 2;
3587: tmp0 = x[i1];
3588: tmp1 = x[i2];
3589: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3590: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3591: }
3593: if (n == sz-1) {
3594: tmp0 = x[*idx];
3595: sum1 -= *v1*tmp0;
3596: sum2 -= *v2*tmp0;
3597: }
3598: x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
3599: x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3600: row -= 2;
3601: break;
3602: case 3:
3604: sum1 = b[row];
3605: sum2 = b[row-1];
3606: sum3 = b[row-2];
3607: v2 = a->a + diag[row-1] + 2;
3608: v3 = a->a + diag[row-2] + 3;
3609: for (n = 0; n<sz-1; n+=2) {
3610: i1 = idx[0];
3611: i2 = idx[1];
3612: idx += 2;
3613: tmp0 = x[i1];
3614: tmp1 = x[i2];
3615: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3616: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3617: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3618: }
3620: if (n == sz-1) {
3621: tmp0 = x[*idx];
3622: sum1 -= *v1*tmp0;
3623: sum2 -= *v2*tmp0;
3624: sum3 -= *v3*tmp0;
3625: }
3626: x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3627: x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3628: x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3629: row -= 3;
3630: break;
3631: case 4:
3633: sum1 = b[row];
3634: sum2 = b[row-1];
3635: sum3 = b[row-2];
3636: sum4 = b[row-3];
3637: v2 = a->a + diag[row-1] + 2;
3638: v3 = a->a + diag[row-2] + 3;
3639: v4 = a->a + diag[row-3] + 4;
3640: for (n = 0; n<sz-1; n+=2) {
3641: i1 = idx[0];
3642: i2 = idx[1];
3643: idx += 2;
3644: tmp0 = x[i1];
3645: tmp1 = x[i2];
3646: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3647: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3648: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3649: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3650: }
3652: if (n == sz-1) {
3653: tmp0 = x[*idx];
3654: sum1 -= *v1*tmp0;
3655: sum2 -= *v2*tmp0;
3656: sum3 -= *v3*tmp0;
3657: sum4 -= *v4*tmp0;
3658: }
3659: x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3660: x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3661: x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3662: x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3663: row -= 4;
3664: break;
3665: case 5:
3667: sum1 = b[row];
3668: sum2 = b[row-1];
3669: sum3 = b[row-2];
3670: sum4 = b[row-3];
3671: sum5 = b[row-4];
3672: v2 = a->a + diag[row-1] + 2;
3673: v3 = a->a + diag[row-2] + 3;
3674: v4 = a->a + diag[row-3] + 4;
3675: v5 = a->a + diag[row-4] + 5;
3676: for (n = 0; n<sz-1; n+=2) {
3677: i1 = idx[0];
3678: i2 = idx[1];
3679: idx += 2;
3680: tmp0 = x[i1];
3681: tmp1 = x[i2];
3682: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3683: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3684: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3685: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3686: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3687: }
3689: if (n == sz-1) {
3690: tmp0 = x[*idx];
3691: sum1 -= *v1*tmp0;
3692: sum2 -= *v2*tmp0;
3693: sum3 -= *v3*tmp0;
3694: sum4 -= *v4*tmp0;
3695: sum5 -= *v5*tmp0;
3696: }
3697: x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3698: x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3699: x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3700: x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3701: x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3702: row -= 5;
3703: break;
3704: default:
3705: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]);
3706: }
3707: }
3708: PetscLogFlops(a->nz);
3710: /*
3711: t = b - D x where D is the block diagonal
3712: */
3713: cnt = 0;
3714: for (i=0, row=0; i<m; i++) {
3715: switch (sizes[i]) {
3716: case 1:
3717: t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3718: break;
3719: case 2:
3720: x1 = x[row]; x2 = x[row+1];
3721: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3722: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3723: t[row] = b[row] - tmp1;
3724: t[row+1] = b[row+1] - tmp2; row += 2;
3725: cnt += 4;
3726: break;
3727: case 3:
3728: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2];
3729: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3730: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3731: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3732: t[row] = b[row] - tmp1;
3733: t[row+1] = b[row+1] - tmp2;
3734: t[row+2] = b[row+2] - tmp3; row += 3;
3735: cnt += 9;
3736: break;
3737: case 4:
3738: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3739: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3740: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3741: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3742: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3743: t[row] = b[row] - tmp1;
3744: t[row+1] = b[row+1] - tmp2;
3745: t[row+2] = b[row+2] - tmp3;
3746: t[row+3] = b[row+3] - tmp4; row += 4;
3747: cnt += 16;
3748: break;
3749: case 5:
3750: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3751: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3752: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3753: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3754: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3755: tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3756: t[row] = b[row] - tmp1;
3757: t[row+1] = b[row+1] - tmp2;
3758: t[row+2] = b[row+2] - tmp3;
3759: t[row+3] = b[row+3] - tmp4;
3760: t[row+4] = b[row+4] - tmp5;row += 5;
3761: cnt += 25;
3762: break;
3763: default:
3764: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]);
3765: }
3766: }
3767: PetscLogFlops(m);
3769: /*
3770: Apply (L + D)^-1 where D is the block diagonal
3771: */
3772: for (i=0, row=0; i<m; i++) {
3773: sz = diag[row] - ii[row];
3774: v1 = a->a + ii[row];
3775: idx = a->j + ii[row];
3776: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3777: switch (sizes[i]) {
3778: case 1:
3780: sum1 = t[row];
3781: for (n = 0; n<sz-1; n+=2) {
3782: i1 = idx[0];
3783: i2 = idx[1];
3784: idx += 2;
3785: tmp0 = t[i1];
3786: tmp1 = t[i2];
3787: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3788: }
3790: if (n == sz-1) {
3791: tmp0 = t[*idx];
3792: sum1 -= *v1 * tmp0;
3793: }
3794: x[row] += t[row] = sum1*(*ibdiag++); row++;
3795: break;
3796: case 2:
3797: v2 = a->a + ii[row+1];
3798: sum1 = t[row];
3799: sum2 = t[row+1];
3800: for (n = 0; n<sz-1; n+=2) {
3801: i1 = idx[0];
3802: i2 = idx[1];
3803: idx += 2;
3804: tmp0 = t[i1];
3805: tmp1 = t[i2];
3806: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3807: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3808: }
3810: if (n == sz-1) {
3811: tmp0 = t[*idx];
3812: sum1 -= v1[0] * tmp0;
3813: sum2 -= v2[0] * tmp0;
3814: }
3815: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3816: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3817: ibdiag += 4; row += 2;
3818: break;
3819: case 3:
3820: v2 = a->a + ii[row+1];
3821: v3 = a->a + ii[row+2];
3822: sum1 = t[row];
3823: sum2 = t[row+1];
3824: sum3 = t[row+2];
3825: for (n = 0; n<sz-1; n+=2) {
3826: i1 = idx[0];
3827: i2 = idx[1];
3828: idx += 2;
3829: tmp0 = t[i1];
3830: tmp1 = t[i2];
3831: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3832: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3833: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3834: }
3836: if (n == sz-1) {
3837: tmp0 = t[*idx];
3838: sum1 -= v1[0] * tmp0;
3839: sum2 -= v2[0] * tmp0;
3840: sum3 -= v3[0] * tmp0;
3841: }
3842: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3843: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3844: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3845: ibdiag += 9; row += 3;
3846: break;
3847: case 4:
3848: v2 = a->a + ii[row+1];
3849: v3 = a->a + ii[row+2];
3850: v4 = a->a + ii[row+3];
3851: sum1 = t[row];
3852: sum2 = t[row+1];
3853: sum3 = t[row+2];
3854: sum4 = t[row+3];
3855: for (n = 0; n<sz-1; n+=2) {
3856: i1 = idx[0];
3857: i2 = idx[1];
3858: idx += 2;
3859: tmp0 = t[i1];
3860: tmp1 = t[i2];
3861: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3862: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3863: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3864: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3865: }
3867: if (n == sz-1) {
3868: tmp0 = t[*idx];
3869: sum1 -= v1[0] * tmp0;
3870: sum2 -= v2[0] * tmp0;
3871: sum3 -= v3[0] * tmp0;
3872: sum4 -= v4[0] * tmp0;
3873: }
3874: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3875: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3876: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3877: x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3878: ibdiag += 16; row += 4;
3879: break;
3880: case 5:
3881: v2 = a->a + ii[row+1];
3882: v3 = a->a + ii[row+2];
3883: v4 = a->a + ii[row+3];
3884: v5 = a->a + ii[row+4];
3885: sum1 = t[row];
3886: sum2 = t[row+1];
3887: sum3 = t[row+2];
3888: sum4 = t[row+3];
3889: sum5 = t[row+4];
3890: for (n = 0; n<sz-1; n+=2) {
3891: i1 = idx[0];
3892: i2 = idx[1];
3893: idx += 2;
3894: tmp0 = t[i1];
3895: tmp1 = t[i2];
3896: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3897: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3898: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3899: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3900: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3901: }
3903: if (n == sz-1) {
3904: tmp0 = t[*idx];
3905: sum1 -= v1[0] * tmp0;
3906: sum2 -= v2[0] * tmp0;
3907: sum3 -= v3[0] * tmp0;
3908: sum4 -= v4[0] * tmp0;
3909: sum5 -= v5[0] * tmp0;
3910: }
3911: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3912: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3913: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3914: x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3915: x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3916: ibdiag += 25; row += 5;
3917: break;
3918: default:
3919: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]);
3920: }
3921: }
3922: PetscLogFlops(a->nz);
3923: }
3924: VecRestoreArray(xx,&x);
3925: VecRestoreArrayRead(bb,&b);
3926: return 0;
3927: }
3929: PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3930: {
3931: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3932: PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3933: const MatScalar *bdiag = a->inode.bdiag;
3934: const PetscScalar *b;
3935: PetscInt m = a->inode.node_count,cnt = 0,i,row;
3936: const PetscInt *sizes = a->inode.size;
3939: VecGetArray(xx,&x);
3940: VecGetArrayRead(bb,&b);
3941: cnt = 0;
3942: for (i=0, row=0; i<m; i++) {
3943: switch (sizes[i]) {
3944: case 1:
3945: x[row] = b[row]*bdiag[cnt++];row++;
3946: break;
3947: case 2:
3948: x1 = b[row]; x2 = b[row+1];
3949: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3950: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3951: x[row++] = tmp1;
3952: x[row++] = tmp2;
3953: cnt += 4;
3954: break;
3955: case 3:
3956: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2];
3957: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3958: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3959: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3960: x[row++] = tmp1;
3961: x[row++] = tmp2;
3962: x[row++] = tmp3;
3963: cnt += 9;
3964: break;
3965: case 4:
3966: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
3967: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3968: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3969: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3970: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3971: x[row++] = tmp1;
3972: x[row++] = tmp2;
3973: x[row++] = tmp3;
3974: x[row++] = tmp4;
3975: cnt += 16;
3976: break;
3977: case 5:
3978: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
3979: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3980: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3981: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3982: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3983: tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3984: x[row++] = tmp1;
3985: x[row++] = tmp2;
3986: x[row++] = tmp3;
3987: x[row++] = tmp4;
3988: x[row++] = tmp5;
3989: cnt += 25;
3990: break;
3991: default:
3992: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]);
3993: }
3994: }
3995: PetscLogFlops(2.0*cnt);
3996: VecRestoreArray(xx,&x);
3997: VecRestoreArrayRead(bb,&b);
3998: return 0;
3999: }
4001: static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4002: {
4003: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4005: a->inode.node_count = 0;
4006: a->inode.use = PETSC_FALSE;
4007: a->inode.checked = PETSC_FALSE;
4008: a->inode.mat_nonzerostate = -1;
4009: A->ops->getrowij = MatGetRowIJ_SeqAIJ;
4010: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ;
4011: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ;
4012: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ;
4013: A->ops->coloringpatch = NULL;
4014: A->ops->multdiagonalblock = NULL;
4015: if (A->factortype) {
4016: A->ops->solve = MatSolve_SeqAIJ_inplace;
4017: }
4018: return 0;
4019: }
4021: /*
4022: samestructure indicates that the matrix has not changed its nonzero structure so we
4023: do not need to recompute the inodes
4024: */
4025: PetscErrorCode MatSeqAIJCheckInode(Mat A)
4026: {
4027: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4028: PetscInt i,j,m,nzx,nzy,*ns,node_count,blk_size;
4029: PetscBool flag;
4030: const PetscInt *idx,*idy,*ii;
4032: if (!a->inode.use) {
4033: MatSeqAIJ_Inode_ResetOps(A);
4034: PetscFree(a->inode.size);
4035: return 0;
4036: }
4037: if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) return 0;
4039: m = A->rmap->n;
4040: if (!a->inode.size) PetscMalloc1(m+1,&a->inode.size);
4041: ns = a->inode.size;
4043: i = 0;
4044: node_count = 0;
4045: idx = a->j;
4046: ii = a->i;
4047: while (i < m) { /* For each row */
4048: nzx = ii[i+1] - ii[i]; /* Number of nonzeros */
4049: /* Limits the number of elements in a node to 'a->inode.limit' */
4050: for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4051: nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */
4052: if (nzy != nzx) break;
4053: idy += nzx; /* Same nonzero pattern */
4054: PetscArraycmp(idx,idy,nzx,&flag);
4055: if (!flag) break;
4056: }
4057: ns[node_count++] = blk_size;
4058: idx += blk_size*nzx;
4059: i = j;
4060: }
4062: /* If not enough inodes found,, do not use inode version of the routines */
4063: if (!m || node_count > .8*m) {
4064: MatSeqAIJ_Inode_ResetOps(A);
4065: PetscFree(a->inode.size);
4066: PetscInfo(A,"Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n",node_count,m);
4067: } else {
4068: if (!A->factortype) {
4069: A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4070: if (A->rmap->n == A->cmap->n) {
4071: A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4072: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4073: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4074: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4075: A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4076: }
4077: } else {
4078: A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4079: }
4080: a->inode.node_count = node_count;
4081: PetscInfo(A,"Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n",node_count,m,a->inode.limit);
4082: }
4083: a->inode.checked = PETSC_TRUE;
4084: a->inode.mat_nonzerostate = A->nonzerostate;
4085: return 0;
4086: }
4088: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4089: {
4090: Mat B =*C;
4091: Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4092: PetscInt m=A->rmap->n;
4094: c->inode.use = a->inode.use;
4095: c->inode.limit = a->inode.limit;
4096: c->inode.max_limit = a->inode.max_limit;
4097: c->inode.checked = PETSC_FALSE;
4098: c->inode.size = NULL;
4099: c->inode.node_count = 0;
4100: c->inode.ibdiagvalid = PETSC_FALSE;
4101: c->inode.ibdiag = NULL;
4102: c->inode.bdiag = NULL;
4103: c->inode.mat_nonzerostate = -1;
4104: if (a->inode.use) {
4105: if (a->inode.checked && a->inode.size) {
4106: PetscMalloc1(m+1,&c->inode.size);
4107: PetscArraycpy(c->inode.size,a->inode.size,m+1);
4109: c->inode.checked = PETSC_TRUE;
4110: c->inode.node_count = a->inode.node_count;
4111: c->inode.mat_nonzerostate = (*C)->nonzerostate;
4112: }
4113: /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4114: if (!B->factortype) {
4115: B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4116: B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4117: B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4118: B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4119: B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4120: B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4121: } else {
4122: B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4123: }
4124: }
4125: return 0;
4126: }
4128: static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4129: {
4130: PetscInt k;
4131: const PetscInt *vi;
4133: vi = aj + ai[row];
4134: for (k=0; k<nzl; k++) cols[k] = vi[k];
4135: vi = aj + adiag[row];
4136: cols[nzl] = vi[0];
4137: vi = aj + adiag[row+1]+1;
4138: for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4139: return 0;
4140: }
4141: /*
4142: MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4143: Modified from MatSeqAIJCheckInode().
4145: Input Parameters:
4146: . Mat A - ILU or LU matrix factor
4148: */
4149: PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4150: {
4151: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4152: PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4153: PetscInt *cols1,*cols2,*ns;
4154: const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4155: PetscBool flag;
4157: if (!a->inode.use) return 0;
4158: if (a->inode.checked) return 0;
4160: m = A->rmap->n;
4161: if (a->inode.size) ns = a->inode.size;
4162: else {
4163: PetscMalloc1(m+1,&ns);
4164: }
4166: i = 0;
4167: node_count = 0;
4168: PetscMalloc2(m,&cols1,m,&cols2);
4169: while (i < m) { /* For each row */
4170: nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */
4171: nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4172: nzx = nzl1 + nzu1 + 1;
4173: MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4175: /* Limits the number of elements in a node to 'a->inode.limit' */
4176: for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4177: nzl2 = ai[j+1] - ai[j];
4178: nzu2 = adiag[j] - adiag[j+1] - 1;
4179: nzy = nzl2 + nzu2 + 1;
4180: if (nzy != nzx) break;
4181: MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
4182: PetscArraycmp(cols1,cols2,nzx,&flag);
4183: if (!flag) break;
4184: }
4185: ns[node_count++] = blk_size;
4186: i = j;
4187: }
4188: PetscFree2(cols1,cols2);
4189: /* If not enough inodes found,, do not use inode version of the routines */
4190: if (!m || node_count > .8*m) {
4191: PetscFree(ns);
4193: a->inode.node_count = 0;
4194: a->inode.size = NULL;
4195: a->inode.use = PETSC_FALSE;
4197: PetscInfo(A,"Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n",node_count,m);
4198: } else {
4199: A->ops->mult = NULL;
4200: A->ops->sor = NULL;
4201: A->ops->multadd = NULL;
4202: A->ops->getrowij = NULL;
4203: A->ops->restorerowij = NULL;
4204: A->ops->getcolumnij = NULL;
4205: A->ops->restorecolumnij = NULL;
4206: A->ops->coloringpatch = NULL;
4207: A->ops->multdiagonalblock = NULL;
4208: a->inode.node_count = node_count;
4209: a->inode.size = ns;
4211: PetscInfo(A,"Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n",node_count,m,a->inode.limit);
4212: }
4213: a->inode.checked = PETSC_TRUE;
4214: return 0;
4215: }
4217: PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4218: {
4219: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4221: a->inode.ibdiagvalid = PETSC_FALSE;
4222: return 0;
4223: }
4225: /*
4226: This is really ugly. if inodes are used this replaces the
4227: permutations with ones that correspond to rows/cols of the matrix
4228: rather then inode blocks
4229: */
4230: PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4231: {
4232: PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));
4233: return 0;
4234: }
4236: PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4237: {
4238: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4239: PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4240: const PetscInt *ridx,*cidx;
4241: PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx;
4242: PetscInt nslim_col,*ns_col;
4243: IS ris = *rperm,cis = *cperm;
4245: if (!a->inode.size) return 0; /* no inodes so return */
4246: if (a->inode.node_count == m) return 0; /* all inodes are of size 1 */
4248: MatCreateColInode_Private(A,&nslim_col,&ns_col);
4249: PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);
4250: PetscMalloc2(m,&permr,n,&permc);
4252: ISGetIndices(ris,&ridx);
4253: ISGetIndices(cis,&cidx);
4255: /* Form the inode structure for the rows of permuted matric using inv perm*/
4256: for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4258: /* Construct the permutations for rows*/
4259: for (i=0,row = 0; i<nslim_row; ++i) {
4260: indx = ridx[i];
4261: start_val = tns[indx];
4262: end_val = tns[indx + 1];
4263: for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4264: }
4266: /* Form the inode structure for the columns of permuted matrix using inv perm*/
4267: for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4269: /* Construct permutations for columns */
4270: for (i=0,col=0; i<nslim_col; ++i) {
4271: indx = cidx[i];
4272: start_val = tns[indx];
4273: end_val = tns[indx + 1];
4274: for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4275: }
4277: ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);
4278: ISSetPermutation(*rperm);
4279: ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);
4280: ISSetPermutation(*cperm);
4282: ISRestoreIndices(ris,&ridx);
4283: ISRestoreIndices(cis,&cidx);
4285: PetscFree(ns_col);
4286: PetscFree2(permr,permc);
4287: ISDestroy(&cis);
4288: ISDestroy(&ris);
4289: PetscFree(tns);
4290: return 0;
4291: }
4293: /*@C
4294: MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4296: Not Collective
4298: Input Parameter:
4299: . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4301: Output Parameters:
4302: + node_count - no of inodes present in the matrix.
4303: . sizes - an array of size node_count,with sizes of each inode.
4304: - limit - the max size used to generate the inodes.
4306: Level: advanced
4308: Notes:
4309: This routine returns some internal storage information
4310: of the matrix, it is intended to be used by advanced users.
4311: It should be called after the matrix is assembled.
4312: The contents of the sizes[] array should not be changed.
4313: NULL may be passed for information not requested.
4315: .seealso: MatGetInfo()
4316: @*/
4317: PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4318: {
4319: PetscErrorCode (*f)(Mat,PetscInt*,PetscInt**,PetscInt*);
4322: PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);
4323: if (f) (*f)(A,node_count,sizes,limit);
4324: return 0;
4325: }
4327: PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4328: {
4329: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4331: if (node_count) *node_count = a->inode.node_count;
4332: if (sizes) *sizes = a->inode.size;
4333: if (limit) *limit = a->inode.limit;
4334: return 0;
4335: }