Actual source code: inode.c
petsc-3.6.1 2015-08-06
2: /*
3: This file provides high performance routines for the Inode format (compressed sparse row)
4: by taking advantage of rows with identical nonzero structure (I-nodes).
5: */
6: #include <../src/mat/impls/aij/seq/aij.h>
10: static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns)
11: {
12: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
14: PetscInt i,count,m,n,min_mn,*ns_row,*ns_col;
17: n = A->cmap->n;
18: m = A->rmap->n;
19: ns_row = a->inode.size;
21: min_mn = (m < n) ? m : n;
22: if (!ns) {
23: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
24: for (; count+1 < n; count++,i++) ;
25: if (count < n) {
26: i++;
27: }
28: *size = i;
29: return(0);
30: }
31: PetscMalloc1(n+1,&ns_col);
33: /* Use the same row structure wherever feasible. */
34: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
35: ns_col[i] = ns_row[i];
36: }
38: /* if m < n; pad up the remainder with inode_limit */
39: for (; count+1 < n; count++,i++) {
40: ns_col[i] = 1;
41: }
42: /* The last node is the odd ball. padd it up with the remaining rows; */
43: if (count < n) {
44: ns_col[i] = n - count;
45: i++;
46: } else if (count > n) {
47: /* Adjust for the over estimation */
48: ns_col[i-1] += n - count;
49: }
50: *size = i;
51: *ns = ns_col;
52: return(0);
53: }
56: /*
57: This builds symmetric version of nonzero structure,
58: */
61: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
62: {
63: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
65: PetscInt *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
66: PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
67: const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;
70: nslim_row = a->inode.node_count;
71: m = A->rmap->n;
72: n = A->cmap->n;
73: if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
75: /* Use the row_inode as column_inode */
76: nslim_col = nslim_row;
77: ns_col = ns_row;
79: /* allocate space for reformated inode structure */
80: PetscMalloc1(nslim_col+1,&tns);
81: PetscMalloc1(n+1,&tvc);
82: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
84: for (i1=0,col=0; i1<nslim_col; ++i1) {
85: nsz = ns_col[i1];
86: for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
87: }
88: /* allocate space for row pointers */
89: PetscMalloc1(nslim_row+1,&ia);
90: *iia = ia;
91: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
92: PetscMalloc1(nslim_row+1,&work);
94: /* determine the number of columns in each row */
95: ia[0] = oshift;
96: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
98: j = aj + ai[row] + ishift;
99: jmax = aj + ai[row+1] + ishift;
100: col = *j++ + ishift;
101: i2 = tvc[col];
102: while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
103: ia[i1+1]++;
104: ia[i2+1]++;
105: i2++; /* Start col of next node */
106: while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
107: i2 = tvc[col];
108: }
109: if (i2 == i1) ia[i2+1]++; /* now the diagonal element */
110: }
112: /* shift ia[i] to point to next row */
113: for (i1=1; i1<nslim_row+1; i1++) {
114: row = ia[i1-1];
115: ia[i1] += row;
116: work[i1-1] = row - oshift;
117: }
119: /* allocate space for column pointers */
120: nz = ia[nslim_row] + (!ishift);
121: PetscMalloc1(nz,&ja);
122: *jja = ja;
124: /* loop over lower triangular part putting into ja */
125: for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
126: j = aj + ai[row] + ishift;
127: jmax = aj + ai[row+1] + ishift;
128: col = *j++ + ishift;
129: i2 = tvc[col];
130: while (i2<i1 && j<jmax) {
131: ja[work[i2]++] = i1 + oshift;
132: ja[work[i1]++] = i2 + oshift;
133: ++i2;
134: while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
135: i2 = tvc[col];
136: }
137: if (i2 == i1) ja[work[i1]++] = i2 + oshift;
139: }
140: PetscFree(work);
141: PetscFree(tns);
142: PetscFree(tvc);
143: return(0);
144: }
146: /*
147: This builds nonsymmetric version of nonzero structure,
148: */
151: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
152: {
153: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
155: PetscInt *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
156: PetscInt *tns,*tvc,nsz,i1,i2;
157: const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;
160: nslim_row = a->inode.node_count;
161: n = A->cmap->n;
163: /* Create The column_inode for this matrix */
164: Mat_CreateColInode(A,&nslim_col,&ns_col);
166: /* allocate space for reformated column_inode structure */
167: PetscMalloc1(nslim_col +1,&tns);
168: PetscMalloc1(n + 1,&tvc);
169: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
171: for (i1=0,col=0; i1<nslim_col; ++i1) {
172: nsz = ns_col[i1];
173: for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
174: }
175: /* allocate space for row pointers */
176: PetscMalloc1(nslim_row+1,&ia);
177: *iia = ia;
178: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
179: PetscMalloc1(nslim_row+1,&work);
181: /* determine the number of columns in each row */
182: ia[0] = oshift;
183: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
184: j = aj + ai[row] + ishift;
185: col = *j++ + ishift;
186: i2 = tvc[col];
187: nz = ai[row+1] - ai[row];
188: while (nz-- > 0) { /* off-diagonal elemets */
189: ia[i1+1]++;
190: i2++; /* Start col of next node */
191: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
192: if (nz > 0) i2 = tvc[col];
193: }
194: }
196: /* shift ia[i] to point to next row */
197: for (i1=1; i1<nslim_row+1; i1++) {
198: row = ia[i1-1];
199: ia[i1] += row;
200: work[i1-1] = row - oshift;
201: }
203: /* allocate space for column pointers */
204: nz = ia[nslim_row] + (!ishift);
205: PetscMalloc1(nz,&ja);
206: *jja = ja;
208: /* loop over matrix putting into ja */
209: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
210: j = aj + ai[row] + ishift;
211: col = *j++ + ishift;
212: i2 = tvc[col];
213: nz = ai[row+1] - ai[row];
214: while (nz-- > 0) {
215: ja[work[i1]++] = i2 + oshift;
216: ++i2;
217: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
218: if (nz > 0) i2 = tvc[col];
219: }
220: }
221: PetscFree(ns_col);
222: PetscFree(work);
223: PetscFree(tns);
224: PetscFree(tvc);
225: return(0);
226: }
230: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
231: {
232: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
236: *n = a->inode.node_count;
237: if (!ia) return(0);
238: if (!blockcompressed) {
239: MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
240: } else if (symmetric) {
241: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
242: } else {
243: MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
244: }
245: return(0);
246: }
250: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
251: {
255: if (!ia) return(0);
257: if (!blockcompressed) {
258: MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
259: } else {
260: PetscFree(*ia);
261: PetscFree(*ja);
262: }
263: return(0);
264: }
266: /* ----------------------------------------------------------- */
270: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
271: {
272: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
274: PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
275: PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
278: nslim_row = a->inode.node_count;
279: n = A->cmap->n;
281: /* Create The column_inode for this matrix */
282: Mat_CreateColInode(A,&nslim_col,&ns_col);
284: /* allocate space for reformated column_inode structure */
285: PetscMalloc1(nslim_col + 1,&tns);
286: PetscMalloc1(n + 1,&tvc);
287: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
289: for (i1=0,col=0; i1<nslim_col; ++i1) {
290: nsz = ns_col[i1];
291: for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
292: }
293: /* allocate space for column pointers */
294: PetscMalloc1(nslim_col+1,&ia);
295: *iia = ia;
296: PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));
297: PetscMalloc1(nslim_col+1,&work);
299: /* determine the number of columns in each row */
300: ia[0] = oshift;
301: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
302: j = aj + ai[row] + ishift;
303: col = *j++ + ishift;
304: i2 = tvc[col];
305: nz = ai[row+1] - ai[row];
306: while (nz-- > 0) { /* off-diagonal elemets */
307: /* ia[i1+1]++; */
308: ia[i2+1]++;
309: i2++;
310: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
311: if (nz > 0) i2 = tvc[col];
312: }
313: }
315: /* shift ia[i] to point to next col */
316: for (i1=1; i1<nslim_col+1; i1++) {
317: col = ia[i1-1];
318: ia[i1] += col;
319: work[i1-1] = col - oshift;
320: }
322: /* allocate space for column pointers */
323: nz = ia[nslim_col] + (!ishift);
324: PetscMalloc1(nz,&ja);
325: *jja = ja;
327: /* loop over matrix putting into ja */
328: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
329: j = aj + ai[row] + ishift;
330: col = *j++ + ishift;
331: i2 = tvc[col];
332: nz = ai[row+1] - ai[row];
333: while (nz-- > 0) {
334: /* ja[work[i1]++] = i2 + oshift; */
335: ja[work[i2]++] = i1 + oshift;
336: i2++;
337: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
338: if (nz > 0) i2 = tvc[col];
339: }
340: }
341: PetscFree(ns_col);
342: PetscFree(work);
343: PetscFree(tns);
344: PetscFree(tvc);
345: return(0);
346: }
350: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
351: {
355: Mat_CreateColInode(A,n,NULL);
356: if (!ia) return(0);
358: if (!blockcompressed) {
359: MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
360: } else if (symmetric) {
361: /* Since the indices are symmetric it does'nt matter */
362: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
363: } else {
364: MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
365: }
366: return(0);
367: }
371: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
372: {
376: if (!ia) return(0);
377: if (!blockcompressed) {
378: MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
379: } else {
380: PetscFree(*ia);
381: PetscFree(*ja);
382: }
383: return(0);
384: }
386: /* ----------------------------------------------------------- */
390: static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
391: {
392: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
393: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
394: PetscScalar *y;
395: const PetscScalar *x;
396: const MatScalar *v1,*v2,*v3,*v4,*v5;
397: PetscErrorCode ierr;
398: PetscInt i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
399: const PetscInt *idx,*ns,*ii;
401: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
402: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
403: #endif
406: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
407: node_max = a->inode.node_count;
408: ns = a->inode.size; /* Node Size array */
409: VecGetArrayRead(xx,&x);
410: VecGetArray(yy,&y);
411: idx = a->j;
412: v1 = a->a;
413: ii = a->i;
415: for (i = 0,row = 0; i< node_max; ++i) {
416: nsz = ns[i];
417: n = ii[1] - ii[0];
418: nonzerorow += (n>0)*nsz;
419: ii += nsz;
420: PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */
421: PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */
422: sz = n; /* No of non zeros in this row */
423: /* Switch on the size of Node */
424: switch (nsz) { /* Each loop in 'case' is unrolled */
425: case 1:
426: sum1 = 0.;
428: for (n = 0; n< sz-1; n+=2) {
429: i1 = idx[0]; /* The instructions are ordered to */
430: i2 = idx[1]; /* make the compiler's job easy */
431: idx += 2;
432: tmp0 = x[i1];
433: tmp1 = x[i2];
434: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
435: }
437: if (n == sz-1) { /* Take care of the last nonzero */
438: tmp0 = x[*idx++];
439: sum1 += *v1++ *tmp0;
440: }
441: y[row++]=sum1;
442: break;
443: case 2:
444: sum1 = 0.;
445: sum2 = 0.;
446: v2 = v1 + n;
448: for (n = 0; n< sz-1; n+=2) {
449: i1 = idx[0];
450: i2 = idx[1];
451: idx += 2;
452: tmp0 = x[i1];
453: tmp1 = x[i2];
454: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
455: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
456: }
457: if (n == sz-1) {
458: tmp0 = x[*idx++];
459: sum1 += *v1++ * tmp0;
460: sum2 += *v2++ * tmp0;
461: }
462: y[row++]=sum1;
463: y[row++]=sum2;
464: v1 =v2; /* Since the next block to be processed starts there*/
465: idx +=sz;
466: break;
467: case 3:
468: sum1 = 0.;
469: sum2 = 0.;
470: sum3 = 0.;
471: v2 = v1 + n;
472: v3 = v2 + n;
474: for (n = 0; n< sz-1; n+=2) {
475: i1 = idx[0];
476: i2 = idx[1];
477: idx += 2;
478: tmp0 = x[i1];
479: tmp1 = x[i2];
480: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
481: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
482: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
483: }
484: if (n == sz-1) {
485: tmp0 = x[*idx++];
486: sum1 += *v1++ * tmp0;
487: sum2 += *v2++ * tmp0;
488: sum3 += *v3++ * tmp0;
489: }
490: y[row++]=sum1;
491: y[row++]=sum2;
492: y[row++]=sum3;
493: v1 =v3; /* Since the next block to be processed starts there*/
494: idx +=2*sz;
495: break;
496: case 4:
497: sum1 = 0.;
498: sum2 = 0.;
499: sum3 = 0.;
500: sum4 = 0.;
501: v2 = v1 + n;
502: v3 = v2 + n;
503: v4 = v3 + n;
505: for (n = 0; n< sz-1; n+=2) {
506: i1 = idx[0];
507: i2 = idx[1];
508: idx += 2;
509: tmp0 = x[i1];
510: tmp1 = x[i2];
511: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
512: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
513: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
514: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
515: }
516: if (n == sz-1) {
517: tmp0 = x[*idx++];
518: sum1 += *v1++ * tmp0;
519: sum2 += *v2++ * tmp0;
520: sum3 += *v3++ * tmp0;
521: sum4 += *v4++ * tmp0;
522: }
523: y[row++]=sum1;
524: y[row++]=sum2;
525: y[row++]=sum3;
526: y[row++]=sum4;
527: v1 =v4; /* Since the next block to be processed starts there*/
528: idx +=3*sz;
529: break;
530: case 5:
531: sum1 = 0.;
532: sum2 = 0.;
533: sum3 = 0.;
534: sum4 = 0.;
535: sum5 = 0.;
536: v2 = v1 + n;
537: v3 = v2 + n;
538: v4 = v3 + n;
539: v5 = v4 + n;
541: for (n = 0; n<sz-1; n+=2) {
542: i1 = idx[0];
543: i2 = idx[1];
544: idx += 2;
545: tmp0 = x[i1];
546: tmp1 = x[i2];
547: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
548: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
549: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
550: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
551: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
552: }
553: if (n == sz-1) {
554: tmp0 = x[*idx++];
555: sum1 += *v1++ * tmp0;
556: sum2 += *v2++ * tmp0;
557: sum3 += *v3++ * tmp0;
558: sum4 += *v4++ * tmp0;
559: sum5 += *v5++ * tmp0;
560: }
561: y[row++]=sum1;
562: y[row++]=sum2;
563: y[row++]=sum3;
564: y[row++]=sum4;
565: y[row++]=sum5;
566: v1 =v5; /* Since the next block to be processed starts there */
567: idx +=4*sz;
568: break;
569: default:
570: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
571: }
572: }
573: VecRestoreArrayRead(xx,&x);
574: VecRestoreArray(yy,&y);
575: PetscLogFlops(2.0*a->nz - nonzerorow);
576: return(0);
577: }
578: /* ----------------------------------------------------------- */
579: /* Almost same code as the MatMult_SeqAIJ_Inode() */
582: static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
583: {
584: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
585: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
586: const MatScalar *v1,*v2,*v3,*v4,*v5;
587: const PetscScalar *x;
588: PetscScalar *y,*z,*zt;
589: PetscErrorCode ierr;
590: PetscInt i1,i2,n,i,row,node_max,nsz,sz;
591: const PetscInt *idx,*ns,*ii;
594: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
595: node_max = a->inode.node_count;
596: ns = a->inode.size; /* Node Size array */
598: VecGetArrayRead(xx,&x);
599: VecGetArrayPair(zz,yy,&z,&y);
600: zt = z;
602: idx = a->j;
603: v1 = a->a;
604: ii = a->i;
606: for (i = 0,row = 0; i< node_max; ++i) {
607: nsz = ns[i];
608: n = ii[1] - ii[0];
609: ii += nsz;
610: sz = n; /* No of non zeros in this row */
611: /* Switch on the size of Node */
612: switch (nsz) { /* Each loop in 'case' is unrolled */
613: case 1:
614: sum1 = *zt++;
616: for (n = 0; n< sz-1; n+=2) {
617: i1 = idx[0]; /* The instructions are ordered to */
618: i2 = idx[1]; /* make the compiler's job easy */
619: idx += 2;
620: tmp0 = x[i1];
621: tmp1 = x[i2];
622: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
623: }
625: if (n == sz-1) { /* Take care of the last nonzero */
626: tmp0 = x[*idx++];
627: sum1 += *v1++ * tmp0;
628: }
629: y[row++]=sum1;
630: break;
631: case 2:
632: sum1 = *zt++;
633: sum2 = *zt++;
634: v2 = v1 + n;
636: for (n = 0; n< sz-1; n+=2) {
637: i1 = idx[0];
638: i2 = idx[1];
639: idx += 2;
640: tmp0 = x[i1];
641: tmp1 = x[i2];
642: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
643: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
644: }
645: if (n == sz-1) {
646: tmp0 = x[*idx++];
647: sum1 += *v1++ * tmp0;
648: sum2 += *v2++ * tmp0;
649: }
650: y[row++]=sum1;
651: y[row++]=sum2;
652: v1 =v2; /* Since the next block to be processed starts there*/
653: idx +=sz;
654: break;
655: case 3:
656: sum1 = *zt++;
657: sum2 = *zt++;
658: sum3 = *zt++;
659: v2 = v1 + n;
660: v3 = v2 + n;
662: for (n = 0; n< sz-1; n+=2) {
663: i1 = idx[0];
664: i2 = idx[1];
665: idx += 2;
666: tmp0 = x[i1];
667: tmp1 = x[i2];
668: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
669: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
670: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
671: }
672: if (n == sz-1) {
673: tmp0 = x[*idx++];
674: sum1 += *v1++ * tmp0;
675: sum2 += *v2++ * tmp0;
676: sum3 += *v3++ * tmp0;
677: }
678: y[row++]=sum1;
679: y[row++]=sum2;
680: y[row++]=sum3;
681: v1 =v3; /* Since the next block to be processed starts there*/
682: idx +=2*sz;
683: break;
684: case 4:
685: sum1 = *zt++;
686: sum2 = *zt++;
687: sum3 = *zt++;
688: sum4 = *zt++;
689: v2 = v1 + n;
690: v3 = v2 + n;
691: v4 = v3 + n;
693: for (n = 0; n< sz-1; n+=2) {
694: i1 = idx[0];
695: i2 = idx[1];
696: idx += 2;
697: tmp0 = x[i1];
698: tmp1 = x[i2];
699: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
700: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
701: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
702: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
703: }
704: if (n == sz-1) {
705: tmp0 = x[*idx++];
706: sum1 += *v1++ * tmp0;
707: sum2 += *v2++ * tmp0;
708: sum3 += *v3++ * tmp0;
709: sum4 += *v4++ * tmp0;
710: }
711: y[row++]=sum1;
712: y[row++]=sum2;
713: y[row++]=sum3;
714: y[row++]=sum4;
715: v1 =v4; /* Since the next block to be processed starts there*/
716: idx +=3*sz;
717: break;
718: case 5:
719: sum1 = *zt++;
720: sum2 = *zt++;
721: sum3 = *zt++;
722: sum4 = *zt++;
723: sum5 = *zt++;
724: v2 = v1 + n;
725: v3 = v2 + n;
726: v4 = v3 + n;
727: v5 = v4 + n;
729: for (n = 0; n<sz-1; n+=2) {
730: i1 = idx[0];
731: i2 = idx[1];
732: idx += 2;
733: tmp0 = x[i1];
734: tmp1 = x[i2];
735: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
736: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
737: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
738: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
739: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
740: }
741: if (n == sz-1) {
742: tmp0 = x[*idx++];
743: sum1 += *v1++ * tmp0;
744: sum2 += *v2++ * tmp0;
745: sum3 += *v3++ * tmp0;
746: sum4 += *v4++ * tmp0;
747: sum5 += *v5++ * tmp0;
748: }
749: y[row++]=sum1;
750: y[row++]=sum2;
751: y[row++]=sum3;
752: y[row++]=sum4;
753: y[row++]=sum5;
754: v1 =v5; /* Since the next block to be processed starts there */
755: idx +=4*sz;
756: break;
757: default:
758: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
759: }
760: }
761: VecRestoreArrayRead(xx,&x);
762: VecRestoreArrayPair(zz,yy,&z,&y);
763: PetscLogFlops(2.0*a->nz);
764: return(0);
765: }
767: /* ----------------------------------------------------------- */
770: PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
771: {
772: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
773: IS iscol = a->col,isrow = a->row;
774: PetscErrorCode ierr;
775: const PetscInt *r,*c,*rout,*cout;
776: PetscInt i,j,n = A->rmap->n,nz;
777: PetscInt node_max,*ns,row,nsz,aii,i0,i1;
778: const PetscInt *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
779: PetscScalar *x,*tmp,*tmps,tmp0,tmp1;
780: PetscScalar sum1,sum2,sum3,sum4,sum5;
781: const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
782: const PetscScalar *b;
785: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
786: node_max = a->inode.node_count;
787: ns = a->inode.size; /* Node Size array */
789: VecGetArrayRead(bb,&b);
790: VecGetArray(xx,&x);
791: tmp = a->solve_work;
793: ISGetIndices(isrow,&rout); r = rout;
794: ISGetIndices(iscol,&cout); c = cout + (n-1);
796: /* forward solve the lower triangular */
797: tmps = tmp;
798: aa = a_a;
799: aj = a_j;
800: ad = a->diag;
802: for (i = 0,row = 0; i< node_max; ++i) {
803: nsz = ns[i];
804: aii = ai[row];
805: v1 = aa + aii;
806: vi = aj + aii;
807: nz = ad[row]- aii;
808: if (i < node_max-1) {
809: /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
810: * but our indexing to determine it's size could. */
811: PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
812: /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
813: PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
814: /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
815: }
817: switch (nsz) { /* Each loop in 'case' is unrolled */
818: case 1:
819: sum1 = b[*r++];
820: for (j=0; j<nz-1; j+=2) {
821: i0 = vi[0];
822: i1 = vi[1];
823: vi +=2;
824: tmp0 = tmps[i0];
825: tmp1 = tmps[i1];
826: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
827: }
828: if (j == nz-1) {
829: tmp0 = tmps[*vi++];
830: sum1 -= *v1++ *tmp0;
831: }
832: tmp[row++]=sum1;
833: break;
834: case 2:
835: sum1 = b[*r++];
836: sum2 = b[*r++];
837: v2 = aa + ai[row+1];
839: for (j=0; j<nz-1; j+=2) {
840: i0 = vi[0];
841: i1 = vi[1];
842: vi +=2;
843: tmp0 = tmps[i0];
844: tmp1 = tmps[i1];
845: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
846: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
847: }
848: if (j == nz-1) {
849: tmp0 = tmps[*vi++];
850: sum1 -= *v1++ *tmp0;
851: sum2 -= *v2++ *tmp0;
852: }
853: sum2 -= *v2++ *sum1;
854: tmp[row++]=sum1;
855: tmp[row++]=sum2;
856: break;
857: case 3:
858: sum1 = b[*r++];
859: sum2 = b[*r++];
860: sum3 = b[*r++];
861: v2 = aa + ai[row+1];
862: v3 = aa + ai[row+2];
864: for (j=0; j<nz-1; j+=2) {
865: i0 = vi[0];
866: i1 = vi[1];
867: vi +=2;
868: tmp0 = tmps[i0];
869: tmp1 = tmps[i1];
870: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
871: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
872: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
873: }
874: if (j == nz-1) {
875: tmp0 = tmps[*vi++];
876: sum1 -= *v1++ *tmp0;
877: sum2 -= *v2++ *tmp0;
878: sum3 -= *v3++ *tmp0;
879: }
880: sum2 -= *v2++ * sum1;
881: sum3 -= *v3++ * sum1;
882: sum3 -= *v3++ * sum2;
884: tmp[row++]=sum1;
885: tmp[row++]=sum2;
886: tmp[row++]=sum3;
887: break;
889: case 4:
890: sum1 = b[*r++];
891: sum2 = b[*r++];
892: sum3 = b[*r++];
893: sum4 = b[*r++];
894: v2 = aa + ai[row+1];
895: v3 = aa + ai[row+2];
896: v4 = aa + ai[row+3];
898: for (j=0; j<nz-1; j+=2) {
899: i0 = vi[0];
900: i1 = vi[1];
901: vi +=2;
902: tmp0 = tmps[i0];
903: tmp1 = tmps[i1];
904: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
905: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
906: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
907: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
908: }
909: if (j == nz-1) {
910: tmp0 = tmps[*vi++];
911: sum1 -= *v1++ *tmp0;
912: sum2 -= *v2++ *tmp0;
913: sum3 -= *v3++ *tmp0;
914: sum4 -= *v4++ *tmp0;
915: }
916: sum2 -= *v2++ * sum1;
917: sum3 -= *v3++ * sum1;
918: sum4 -= *v4++ * sum1;
919: sum3 -= *v3++ * sum2;
920: sum4 -= *v4++ * sum2;
921: sum4 -= *v4++ * sum3;
923: tmp[row++]=sum1;
924: tmp[row++]=sum2;
925: tmp[row++]=sum3;
926: tmp[row++]=sum4;
927: break;
928: case 5:
929: sum1 = b[*r++];
930: sum2 = b[*r++];
931: sum3 = b[*r++];
932: sum4 = b[*r++];
933: sum5 = b[*r++];
934: v2 = aa + ai[row+1];
935: v3 = aa + ai[row+2];
936: v4 = aa + ai[row+3];
937: v5 = aa + ai[row+4];
939: for (j=0; j<nz-1; j+=2) {
940: i0 = vi[0];
941: i1 = vi[1];
942: vi +=2;
943: tmp0 = tmps[i0];
944: tmp1 = tmps[i1];
945: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
946: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
947: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
948: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
949: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
950: }
951: if (j == nz-1) {
952: tmp0 = tmps[*vi++];
953: sum1 -= *v1++ *tmp0;
954: sum2 -= *v2++ *tmp0;
955: sum3 -= *v3++ *tmp0;
956: sum4 -= *v4++ *tmp0;
957: sum5 -= *v5++ *tmp0;
958: }
960: sum2 -= *v2++ * sum1;
961: sum3 -= *v3++ * sum1;
962: sum4 -= *v4++ * sum1;
963: sum5 -= *v5++ * sum1;
964: sum3 -= *v3++ * sum2;
965: sum4 -= *v4++ * sum2;
966: sum5 -= *v5++ * sum2;
967: sum4 -= *v4++ * sum3;
968: sum5 -= *v5++ * sum3;
969: sum5 -= *v5++ * sum4;
971: tmp[row++]=sum1;
972: tmp[row++]=sum2;
973: tmp[row++]=sum3;
974: tmp[row++]=sum4;
975: tmp[row++]=sum5;
976: break;
977: default:
978: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
979: }
980: }
981: /* backward solve the upper triangular */
982: for (i=node_max -1,row = n-1; i>=0; i--) {
983: nsz = ns[i];
984: aii = ai[row+1] -1;
985: v1 = aa + aii;
986: vi = aj + aii;
987: nz = aii- ad[row];
988: switch (nsz) { /* Each loop in 'case' is unrolled */
989: case 1:
990: sum1 = tmp[row];
992: for (j=nz; j>1; j-=2) {
993: vi -=2;
994: i0 = vi[2];
995: i1 = vi[1];
996: tmp0 = tmps[i0];
997: tmp1 = tmps[i1];
998: v1 -= 2;
999: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1000: }
1001: if (j==1) {
1002: tmp0 = tmps[*vi--];
1003: sum1 -= *v1-- * tmp0;
1004: }
1005: x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1006: break;
1007: case 2:
1008: sum1 = tmp[row];
1009: sum2 = tmp[row -1];
1010: v2 = aa + ai[row]-1;
1011: for (j=nz; j>1; j-=2) {
1012: vi -=2;
1013: i0 = vi[2];
1014: i1 = vi[1];
1015: tmp0 = tmps[i0];
1016: tmp1 = tmps[i1];
1017: v1 -= 2;
1018: v2 -= 2;
1019: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1020: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1021: }
1022: if (j==1) {
1023: tmp0 = tmps[*vi--];
1024: sum1 -= *v1-- * tmp0;
1025: sum2 -= *v2-- * tmp0;
1026: }
1028: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1029: sum2 -= *v2-- * tmp0;
1030: x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1031: break;
1032: case 3:
1033: sum1 = tmp[row];
1034: sum2 = tmp[row -1];
1035: sum3 = tmp[row -2];
1036: v2 = aa + ai[row]-1;
1037: v3 = aa + ai[row -1]-1;
1038: for (j=nz; j>1; j-=2) {
1039: vi -=2;
1040: i0 = vi[2];
1041: i1 = vi[1];
1042: tmp0 = tmps[i0];
1043: tmp1 = tmps[i1];
1044: v1 -= 2;
1045: v2 -= 2;
1046: v3 -= 2;
1047: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1048: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1049: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1050: }
1051: if (j==1) {
1052: tmp0 = tmps[*vi--];
1053: sum1 -= *v1-- * tmp0;
1054: sum2 -= *v2-- * tmp0;
1055: sum3 -= *v3-- * tmp0;
1056: }
1057: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1058: sum2 -= *v2-- * tmp0;
1059: sum3 -= *v3-- * tmp0;
1060: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1061: sum3 -= *v3-- * tmp0;
1062: x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1064: break;
1065: case 4:
1066: sum1 = tmp[row];
1067: sum2 = tmp[row -1];
1068: sum3 = tmp[row -2];
1069: sum4 = tmp[row -3];
1070: v2 = aa + ai[row]-1;
1071: v3 = aa + ai[row -1]-1;
1072: v4 = aa + ai[row -2]-1;
1074: for (j=nz; j>1; j-=2) {
1075: vi -=2;
1076: i0 = vi[2];
1077: i1 = vi[1];
1078: tmp0 = tmps[i0];
1079: tmp1 = tmps[i1];
1080: v1 -= 2;
1081: v2 -= 2;
1082: v3 -= 2;
1083: v4 -= 2;
1084: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1085: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1086: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1087: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1088: }
1089: if (j==1) {
1090: tmp0 = tmps[*vi--];
1091: sum1 -= *v1-- * tmp0;
1092: sum2 -= *v2-- * tmp0;
1093: sum3 -= *v3-- * tmp0;
1094: sum4 -= *v4-- * tmp0;
1095: }
1097: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1098: sum2 -= *v2-- * tmp0;
1099: sum3 -= *v3-- * tmp0;
1100: sum4 -= *v4-- * tmp0;
1101: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1102: sum3 -= *v3-- * tmp0;
1103: sum4 -= *v4-- * tmp0;
1104: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1105: sum4 -= *v4-- * tmp0;
1106: x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1107: break;
1108: case 5:
1109: sum1 = tmp[row];
1110: sum2 = tmp[row -1];
1111: sum3 = tmp[row -2];
1112: sum4 = tmp[row -3];
1113: sum5 = tmp[row -4];
1114: v2 = aa + ai[row]-1;
1115: v3 = aa + ai[row -1]-1;
1116: v4 = aa + ai[row -2]-1;
1117: v5 = aa + ai[row -3]-1;
1118: for (j=nz; j>1; j-=2) {
1119: vi -= 2;
1120: i0 = vi[2];
1121: i1 = vi[1];
1122: tmp0 = tmps[i0];
1123: tmp1 = tmps[i1];
1124: v1 -= 2;
1125: v2 -= 2;
1126: v3 -= 2;
1127: v4 -= 2;
1128: v5 -= 2;
1129: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1130: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1131: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1132: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1133: sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1134: }
1135: if (j==1) {
1136: tmp0 = tmps[*vi--];
1137: sum1 -= *v1-- * tmp0;
1138: sum2 -= *v2-- * tmp0;
1139: sum3 -= *v3-- * tmp0;
1140: sum4 -= *v4-- * tmp0;
1141: sum5 -= *v5-- * tmp0;
1142: }
1144: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1145: sum2 -= *v2-- * tmp0;
1146: sum3 -= *v3-- * tmp0;
1147: sum4 -= *v4-- * tmp0;
1148: sum5 -= *v5-- * tmp0;
1149: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1150: sum3 -= *v3-- * tmp0;
1151: sum4 -= *v4-- * tmp0;
1152: sum5 -= *v5-- * tmp0;
1153: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1154: sum4 -= *v4-- * tmp0;
1155: sum5 -= *v5-- * tmp0;
1156: tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1157: sum5 -= *v5-- * tmp0;
1158: x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1159: break;
1160: default:
1161: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1162: }
1163: }
1164: ISRestoreIndices(isrow,&rout);
1165: ISRestoreIndices(iscol,&cout);
1166: VecRestoreArrayRead(bb,&b);
1167: VecRestoreArray(xx,&x);
1168: PetscLogFlops(2.0*a->nz - A->cmap->n);
1169: return(0);
1170: }
1174: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1175: {
1176: Mat C =B;
1177: Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1178: IS isrow = b->row,isicol = b->icol;
1179: PetscErrorCode ierr;
1180: const PetscInt *r,*ic,*ics;
1181: const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1182: PetscInt i,j,k,nz,nzL,row,*pj;
1183: const PetscInt *ajtmp,*bjtmp;
1184: MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1185: const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1186: FactorShiftCtx sctx;
1187: const PetscInt *ddiag;
1188: PetscReal rs;
1189: MatScalar d;
1190: PetscInt inod,nodesz,node_max,col;
1191: const PetscInt *ns;
1192: PetscInt *tmp_vec1,*tmp_vec2,*nsmap;
1195: /* MatPivotSetUp(): initialize shift context sctx */
1196: PetscMemzero(&sctx,sizeof(FactorShiftCtx));
1198: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1199: ddiag = a->diag;
1200: sctx.shift_top = info->zeropivot;
1201: for (i=0; i<n; i++) {
1202: /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1203: d = (aa)[ddiag[i]];
1204: rs = -PetscAbsScalar(d) - PetscRealPart(d);
1205: v = aa+ai[i];
1206: nz = ai[i+1] - ai[i];
1207: for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1208: if (rs>sctx.shift_top) sctx.shift_top = rs;
1209: }
1210: sctx.shift_top *= 1.1;
1211: sctx.nshift_max = 5;
1212: sctx.shift_lo = 0.;
1213: sctx.shift_hi = 1.;
1214: }
1216: ISGetIndices(isrow,&r);
1217: ISGetIndices(isicol,&ic);
1219: PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);
1220: ics = ic;
1222: node_max = a->inode.node_count;
1223: ns = a->inode.size;
1224: if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1226: /* If max inode size > 4, split it into two inodes.*/
1227: /* also map the inode sizes according to the ordering */
1228: PetscMalloc1(n+1,&tmp_vec1);
1229: for (i=0,j=0; i<node_max; ++i,++j) {
1230: if (ns[i] > 4) {
1231: tmp_vec1[j] = 4;
1232: ++j;
1233: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1234: } else {
1235: tmp_vec1[j] = ns[i];
1236: }
1237: }
1238: /* Use the correct node_max */
1239: node_max = j;
1241: /* Now reorder the inode info based on mat re-ordering info */
1242: /* First create a row -> inode_size_array_index map */
1243: PetscMalloc1(n+1,&nsmap);
1244: PetscMalloc1(node_max+1,&tmp_vec2);
1245: for (i=0,row=0; i<node_max; i++) {
1246: nodesz = tmp_vec1[i];
1247: for (j=0; j<nodesz; j++,row++) {
1248: nsmap[row] = i;
1249: }
1250: }
1251: /* Using nsmap, create a reordered ns structure */
1252: for (i=0,j=0; i< node_max; i++) {
1253: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1254: tmp_vec2[i] = nodesz;
1255: j += nodesz;
1256: }
1257: PetscFree(nsmap);
1258: PetscFree(tmp_vec1);
1260: /* Now use the correct ns */
1261: ns = tmp_vec2;
1263: do {
1264: sctx.newshift = PETSC_FALSE;
1265: /* Now loop over each block-row, and do the factorization */
1266: for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1267: nodesz = ns[inod];
1269: switch (nodesz) {
1270: case 1:
1271: /*----------*/
1272: /* zero rtmp1 */
1273: /* L part */
1274: nz = bi[i+1] - bi[i];
1275: bjtmp = bj + bi[i];
1276: for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1278: /* U part */
1279: nz = bdiag[i]-bdiag[i+1];
1280: bjtmp = bj + bdiag[i+1]+1;
1281: for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1283: /* load in initial (unfactored row) */
1284: nz = ai[r[i]+1] - ai[r[i]];
1285: ajtmp = aj + ai[r[i]];
1286: v = aa + ai[r[i]];
1287: for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1289: /* ZeropivotApply() */
1290: rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1292: /* elimination */
1293: bjtmp = bj + bi[i];
1294: row = *bjtmp++;
1295: nzL = bi[i+1] - bi[i];
1296: for (k=0; k < nzL; k++) {
1297: pc = rtmp1 + row;
1298: if (*pc != 0.0) {
1299: pv = b->a + bdiag[row];
1300: mul1 = *pc * (*pv);
1301: *pc = mul1;
1302: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1303: pv = b->a + bdiag[row+1]+1;
1304: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1305: for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1306: PetscLogFlops(1+2*nz);
1307: }
1308: row = *bjtmp++;
1309: }
1311: /* finished row so stick it into b->a */
1312: rs = 0.0;
1313: /* L part */
1314: pv = b->a + bi[i];
1315: pj = b->j + bi[i];
1316: nz = bi[i+1] - bi[i];
1317: for (j=0; j<nz; j++) {
1318: pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1319: }
1321: /* U part */
1322: pv = b->a + bdiag[i+1]+1;
1323: pj = b->j + bdiag[i+1]+1;
1324: nz = bdiag[i] - bdiag[i+1]-1;
1325: for (j=0; j<nz; j++) {
1326: pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1327: }
1329: /* Check zero pivot */
1330: sctx.rs = rs;
1331: sctx.pv = rtmp1[i];
1332: MatPivotCheck(A,info,&sctx,i);
1333: if (sctx.newshift) break;
1335: /* Mark diagonal and invert diagonal for simplier triangular solves */
1336: pv = b->a + bdiag[i];
1337: *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1338: break;
1340: case 2:
1341: /*----------*/
1342: /* zero rtmp1 and rtmp2 */
1343: /* L part */
1344: nz = bi[i+1] - bi[i];
1345: bjtmp = bj + bi[i];
1346: for (j=0; j<nz; j++) {
1347: col = bjtmp[j];
1348: rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1349: }
1351: /* U part */
1352: nz = bdiag[i]-bdiag[i+1];
1353: bjtmp = bj + bdiag[i+1]+1;
1354: for (j=0; j<nz; j++) {
1355: col = bjtmp[j];
1356: rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1357: }
1359: /* load in initial (unfactored row) */
1360: nz = ai[r[i]+1] - ai[r[i]];
1361: ajtmp = aj + ai[r[i]];
1362: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1363: for (j=0; j<nz; j++) {
1364: col = ics[ajtmp[j]];
1365: rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1366: }
1367: /* ZeropivotApply(): shift the diagonal of the matrix */
1368: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1370: /* elimination */
1371: bjtmp = bj + bi[i];
1372: row = *bjtmp++; /* pivot row */
1373: nzL = bi[i+1] - bi[i];
1374: for (k=0; k < nzL; k++) {
1375: pc1 = rtmp1 + row;
1376: pc2 = rtmp2 + row;
1377: if (*pc1 != 0.0 || *pc2 != 0.0) {
1378: pv = b->a + bdiag[row];
1379: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1380: *pc1 = mul1; *pc2 = mul2;
1382: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1383: pv = b->a + bdiag[row+1]+1;
1384: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1385: for (j=0; j<nz; j++) {
1386: col = pj[j];
1387: rtmp1[col] -= mul1 * pv[j];
1388: rtmp2[col] -= mul2 * pv[j];
1389: }
1390: PetscLogFlops(2+4*nz);
1391: }
1392: row = *bjtmp++;
1393: }
1395: /* finished row i; check zero pivot, then stick row i into b->a */
1396: rs = 0.0;
1397: /* L part */
1398: pc1 = b->a + bi[i];
1399: pj = b->j + bi[i];
1400: nz = bi[i+1] - bi[i];
1401: for (j=0; j<nz; j++) {
1402: col = pj[j];
1403: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1404: }
1405: /* U part */
1406: pc1 = b->a + bdiag[i+1]+1;
1407: pj = b->j + bdiag[i+1]+1;
1408: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1409: for (j=0; j<nz; j++) {
1410: col = pj[j];
1411: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1412: }
1414: sctx.rs = rs;
1415: sctx.pv = rtmp1[i];
1416: MatPivotCheck(A,info,&sctx,i);
1417: if (sctx.newshift) break;
1418: pc1 = b->a + bdiag[i]; /* Mark diagonal */
1419: *pc1 = 1.0/sctx.pv;
1421: /* Now take care of diagonal 2x2 block. */
1422: pc2 = rtmp2 + i;
1423: if (*pc2 != 0.0) {
1424: mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1425: *pc2 = mul1; /* insert L entry */
1426: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1427: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1428: for (j=0; j<nz; j++) {
1429: col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1430: }
1431: PetscLogFlops(1+2*nz);
1432: }
1434: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1435: rs = 0.0;
1436: /* L part */
1437: pc2 = b->a + bi[i+1];
1438: pj = b->j + bi[i+1];
1439: nz = bi[i+2] - bi[i+1];
1440: for (j=0; j<nz; j++) {
1441: col = pj[j];
1442: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1443: }
1444: /* U part */
1445: pc2 = b->a + bdiag[i+2]+1;
1446: pj = b->j + bdiag[i+2]+1;
1447: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1448: for (j=0; j<nz; j++) {
1449: col = pj[j];
1450: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1451: }
1453: sctx.rs = rs;
1454: sctx.pv = rtmp2[i+1];
1455: MatPivotCheck(A,info,&sctx,i+1);
1456: if (sctx.newshift) break;
1457: pc2 = b->a + bdiag[i+1];
1458: *pc2 = 1.0/sctx.pv;
1459: break;
1461: case 3:
1462: /*----------*/
1463: /* zero rtmp */
1464: /* L part */
1465: nz = bi[i+1] - bi[i];
1466: bjtmp = bj + bi[i];
1467: for (j=0; j<nz; j++) {
1468: col = bjtmp[j];
1469: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1470: }
1472: /* U part */
1473: nz = bdiag[i]-bdiag[i+1];
1474: bjtmp = bj + bdiag[i+1]+1;
1475: for (j=0; j<nz; j++) {
1476: col = bjtmp[j];
1477: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1478: }
1480: /* load in initial (unfactored row) */
1481: nz = ai[r[i]+1] - ai[r[i]];
1482: ajtmp = aj + ai[r[i]];
1483: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1484: for (j=0; j<nz; j++) {
1485: col = ics[ajtmp[j]];
1486: rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1487: }
1488: /* ZeropivotApply(): shift the diagonal of the matrix */
1489: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1491: /* elimination */
1492: bjtmp = bj + bi[i];
1493: row = *bjtmp++; /* pivot row */
1494: nzL = bi[i+1] - bi[i];
1495: for (k=0; k < nzL; k++) {
1496: pc1 = rtmp1 + row;
1497: pc2 = rtmp2 + row;
1498: pc3 = rtmp3 + row;
1499: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1500: pv = b->a + bdiag[row];
1501: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1502: *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1504: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1505: pv = b->a + bdiag[row+1]+1;
1506: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1507: for (j=0; j<nz; j++) {
1508: col = pj[j];
1509: rtmp1[col] -= mul1 * pv[j];
1510: rtmp2[col] -= mul2 * pv[j];
1511: rtmp3[col] -= mul3 * pv[j];
1512: }
1513: PetscLogFlops(3+6*nz);
1514: }
1515: row = *bjtmp++;
1516: }
1518: /* finished row i; check zero pivot, then stick row i into b->a */
1519: rs = 0.0;
1520: /* L part */
1521: pc1 = b->a + bi[i];
1522: pj = b->j + bi[i];
1523: nz = bi[i+1] - bi[i];
1524: for (j=0; j<nz; j++) {
1525: col = pj[j];
1526: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1527: }
1528: /* U part */
1529: pc1 = b->a + bdiag[i+1]+1;
1530: pj = b->j + bdiag[i+1]+1;
1531: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1532: for (j=0; j<nz; j++) {
1533: col = pj[j];
1534: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1535: }
1537: sctx.rs = rs;
1538: sctx.pv = rtmp1[i];
1539: MatPivotCheck(A,info,&sctx,i);
1540: if (sctx.newshift) break;
1541: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1542: *pc1 = 1.0/sctx.pv;
1544: /* Now take care of 1st column of diagonal 3x3 block. */
1545: pc2 = rtmp2 + i;
1546: pc3 = rtmp3 + i;
1547: if (*pc2 != 0.0 || *pc3 != 0.0) {
1548: mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1549: mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1550: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1551: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1552: for (j=0; j<nz; j++) {
1553: col = pj[j];
1554: rtmp2[col] -= mul2 * rtmp1[col];
1555: rtmp3[col] -= mul3 * rtmp1[col];
1556: }
1557: PetscLogFlops(2+4*nz);
1558: }
1560: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1561: rs = 0.0;
1562: /* L part */
1563: pc2 = b->a + bi[i+1];
1564: pj = b->j + bi[i+1];
1565: nz = bi[i+2] - bi[i+1];
1566: for (j=0; j<nz; j++) {
1567: col = pj[j];
1568: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1569: }
1570: /* U part */
1571: pc2 = b->a + bdiag[i+2]+1;
1572: pj = b->j + bdiag[i+2]+1;
1573: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1574: for (j=0; j<nz; j++) {
1575: col = pj[j];
1576: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1577: }
1579: sctx.rs = rs;
1580: sctx.pv = rtmp2[i+1];
1581: MatPivotCheck(A,info,&sctx,i+1);
1582: if (sctx.newshift) break;
1583: pc2 = b->a + bdiag[i+1];
1584: *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1586: /* Now take care of 2nd column of diagonal 3x3 block. */
1587: pc3 = rtmp3 + i+1;
1588: if (*pc3 != 0.0) {
1589: mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1590: pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */
1591: nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1592: for (j=0; j<nz; j++) {
1593: col = pj[j];
1594: rtmp3[col] -= mul3 * rtmp2[col];
1595: }
1596: PetscLogFlops(1+2*nz);
1597: }
1599: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1600: rs = 0.0;
1601: /* L part */
1602: pc3 = b->a + bi[i+2];
1603: pj = b->j + bi[i+2];
1604: nz = bi[i+3] - bi[i+2];
1605: for (j=0; j<nz; j++) {
1606: col = pj[j];
1607: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1608: }
1609: /* U part */
1610: pc3 = b->a + bdiag[i+3]+1;
1611: pj = b->j + bdiag[i+3]+1;
1612: nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1613: for (j=0; j<nz; j++) {
1614: col = pj[j];
1615: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1616: }
1618: sctx.rs = rs;
1619: sctx.pv = rtmp3[i+2];
1620: MatPivotCheck(A,info,&sctx,i+2);
1621: if (sctx.newshift) break;
1622: pc3 = b->a + bdiag[i+2];
1623: *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1624: break;
1625: case 4:
1626: /*----------*/
1627: /* zero rtmp */
1628: /* L part */
1629: nz = bi[i+1] - bi[i];
1630: bjtmp = bj + bi[i];
1631: for (j=0; j<nz; j++) {
1632: col = bjtmp[j];
1633: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1634: }
1636: /* U part */
1637: nz = bdiag[i]-bdiag[i+1];
1638: bjtmp = bj + bdiag[i+1]+1;
1639: for (j=0; j<nz; j++) {
1640: col = bjtmp[j];
1641: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1642: }
1644: /* load in initial (unfactored row) */
1645: nz = ai[r[i]+1] - ai[r[i]];
1646: ajtmp = aj + ai[r[i]];
1647: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1648: for (j=0; j<nz; j++) {
1649: col = ics[ajtmp[j]];
1650: rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1651: }
1652: /* ZeropivotApply(): shift the diagonal of the matrix */
1653: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1655: /* elimination */
1656: bjtmp = bj + bi[i];
1657: row = *bjtmp++; /* pivot row */
1658: nzL = bi[i+1] - bi[i];
1659: for (k=0; k < nzL; k++) {
1660: pc1 = rtmp1 + row;
1661: pc2 = rtmp2 + row;
1662: pc3 = rtmp3 + row;
1663: pc4 = rtmp4 + row;
1664: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1665: pv = b->a + bdiag[row];
1666: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1667: *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1669: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1670: pv = b->a + bdiag[row+1]+1;
1671: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1672: for (j=0; j<nz; j++) {
1673: col = pj[j];
1674: rtmp1[col] -= mul1 * pv[j];
1675: rtmp2[col] -= mul2 * pv[j];
1676: rtmp3[col] -= mul3 * pv[j];
1677: rtmp4[col] -= mul4 * pv[j];
1678: }
1679: PetscLogFlops(4+8*nz);
1680: }
1681: row = *bjtmp++;
1682: }
1684: /* finished row i; check zero pivot, then stick row i into b->a */
1685: rs = 0.0;
1686: /* L part */
1687: pc1 = b->a + bi[i];
1688: pj = b->j + bi[i];
1689: nz = bi[i+1] - bi[i];
1690: for (j=0; j<nz; j++) {
1691: col = pj[j];
1692: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1693: }
1694: /* U part */
1695: pc1 = b->a + bdiag[i+1]+1;
1696: pj = b->j + bdiag[i+1]+1;
1697: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1698: for (j=0; j<nz; j++) {
1699: col = pj[j];
1700: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1701: }
1703: sctx.rs = rs;
1704: sctx.pv = rtmp1[i];
1705: MatPivotCheck(A,info,&sctx,i);
1706: if (sctx.newshift) break;
1707: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1708: *pc1 = 1.0/sctx.pv;
1710: /* Now take care of 1st column of diagonal 4x4 block. */
1711: pc2 = rtmp2 + i;
1712: pc3 = rtmp3 + i;
1713: pc4 = rtmp4 + i;
1714: if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1715: mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1716: mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1717: mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1718: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1719: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1720: for (j=0; j<nz; j++) {
1721: col = pj[j];
1722: rtmp2[col] -= mul2 * rtmp1[col];
1723: rtmp3[col] -= mul3 * rtmp1[col];
1724: rtmp4[col] -= mul4 * rtmp1[col];
1725: }
1726: PetscLogFlops(3+6*nz);
1727: }
1729: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1730: rs = 0.0;
1731: /* L part */
1732: pc2 = b->a + bi[i+1];
1733: pj = b->j + bi[i+1];
1734: nz = bi[i+2] - bi[i+1];
1735: for (j=0; j<nz; j++) {
1736: col = pj[j];
1737: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1738: }
1739: /* U part */
1740: pc2 = b->a + bdiag[i+2]+1;
1741: pj = b->j + bdiag[i+2]+1;
1742: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1743: for (j=0; j<nz; j++) {
1744: col = pj[j];
1745: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1746: }
1748: sctx.rs = rs;
1749: sctx.pv = rtmp2[i+1];
1750: MatPivotCheck(A,info,&sctx,i+1);
1751: if (sctx.newshift) break;
1752: pc2 = b->a + bdiag[i+1];
1753: *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1755: /* Now take care of 2nd column of diagonal 4x4 block. */
1756: pc3 = rtmp3 + i+1;
1757: pc4 = rtmp4 + i+1;
1758: if (*pc3 != 0.0 || *pc4 != 0.0) {
1759: mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1760: mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1761: pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */
1762: nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1763: for (j=0; j<nz; j++) {
1764: col = pj[j];
1765: rtmp3[col] -= mul3 * rtmp2[col];
1766: rtmp4[col] -= mul4 * rtmp2[col];
1767: }
1768: PetscLogFlops(4*nz);
1769: }
1771: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1772: rs = 0.0;
1773: /* L part */
1774: pc3 = b->a + bi[i+2];
1775: pj = b->j + bi[i+2];
1776: nz = bi[i+3] - bi[i+2];
1777: for (j=0; j<nz; j++) {
1778: col = pj[j];
1779: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1780: }
1781: /* U part */
1782: pc3 = b->a + bdiag[i+3]+1;
1783: pj = b->j + bdiag[i+3]+1;
1784: nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1785: for (j=0; j<nz; j++) {
1786: col = pj[j];
1787: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1788: }
1790: sctx.rs = rs;
1791: sctx.pv = rtmp3[i+2];
1792: MatPivotCheck(A,info,&sctx,i+2);
1793: if (sctx.newshift) break;
1794: pc3 = b->a + bdiag[i+2];
1795: *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1797: /* Now take care of 3rd column of diagonal 4x4 block. */
1798: pc4 = rtmp4 + i+2;
1799: if (*pc4 != 0.0) {
1800: mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1801: pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */
1802: nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1803: for (j=0; j<nz; j++) {
1804: col = pj[j];
1805: rtmp4[col] -= mul4 * rtmp3[col];
1806: }
1807: PetscLogFlops(1+2*nz);
1808: }
1810: /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1811: rs = 0.0;
1812: /* L part */
1813: pc4 = b->a + bi[i+3];
1814: pj = b->j + bi[i+3];
1815: nz = bi[i+4] - bi[i+3];
1816: for (j=0; j<nz; j++) {
1817: col = pj[j];
1818: pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1819: }
1820: /* U part */
1821: pc4 = b->a + bdiag[i+4]+1;
1822: pj = b->j + bdiag[i+4]+1;
1823: nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1824: for (j=0; j<nz; j++) {
1825: col = pj[j];
1826: pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1827: }
1829: sctx.rs = rs;
1830: sctx.pv = rtmp4[i+3];
1831: MatPivotCheck(A,info,&sctx,i+3);
1832: if (sctx.newshift) break;
1833: pc4 = b->a + bdiag[i+3];
1834: *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1835: break;
1837: default:
1838: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1839: }
1840: if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1841: i += nodesz; /* Update the row */
1842: }
1844: /* MatPivotRefine() */
1845: if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1846: /*
1847: * if no shift in this attempt & shifting & started shifting & can refine,
1848: * then try lower shift
1849: */
1850: sctx.shift_hi = sctx.shift_fraction;
1851: sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1852: sctx.shift_amount = sctx.shift_fraction * sctx.shift_top;
1853: sctx.newshift = PETSC_TRUE;
1854: sctx.nshift++;
1855: }
1856: } while (sctx.newshift);
1858: PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);
1859: PetscFree(tmp_vec2);
1860: ISRestoreIndices(isicol,&ic);
1861: ISRestoreIndices(isrow,&r);
1863: if (b->inode.size) {
1864: C->ops->solve = MatSolve_SeqAIJ_Inode;
1865: } else {
1866: C->ops->solve = MatSolve_SeqAIJ;
1867: }
1868: C->ops->solveadd = MatSolveAdd_SeqAIJ;
1869: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ;
1870: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1871: C->ops->matsolve = MatMatSolve_SeqAIJ;
1872: C->assembled = PETSC_TRUE;
1873: C->preallocated = PETSC_TRUE;
1875: PetscLogFlops(C->cmap->n);
1877: /* MatShiftView(A,info,&sctx) */
1878: if (sctx.nshift) {
1879: if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1880: PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
1881: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1882: PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
1883: } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1884: PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);
1885: }
1886: }
1887: return(0);
1888: }
1892: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1893: {
1894: Mat C = B;
1895: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1896: IS iscol = b->col,isrow = b->row,isicol = b->icol;
1897: PetscErrorCode ierr;
1898: const PetscInt *r,*ic,*c,*ics;
1899: PetscInt n = A->rmap->n,*bi = b->i;
1900: PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1901: PetscInt i,j,idx,*bd = b->diag,node_max,nodesz;
1902: PetscInt *ai = a->i,*aj = a->j;
1903: PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1904: PetscScalar mul1,mul2,mul3,tmp;
1905: MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1906: const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1907: PetscReal rs=0.0;
1908: FactorShiftCtx sctx;
1911: sctx.shift_top = 0;
1912: sctx.nshift_max = 0;
1913: sctx.shift_lo = 0;
1914: sctx.shift_hi = 0;
1915: sctx.shift_fraction = 0;
1917: /* if both shift schemes are chosen by user, only use info->shiftpd */
1918: if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1919: sctx.shift_top = 0;
1920: for (i=0; i<n; i++) {
1921: /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1922: rs = 0.0;
1923: ajtmp = aj + ai[i];
1924: rtmp1 = aa + ai[i];
1925: nz = ai[i+1] - ai[i];
1926: for (j=0; j<nz; j++) {
1927: if (*ajtmp != i) {
1928: rs += PetscAbsScalar(*rtmp1++);
1929: } else {
1930: rs -= PetscRealPart(*rtmp1++);
1931: }
1932: ajtmp++;
1933: }
1934: if (rs>sctx.shift_top) sctx.shift_top = rs;
1935: }
1936: if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1937: sctx.shift_top *= 1.1;
1938: sctx.nshift_max = 5;
1939: sctx.shift_lo = 0.;
1940: sctx.shift_hi = 1.;
1941: }
1942: sctx.shift_amount = 0;
1943: sctx.nshift = 0;
1945: ISGetIndices(isrow,&r);
1946: ISGetIndices(iscol,&c);
1947: ISGetIndices(isicol,&ic);
1948: PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);
1949: ics = ic;
1951: node_max = a->inode.node_count;
1952: ns = a->inode.size;
1953: if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1955: /* If max inode size > 3, split it into two inodes.*/
1956: /* also map the inode sizes according to the ordering */
1957: PetscMalloc1(n+1,&tmp_vec1);
1958: for (i=0,j=0; i<node_max; ++i,++j) {
1959: if (ns[i]>3) {
1960: tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */
1961: ++j;
1962: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1963: } else {
1964: tmp_vec1[j] = ns[i];
1965: }
1966: }
1967: /* Use the correct node_max */
1968: node_max = j;
1970: /* Now reorder the inode info based on mat re-ordering info */
1971: /* First create a row -> inode_size_array_index map */
1972: PetscMalloc1(n+1,&nsmap);
1973: PetscMalloc1(node_max+1,&tmp_vec2);
1974: for (i=0,row=0; i<node_max; i++) {
1975: nodesz = tmp_vec1[i];
1976: for (j=0; j<nodesz; j++,row++) {
1977: nsmap[row] = i;
1978: }
1979: }
1980: /* Using nsmap, create a reordered ns structure */
1981: for (i=0,j=0; i< node_max; i++) {
1982: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1983: tmp_vec2[i] = nodesz;
1984: j += nodesz;
1985: }
1986: PetscFree(nsmap);
1987: PetscFree(tmp_vec1);
1988: /* Now use the correct ns */
1989: ns = tmp_vec2;
1991: do {
1992: sctx.newshift = PETSC_FALSE;
1993: /* Now loop over each block-row, and do the factorization */
1994: for (i=0,row=0; i<node_max; i++) {
1995: nodesz = ns[i];
1996: nz = bi[row+1] - bi[row];
1997: bjtmp = bj + bi[row];
1999: switch (nodesz) {
2000: case 1:
2001: for (j=0; j<nz; j++) {
2002: idx = bjtmp[j];
2003: rtmp11[idx] = 0.0;
2004: }
2006: /* load in initial (unfactored row) */
2007: idx = r[row];
2008: nz_tmp = ai[idx+1] - ai[idx];
2009: ajtmp = aj + ai[idx];
2010: v1 = aa + ai[idx];
2012: for (j=0; j<nz_tmp; j++) {
2013: idx = ics[ajtmp[j]];
2014: rtmp11[idx] = v1[j];
2015: }
2016: rtmp11[ics[r[row]]] += sctx.shift_amount;
2018: prow = *bjtmp++;
2019: while (prow < row) {
2020: pc1 = rtmp11 + prow;
2021: if (*pc1 != 0.0) {
2022: pv = ba + bd[prow];
2023: pj = nbj + bd[prow];
2024: mul1 = *pc1 * *pv++;
2025: *pc1 = mul1;
2026: nz_tmp = bi[prow+1] - bd[prow] - 1;
2027: PetscLogFlops(1+2*nz_tmp);
2028: for (j=0; j<nz_tmp; j++) {
2029: tmp = pv[j];
2030: idx = pj[j];
2031: rtmp11[idx] -= mul1 * tmp;
2032: }
2033: }
2034: prow = *bjtmp++;
2035: }
2036: pj = bj + bi[row];
2037: pc1 = ba + bi[row];
2039: sctx.pv = rtmp11[row];
2040: rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2041: rs = 0.0;
2042: for (j=0; j<nz; j++) {
2043: idx = pj[j];
2044: pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2045: if (idx != row) rs += PetscAbsScalar(pc1[j]);
2046: }
2047: sctx.rs = rs;
2048: MatPivotCheck(A,info,&sctx,row);
2049: if (sctx.newshift) goto endofwhile;
2050: break;
2052: case 2:
2053: for (j=0; j<nz; j++) {
2054: idx = bjtmp[j];
2055: rtmp11[idx] = 0.0;
2056: rtmp22[idx] = 0.0;
2057: }
2059: /* load in initial (unfactored row) */
2060: idx = r[row];
2061: nz_tmp = ai[idx+1] - ai[idx];
2062: ajtmp = aj + ai[idx];
2063: v1 = aa + ai[idx];
2064: v2 = aa + ai[idx+1];
2065: for (j=0; j<nz_tmp; j++) {
2066: idx = ics[ajtmp[j]];
2067: rtmp11[idx] = v1[j];
2068: rtmp22[idx] = v2[j];
2069: }
2070: rtmp11[ics[r[row]]] += sctx.shift_amount;
2071: rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2073: prow = *bjtmp++;
2074: while (prow < row) {
2075: pc1 = rtmp11 + prow;
2076: pc2 = rtmp22 + prow;
2077: if (*pc1 != 0.0 || *pc2 != 0.0) {
2078: pv = ba + bd[prow];
2079: pj = nbj + bd[prow];
2080: mul1 = *pc1 * *pv;
2081: mul2 = *pc2 * *pv;
2082: ++pv;
2083: *pc1 = mul1;
2084: *pc2 = mul2;
2086: nz_tmp = bi[prow+1] - bd[prow] - 1;
2087: for (j=0; j<nz_tmp; j++) {
2088: tmp = pv[j];
2089: idx = pj[j];
2090: rtmp11[idx] -= mul1 * tmp;
2091: rtmp22[idx] -= mul2 * tmp;
2092: }
2093: PetscLogFlops(2+4*nz_tmp);
2094: }
2095: prow = *bjtmp++;
2096: }
2098: /* Now take care of diagonal 2x2 block. Note: prow = row here */
2099: pc1 = rtmp11 + prow;
2100: pc2 = rtmp22 + prow;
2102: sctx.pv = *pc1;
2103: pj = bj + bi[prow];
2104: rs = 0.0;
2105: for (j=0; j<nz; j++) {
2106: idx = pj[j];
2107: if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2108: }
2109: sctx.rs = rs;
2110: MatPivotCheck(A,info,&sctx,row);
2111: if (sctx.newshift) goto endofwhile;
2113: if (*pc2 != 0.0) {
2114: pj = nbj + bd[prow];
2115: mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2116: *pc2 = mul2;
2117: nz_tmp = bi[prow+1] - bd[prow] - 1;
2118: for (j=0; j<nz_tmp; j++) {
2119: idx = pj[j];
2120: tmp = rtmp11[idx];
2121: rtmp22[idx] -= mul2 * tmp;
2122: }
2123: PetscLogFlops(1+2*nz_tmp);
2124: }
2126: pj = bj + bi[row];
2127: pc1 = ba + bi[row];
2128: pc2 = ba + bi[row+1];
2130: sctx.pv = rtmp22[row+1];
2131: rs = 0.0;
2132: rtmp11[row] = 1.0/rtmp11[row];
2133: rtmp22[row+1] = 1.0/rtmp22[row+1];
2134: /* copy row entries from dense representation to sparse */
2135: for (j=0; j<nz; j++) {
2136: idx = pj[j];
2137: pc1[j] = rtmp11[idx];
2138: pc2[j] = rtmp22[idx];
2139: if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2140: }
2141: sctx.rs = rs;
2142: MatPivotCheck(A,info,&sctx,row+1);
2143: if (sctx.newshift) goto endofwhile;
2144: break;
2146: case 3:
2147: for (j=0; j<nz; j++) {
2148: idx = bjtmp[j];
2149: rtmp11[idx] = 0.0;
2150: rtmp22[idx] = 0.0;
2151: rtmp33[idx] = 0.0;
2152: }
2153: /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2154: idx = r[row];
2155: nz_tmp = ai[idx+1] - ai[idx];
2156: ajtmp = aj + ai[idx];
2157: v1 = aa + ai[idx];
2158: v2 = aa + ai[idx+1];
2159: v3 = aa + ai[idx+2];
2160: for (j=0; j<nz_tmp; j++) {
2161: idx = ics[ajtmp[j]];
2162: rtmp11[idx] = v1[j];
2163: rtmp22[idx] = v2[j];
2164: rtmp33[idx] = v3[j];
2165: }
2166: rtmp11[ics[r[row]]] += sctx.shift_amount;
2167: rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2168: rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2170: /* loop over all pivot row blocks above this row block */
2171: prow = *bjtmp++;
2172: while (prow < row) {
2173: pc1 = rtmp11 + prow;
2174: pc2 = rtmp22 + prow;
2175: pc3 = rtmp33 + prow;
2176: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2177: pv = ba + bd[prow];
2178: pj = nbj + bd[prow];
2179: mul1 = *pc1 * *pv;
2180: mul2 = *pc2 * *pv;
2181: mul3 = *pc3 * *pv;
2182: ++pv;
2183: *pc1 = mul1;
2184: *pc2 = mul2;
2185: *pc3 = mul3;
2187: nz_tmp = bi[prow+1] - bd[prow] - 1;
2188: /* update this row based on pivot row */
2189: for (j=0; j<nz_tmp; j++) {
2190: tmp = pv[j];
2191: idx = pj[j];
2192: rtmp11[idx] -= mul1 * tmp;
2193: rtmp22[idx] -= mul2 * tmp;
2194: rtmp33[idx] -= mul3 * tmp;
2195: }
2196: PetscLogFlops(3+6*nz_tmp);
2197: }
2198: prow = *bjtmp++;
2199: }
2201: /* Now take care of diagonal 3x3 block in this set of rows */
2202: /* note: prow = row here */
2203: pc1 = rtmp11 + prow;
2204: pc2 = rtmp22 + prow;
2205: pc3 = rtmp33 + prow;
2207: sctx.pv = *pc1;
2208: pj = bj + bi[prow];
2209: rs = 0.0;
2210: for (j=0; j<nz; j++) {
2211: idx = pj[j];
2212: if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2213: }
2214: sctx.rs = rs;
2215: MatPivotCheck(A,info,&sctx,row);
2216: if (sctx.newshift) goto endofwhile;
2218: if (*pc2 != 0.0 || *pc3 != 0.0) {
2219: mul2 = (*pc2)/(*pc1);
2220: mul3 = (*pc3)/(*pc1);
2221: *pc2 = mul2;
2222: *pc3 = mul3;
2223: nz_tmp = bi[prow+1] - bd[prow] - 1;
2224: pj = nbj + bd[prow];
2225: for (j=0; j<nz_tmp; j++) {
2226: idx = pj[j];
2227: tmp = rtmp11[idx];
2228: rtmp22[idx] -= mul2 * tmp;
2229: rtmp33[idx] -= mul3 * tmp;
2230: }
2231: PetscLogFlops(2+4*nz_tmp);
2232: }
2233: ++prow;
2235: pc2 = rtmp22 + prow;
2236: pc3 = rtmp33 + prow;
2237: sctx.pv = *pc2;
2238: pj = bj + bi[prow];
2239: rs = 0.0;
2240: for (j=0; j<nz; j++) {
2241: idx = pj[j];
2242: if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2243: }
2244: sctx.rs = rs;
2245: MatPivotCheck(A,info,&sctx,row+1);
2246: if (sctx.newshift) goto endofwhile;
2248: if (*pc3 != 0.0) {
2249: mul3 = (*pc3)/(*pc2);
2250: *pc3 = mul3;
2251: pj = nbj + bd[prow];
2252: nz_tmp = bi[prow+1] - bd[prow] - 1;
2253: for (j=0; j<nz_tmp; j++) {
2254: idx = pj[j];
2255: tmp = rtmp22[idx];
2256: rtmp33[idx] -= mul3 * tmp;
2257: }
2258: PetscLogFlops(1+2*nz_tmp);
2259: }
2261: pj = bj + bi[row];
2262: pc1 = ba + bi[row];
2263: pc2 = ba + bi[row+1];
2264: pc3 = ba + bi[row+2];
2266: sctx.pv = rtmp33[row+2];
2267: rs = 0.0;
2268: rtmp11[row] = 1.0/rtmp11[row];
2269: rtmp22[row+1] = 1.0/rtmp22[row+1];
2270: rtmp33[row+2] = 1.0/rtmp33[row+2];
2271: /* copy row entries from dense representation to sparse */
2272: for (j=0; j<nz; j++) {
2273: idx = pj[j];
2274: pc1[j] = rtmp11[idx];
2275: pc2[j] = rtmp22[idx];
2276: pc3[j] = rtmp33[idx];
2277: if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2278: }
2280: sctx.rs = rs;
2281: MatPivotCheck(A,info,&sctx,row+2);
2282: if (sctx.newshift) goto endofwhile;
2283: break;
2285: default:
2286: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2287: }
2288: row += nodesz; /* Update the row */
2289: }
2290: endofwhile:;
2291: } while (sctx.newshift);
2292: PetscFree3(rtmp11,rtmp22,rtmp33);
2293: PetscFree(tmp_vec2);
2294: ISRestoreIndices(isicol,&ic);
2295: ISRestoreIndices(isrow,&r);
2296: ISRestoreIndices(iscol,&c);
2298: (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2299: /* do not set solve add, since MatSolve_Inode + Add is faster */
2300: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace;
2301: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2302: C->assembled = PETSC_TRUE;
2303: C->preallocated = PETSC_TRUE;
2304: if (sctx.nshift) {
2305: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2306: PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
2307: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2308: PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
2309: }
2310: }
2311: PetscLogFlops(C->cmap->n);
2312: MatSeqAIJCheckInode(C);
2313: return(0);
2314: }
2317: /* ----------------------------------------------------------- */
2320: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2321: {
2322: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2323: IS iscol = a->col,isrow = a->row;
2324: PetscErrorCode ierr;
2325: const PetscInt *r,*c,*rout,*cout;
2326: PetscInt i,j,n = A->rmap->n;
2327: PetscInt node_max,row,nsz,aii,i0,i1,nz;
2328: const PetscInt *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2329: PetscScalar *x,*tmp,*tmps,tmp0,tmp1;
2330: PetscScalar sum1,sum2,sum3,sum4,sum5;
2331: const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2332: const PetscScalar *b;
2335: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2336: node_max = a->inode.node_count;
2337: ns = a->inode.size; /* Node Size array */
2339: VecGetArrayRead(bb,&b);
2340: VecGetArray(xx,&x);
2341: tmp = a->solve_work;
2343: ISGetIndices(isrow,&rout); r = rout;
2344: ISGetIndices(iscol,&cout); c = cout;
2346: /* forward solve the lower triangular */
2347: tmps = tmp;
2348: aa = a_a;
2349: aj = a_j;
2350: ad = a->diag;
2352: for (i = 0,row = 0; i< node_max; ++i) {
2353: nsz = ns[i];
2354: aii = ai[row];
2355: v1 = aa + aii;
2356: vi = aj + aii;
2357: nz = ai[row+1]- ai[row];
2359: if (i < node_max-1) {
2360: /* Prefetch the indices for the next block */
2361: PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2362: /* Prefetch the data for the next block */
2363: PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2364: }
2366: switch (nsz) { /* Each loop in 'case' is unrolled */
2367: case 1:
2368: sum1 = b[r[row]];
2369: for (j=0; j<nz-1; j+=2) {
2370: i0 = vi[j];
2371: i1 = vi[j+1];
2372: tmp0 = tmps[i0];
2373: tmp1 = tmps[i1];
2374: sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2375: }
2376: if (j == nz-1) {
2377: tmp0 = tmps[vi[j]];
2378: sum1 -= v1[j]*tmp0;
2379: }
2380: tmp[row++]=sum1;
2381: break;
2382: case 2:
2383: sum1 = b[r[row]];
2384: sum2 = b[r[row+1]];
2385: v2 = aa + ai[row+1];
2387: for (j=0; j<nz-1; j+=2) {
2388: i0 = vi[j];
2389: i1 = vi[j+1];
2390: tmp0 = tmps[i0];
2391: tmp1 = tmps[i1];
2392: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2393: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2394: }
2395: if (j == nz-1) {
2396: tmp0 = tmps[vi[j]];
2397: sum1 -= v1[j] *tmp0;
2398: sum2 -= v2[j] *tmp0;
2399: }
2400: sum2 -= v2[nz] * sum1;
2401: tmp[row++]=sum1;
2402: tmp[row++]=sum2;
2403: break;
2404: case 3:
2405: sum1 = b[r[row]];
2406: sum2 = b[r[row+1]];
2407: sum3 = b[r[row+2]];
2408: v2 = aa + ai[row+1];
2409: v3 = aa + ai[row+2];
2411: for (j=0; j<nz-1; j+=2) {
2412: i0 = vi[j];
2413: i1 = vi[j+1];
2414: tmp0 = tmps[i0];
2415: tmp1 = tmps[i1];
2416: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2417: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2418: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2419: }
2420: if (j == nz-1) {
2421: tmp0 = tmps[vi[j]];
2422: sum1 -= v1[j] *tmp0;
2423: sum2 -= v2[j] *tmp0;
2424: sum3 -= v3[j] *tmp0;
2425: }
2426: sum2 -= v2[nz] * sum1;
2427: sum3 -= v3[nz] * sum1;
2428: sum3 -= v3[nz+1] * sum2;
2429: tmp[row++]=sum1;
2430: tmp[row++]=sum2;
2431: tmp[row++]=sum3;
2432: break;
2434: case 4:
2435: sum1 = b[r[row]];
2436: sum2 = b[r[row+1]];
2437: sum3 = b[r[row+2]];
2438: sum4 = b[r[row+3]];
2439: v2 = aa + ai[row+1];
2440: v3 = aa + ai[row+2];
2441: v4 = aa + ai[row+3];
2443: for (j=0; j<nz-1; j+=2) {
2444: i0 = vi[j];
2445: i1 = vi[j+1];
2446: tmp0 = tmps[i0];
2447: tmp1 = tmps[i1];
2448: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2449: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2450: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2451: sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2452: }
2453: if (j == nz-1) {
2454: tmp0 = tmps[vi[j]];
2455: sum1 -= v1[j] *tmp0;
2456: sum2 -= v2[j] *tmp0;
2457: sum3 -= v3[j] *tmp0;
2458: sum4 -= v4[j] *tmp0;
2459: }
2460: sum2 -= v2[nz] * sum1;
2461: sum3 -= v3[nz] * sum1;
2462: sum4 -= v4[nz] * sum1;
2463: sum3 -= v3[nz+1] * sum2;
2464: sum4 -= v4[nz+1] * sum2;
2465: sum4 -= v4[nz+2] * sum3;
2467: tmp[row++]=sum1;
2468: tmp[row++]=sum2;
2469: tmp[row++]=sum3;
2470: tmp[row++]=sum4;
2471: break;
2472: case 5:
2473: sum1 = b[r[row]];
2474: sum2 = b[r[row+1]];
2475: sum3 = b[r[row+2]];
2476: sum4 = b[r[row+3]];
2477: sum5 = b[r[row+4]];
2478: v2 = aa + ai[row+1];
2479: v3 = aa + ai[row+2];
2480: v4 = aa + ai[row+3];
2481: v5 = aa + ai[row+4];
2483: for (j=0; j<nz-1; j+=2) {
2484: i0 = vi[j];
2485: i1 = vi[j+1];
2486: tmp0 = tmps[i0];
2487: tmp1 = tmps[i1];
2488: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2489: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2490: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2491: sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2492: sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2493: }
2494: if (j == nz-1) {
2495: tmp0 = tmps[vi[j]];
2496: sum1 -= v1[j] *tmp0;
2497: sum2 -= v2[j] *tmp0;
2498: sum3 -= v3[j] *tmp0;
2499: sum4 -= v4[j] *tmp0;
2500: sum5 -= v5[j] *tmp0;
2501: }
2503: sum2 -= v2[nz] * sum1;
2504: sum3 -= v3[nz] * sum1;
2505: sum4 -= v4[nz] * sum1;
2506: sum5 -= v5[nz] * sum1;
2507: sum3 -= v3[nz+1] * sum2;
2508: sum4 -= v4[nz+1] * sum2;
2509: sum5 -= v5[nz+1] * sum2;
2510: sum4 -= v4[nz+2] * sum3;
2511: sum5 -= v5[nz+2] * sum3;
2512: sum5 -= v5[nz+3] * sum4;
2514: tmp[row++]=sum1;
2515: tmp[row++]=sum2;
2516: tmp[row++]=sum3;
2517: tmp[row++]=sum4;
2518: tmp[row++]=sum5;
2519: break;
2520: default:
2521: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2522: }
2523: }
2524: /* backward solve the upper triangular */
2525: for (i=node_max -1,row = n-1; i>=0; i--) {
2526: nsz = ns[i];
2527: aii = ad[row+1] + 1;
2528: v1 = aa + aii;
2529: vi = aj + aii;
2530: nz = ad[row]- ad[row+1] - 1;
2532: if (i > 0) {
2533: /* Prefetch the indices for the next block */
2534: PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2535: /* Prefetch the data for the next block */
2536: PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2537: }
2539: switch (nsz) { /* Each loop in 'case' is unrolled */
2540: case 1:
2541: sum1 = tmp[row];
2543: for (j=0; j<nz-1; j+=2) {
2544: i0 = vi[j];
2545: i1 = vi[j+1];
2546: tmp0 = tmps[i0];
2547: tmp1 = tmps[i1];
2548: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2549: }
2550: if (j == nz-1) {
2551: tmp0 = tmps[vi[j]];
2552: sum1 -= v1[j]*tmp0;
2553: }
2554: x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2555: break;
2556: case 2:
2557: sum1 = tmp[row];
2558: sum2 = tmp[row-1];
2559: v2 = aa + ad[row] + 1;
2560: for (j=0; j<nz-1; j+=2) {
2561: i0 = vi[j];
2562: i1 = vi[j+1];
2563: tmp0 = tmps[i0];
2564: tmp1 = tmps[i1];
2565: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2566: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2567: }
2568: if (j == nz-1) {
2569: tmp0 = tmps[vi[j]];
2570: sum1 -= v1[j]* tmp0;
2571: sum2 -= v2[j+1]* tmp0;
2572: }
2574: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2575: sum2 -= v2[0] * tmp0;
2576: x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2577: break;
2578: case 3:
2579: sum1 = tmp[row];
2580: sum2 = tmp[row -1];
2581: sum3 = tmp[row -2];
2582: v2 = aa + ad[row] + 1;
2583: v3 = aa + ad[row -1] + 1;
2584: for (j=0; j<nz-1; j+=2) {
2585: i0 = vi[j];
2586: i1 = vi[j+1];
2587: tmp0 = tmps[i0];
2588: tmp1 = tmps[i1];
2589: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2590: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2591: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2592: }
2593: if (j== nz-1) {
2594: tmp0 = tmps[vi[j]];
2595: sum1 -= v1[j] * tmp0;
2596: sum2 -= v2[j+1] * tmp0;
2597: sum3 -= v3[j+2] * tmp0;
2598: }
2599: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2600: sum2 -= v2[0]* tmp0;
2601: sum3 -= v3[1] * tmp0;
2602: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2603: sum3 -= v3[0]* tmp0;
2604: x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2606: break;
2607: case 4:
2608: sum1 = tmp[row];
2609: sum2 = tmp[row -1];
2610: sum3 = tmp[row -2];
2611: sum4 = tmp[row -3];
2612: v2 = aa + ad[row]+1;
2613: v3 = aa + ad[row -1]+1;
2614: v4 = aa + ad[row -2]+1;
2616: for (j=0; j<nz-1; j+=2) {
2617: i0 = vi[j];
2618: i1 = vi[j+1];
2619: tmp0 = tmps[i0];
2620: tmp1 = tmps[i1];
2621: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2622: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2623: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2624: sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2625: }
2626: if (j== nz-1) {
2627: tmp0 = tmps[vi[j]];
2628: sum1 -= v1[j] * tmp0;
2629: sum2 -= v2[j+1] * tmp0;
2630: sum3 -= v3[j+2] * tmp0;
2631: sum4 -= v4[j+3] * tmp0;
2632: }
2634: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2635: sum2 -= v2[0] * tmp0;
2636: sum3 -= v3[1] * tmp0;
2637: sum4 -= v4[2] * tmp0;
2638: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2639: sum3 -= v3[0] * tmp0;
2640: sum4 -= v4[1] * tmp0;
2641: tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2642: sum4 -= v4[0] * tmp0;
2643: x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2644: break;
2645: case 5:
2646: sum1 = tmp[row];
2647: sum2 = tmp[row -1];
2648: sum3 = tmp[row -2];
2649: sum4 = tmp[row -3];
2650: sum5 = tmp[row -4];
2651: v2 = aa + ad[row]+1;
2652: v3 = aa + ad[row -1]+1;
2653: v4 = aa + ad[row -2]+1;
2654: v5 = aa + ad[row -3]+1;
2655: for (j=0; j<nz-1; j+=2) {
2656: i0 = vi[j];
2657: i1 = vi[j+1];
2658: tmp0 = tmps[i0];
2659: tmp1 = tmps[i1];
2660: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2661: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2662: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2663: sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2664: sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2665: }
2666: if (j==nz-1) {
2667: tmp0 = tmps[vi[j]];
2668: sum1 -= v1[j] * tmp0;
2669: sum2 -= v2[j+1] * tmp0;
2670: sum3 -= v3[j+2] * tmp0;
2671: sum4 -= v4[j+3] * tmp0;
2672: sum5 -= v5[j+4] * tmp0;
2673: }
2675: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2676: sum2 -= v2[0] * tmp0;
2677: sum3 -= v3[1] * tmp0;
2678: sum4 -= v4[2] * tmp0;
2679: sum5 -= v5[3] * tmp0;
2680: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2681: sum3 -= v3[0] * tmp0;
2682: sum4 -= v4[1] * tmp0;
2683: sum5 -= v5[2] * tmp0;
2684: tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2685: sum4 -= v4[0] * tmp0;
2686: sum5 -= v5[1] * tmp0;
2687: tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2688: sum5 -= v5[0] * tmp0;
2689: x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2690: break;
2691: default:
2692: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2693: }
2694: }
2695: ISRestoreIndices(isrow,&rout);
2696: ISRestoreIndices(iscol,&cout);
2697: VecRestoreArrayRead(bb,&b);
2698: VecRestoreArray(xx,&x);
2699: PetscLogFlops(2.0*a->nz - A->cmap->n);
2700: return(0);
2701: }
2704: /*
2705: Makes a longer coloring[] array and calls the usual code with that
2706: */
2709: PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2710: {
2711: Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data;
2712: PetscErrorCode ierr;
2713: PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2714: PetscInt *colorused,i;
2715: ISColoringValue *newcolor;
2718: PetscMalloc1(n+1,&newcolor);
2719: /* loop over inodes, marking a color for each column*/
2720: row = 0;
2721: for (i=0; i<m; i++) {
2722: for (j=0; j<ns[i]; j++) {
2723: newcolor[row++] = coloring[i] + j*ncolors;
2724: }
2725: }
2727: /* eliminate unneeded colors */
2728: PetscCalloc1(5*ncolors,&colorused);
2729: for (i=0; i<n; i++) {
2730: colorused[newcolor[i]] = 1;
2731: }
2733: for (i=1; i<5*ncolors; i++) {
2734: colorused[i] += colorused[i-1];
2735: }
2736: ncolors = colorused[5*ncolors-1];
2737: for (i=0; i<n; i++) {
2738: newcolor[i] = colorused[newcolor[i]]-1;
2739: }
2740: PetscFree(colorused);
2741: ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);
2742: PetscFree(coloring);
2743: return(0);
2744: }
2746: #include <petsc/private/kernels/blockinvert.h>
2750: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2751: {
2752: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2753: PetscScalar sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2754: MatScalar *ibdiag,*bdiag,work[25],*t;
2755: PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2756: const MatScalar *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2757: const PetscScalar *xb, *b;
2758: PetscReal zeropivot = 1.0e-15, shift = 0.0;
2759: PetscErrorCode ierr;
2760: PetscInt n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2761: PetscInt sz,k,ipvt[5];
2762: const PetscInt *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;
2765: if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2766: if (fshift == -1.0) fshift = 0.0; /* negative fshift indicates do not error on zero diagonal; this code never errors on zero diagonal */
2767: if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2769: if (!a->inode.ibdiagvalid) {
2770: if (!a->inode.ibdiag) {
2771: /* calculate space needed for diagonal blocks */
2772: for (i=0; i<m; i++) {
2773: cnt += sizes[i]*sizes[i];
2774: }
2775: a->inode.bdiagsize = cnt;
2777: PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);
2778: }
2780: /* copy over the diagonal blocks and invert them */
2781: ibdiag = a->inode.ibdiag;
2782: bdiag = a->inode.bdiag;
2783: cnt = 0;
2784: for (i=0, row = 0; i<m; i++) {
2785: for (j=0; j<sizes[i]; j++) {
2786: for (k=0; k<sizes[i]; k++) {
2787: bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2788: }
2789: }
2790: PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));
2792: switch (sizes[i]) {
2793: case 1:
2794: /* Create matrix data structure */
2795: if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2796: ibdiag[cnt] = 1.0/ibdiag[cnt];
2797: break;
2798: case 2:
2799: PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift);
2800: break;
2801: case 3:
2802: PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift);
2803: break;
2804: case 4:
2805: PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift);
2806: break;
2807: case 5:
2808: PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);
2809: break;
2810: default:
2811: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2812: }
2813: cnt += sizes[i]*sizes[i];
2814: row += sizes[i];
2815: }
2816: a->inode.ibdiagvalid = PETSC_TRUE;
2817: }
2818: ibdiag = a->inode.ibdiag;
2819: bdiag = a->inode.bdiag;
2820: t = a->inode.ssor_work;
2822: VecGetArray(xx,&x);
2823: VecGetArrayRead(bb,&b);
2824: /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2825: if (flag & SOR_ZERO_INITIAL_GUESS) {
2826: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2828: for (i=0, row=0; i<m; i++) {
2829: sz = diag[row] - ii[row];
2830: v1 = a->a + ii[row];
2831: idx = a->j + ii[row];
2833: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2834: switch (sizes[i]) {
2835: case 1:
2837: sum1 = b[row];
2838: for (n = 0; n<sz-1; n+=2) {
2839: i1 = idx[0];
2840: i2 = idx[1];
2841: idx += 2;
2842: tmp0 = x[i1];
2843: tmp1 = x[i2];
2844: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2845: }
2847: if (n == sz-1) {
2848: tmp0 = x[*idx];
2849: sum1 -= *v1 * tmp0;
2850: }
2851: t[row] = sum1;
2852: x[row++] = sum1*(*ibdiag++);
2853: break;
2854: case 2:
2855: v2 = a->a + ii[row+1];
2856: sum1 = b[row];
2857: sum2 = b[row+1];
2858: for (n = 0; n<sz-1; n+=2) {
2859: i1 = idx[0];
2860: i2 = idx[1];
2861: idx += 2;
2862: tmp0 = x[i1];
2863: tmp1 = x[i2];
2864: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2865: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2866: }
2868: if (n == sz-1) {
2869: tmp0 = x[*idx];
2870: sum1 -= v1[0] * tmp0;
2871: sum2 -= v2[0] * tmp0;
2872: }
2873: t[row] = sum1;
2874: t[row+1] = sum2;
2875: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2876: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2877: ibdiag += 4;
2878: break;
2879: case 3:
2880: v2 = a->a + ii[row+1];
2881: v3 = a->a + ii[row+2];
2882: sum1 = b[row];
2883: sum2 = b[row+1];
2884: sum3 = b[row+2];
2885: for (n = 0; n<sz-1; n+=2) {
2886: i1 = idx[0];
2887: i2 = idx[1];
2888: idx += 2;
2889: tmp0 = x[i1];
2890: tmp1 = x[i2];
2891: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2892: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2893: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2894: }
2896: if (n == sz-1) {
2897: tmp0 = x[*idx];
2898: sum1 -= v1[0] * tmp0;
2899: sum2 -= v2[0] * tmp0;
2900: sum3 -= v3[0] * tmp0;
2901: }
2902: t[row] = sum1;
2903: t[row+1] = sum2;
2904: t[row+2] = sum3;
2905: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2906: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2907: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2908: ibdiag += 9;
2909: break;
2910: case 4:
2911: v2 = a->a + ii[row+1];
2912: v3 = a->a + ii[row+2];
2913: v4 = a->a + ii[row+3];
2914: sum1 = b[row];
2915: sum2 = b[row+1];
2916: sum3 = b[row+2];
2917: sum4 = b[row+3];
2918: for (n = 0; n<sz-1; n+=2) {
2919: i1 = idx[0];
2920: i2 = idx[1];
2921: idx += 2;
2922: tmp0 = x[i1];
2923: tmp1 = x[i2];
2924: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2925: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2926: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2927: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2928: }
2930: if (n == sz-1) {
2931: tmp0 = x[*idx];
2932: sum1 -= v1[0] * tmp0;
2933: sum2 -= v2[0] * tmp0;
2934: sum3 -= v3[0] * tmp0;
2935: sum4 -= v4[0] * tmp0;
2936: }
2937: t[row] = sum1;
2938: t[row+1] = sum2;
2939: t[row+2] = sum3;
2940: t[row+3] = sum4;
2941: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2942: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2943: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2944: x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2945: ibdiag += 16;
2946: break;
2947: case 5:
2948: v2 = a->a + ii[row+1];
2949: v3 = a->a + ii[row+2];
2950: v4 = a->a + ii[row+3];
2951: v5 = a->a + ii[row+4];
2952: sum1 = b[row];
2953: sum2 = b[row+1];
2954: sum3 = b[row+2];
2955: sum4 = b[row+3];
2956: sum5 = b[row+4];
2957: for (n = 0; n<sz-1; n+=2) {
2958: i1 = idx[0];
2959: i2 = idx[1];
2960: idx += 2;
2961: tmp0 = x[i1];
2962: tmp1 = x[i2];
2963: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2964: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2965: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2966: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2967: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2968: }
2970: if (n == sz-1) {
2971: tmp0 = x[*idx];
2972: sum1 -= v1[0] * tmp0;
2973: sum2 -= v2[0] * tmp0;
2974: sum3 -= v3[0] * tmp0;
2975: sum4 -= v4[0] * tmp0;
2976: sum5 -= v5[0] * tmp0;
2977: }
2978: t[row] = sum1;
2979: t[row+1] = sum2;
2980: t[row+2] = sum3;
2981: t[row+3] = sum4;
2982: t[row+4] = sum5;
2983: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2984: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2985: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2986: x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2987: x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2988: ibdiag += 25;
2989: break;
2990: default:
2991: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2992: }
2993: }
2995: xb = t;
2996: PetscLogFlops(a->nz);
2997: } else xb = b;
2998: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3000: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3001: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3002: ibdiag -= sizes[i]*sizes[i];
3003: sz = ii[row+1] - diag[row] - 1;
3004: v1 = a->a + diag[row] + 1;
3005: idx = a->j + diag[row] + 1;
3007: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3008: switch (sizes[i]) {
3009: case 1:
3011: sum1 = xb[row];
3012: for (n = 0; n<sz-1; n+=2) {
3013: i1 = idx[0];
3014: i2 = idx[1];
3015: idx += 2;
3016: tmp0 = x[i1];
3017: tmp1 = x[i2];
3018: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3019: }
3021: if (n == sz-1) {
3022: tmp0 = x[*idx];
3023: sum1 -= *v1*tmp0;
3024: }
3025: x[row--] = sum1*(*ibdiag);
3026: break;
3028: case 2:
3030: sum1 = xb[row];
3031: sum2 = xb[row-1];
3032: /* note that sum1 is associated with the second of the two rows */
3033: v2 = a->a + diag[row-1] + 2;
3034: for (n = 0; n<sz-1; n+=2) {
3035: i1 = idx[0];
3036: i2 = idx[1];
3037: idx += 2;
3038: tmp0 = x[i1];
3039: tmp1 = x[i2];
3040: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3041: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3042: }
3044: if (n == sz-1) {
3045: tmp0 = x[*idx];
3046: sum1 -= *v1*tmp0;
3047: sum2 -= *v2*tmp0;
3048: }
3049: x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3050: x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3051: break;
3052: case 3:
3054: sum1 = xb[row];
3055: sum2 = xb[row-1];
3056: sum3 = xb[row-2];
3057: v2 = a->a + diag[row-1] + 2;
3058: v3 = a->a + diag[row-2] + 3;
3059: for (n = 0; n<sz-1; n+=2) {
3060: i1 = idx[0];
3061: i2 = idx[1];
3062: idx += 2;
3063: tmp0 = x[i1];
3064: tmp1 = x[i2];
3065: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3066: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3067: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3068: }
3070: if (n == sz-1) {
3071: tmp0 = x[*idx];
3072: sum1 -= *v1*tmp0;
3073: sum2 -= *v2*tmp0;
3074: sum3 -= *v3*tmp0;
3075: }
3076: x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3077: x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3078: x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3079: break;
3080: case 4:
3082: sum1 = xb[row];
3083: sum2 = xb[row-1];
3084: sum3 = xb[row-2];
3085: sum4 = xb[row-3];
3086: v2 = a->a + diag[row-1] + 2;
3087: v3 = a->a + diag[row-2] + 3;
3088: v4 = a->a + diag[row-3] + 4;
3089: for (n = 0; n<sz-1; n+=2) {
3090: i1 = idx[0];
3091: i2 = idx[1];
3092: idx += 2;
3093: tmp0 = x[i1];
3094: tmp1 = x[i2];
3095: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3096: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3097: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3098: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3099: }
3101: if (n == sz-1) {
3102: tmp0 = x[*idx];
3103: sum1 -= *v1*tmp0;
3104: sum2 -= *v2*tmp0;
3105: sum3 -= *v3*tmp0;
3106: sum4 -= *v4*tmp0;
3107: }
3108: x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3109: x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3110: x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3111: x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3112: break;
3113: case 5:
3115: sum1 = xb[row];
3116: sum2 = xb[row-1];
3117: sum3 = xb[row-2];
3118: sum4 = xb[row-3];
3119: sum5 = xb[row-4];
3120: v2 = a->a + diag[row-1] + 2;
3121: v3 = a->a + diag[row-2] + 3;
3122: v4 = a->a + diag[row-3] + 4;
3123: v5 = a->a + diag[row-4] + 5;
3124: for (n = 0; n<sz-1; n+=2) {
3125: i1 = idx[0];
3126: i2 = idx[1];
3127: idx += 2;
3128: tmp0 = x[i1];
3129: tmp1 = x[i2];
3130: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3131: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3132: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3133: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3134: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3135: }
3137: if (n == sz-1) {
3138: tmp0 = x[*idx];
3139: sum1 -= *v1*tmp0;
3140: sum2 -= *v2*tmp0;
3141: sum3 -= *v3*tmp0;
3142: sum4 -= *v4*tmp0;
3143: sum5 -= *v5*tmp0;
3144: }
3145: x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3146: x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3147: x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3148: x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3149: x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3150: break;
3151: default:
3152: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3153: }
3154: }
3156: PetscLogFlops(a->nz);
3157: }
3158: its--;
3159: }
3160: while (its--) {
3162: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3163: for (i=0, row=0, ibdiag = a->inode.ibdiag;
3164: i<m;
3165: row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {
3167: sz = diag[row] - ii[row];
3168: v1 = a->a + ii[row];
3169: idx = a->j + ii[row];
3170: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3171: switch (sizes[i]) {
3172: case 1:
3173: sum1 = b[row];
3174: for (n = 0; n<sz-1; n+=2) {
3175: i1 = idx[0];
3176: i2 = idx[1];
3177: idx += 2;
3178: tmp0 = x[i1];
3179: tmp1 = x[i2];
3180: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3181: }
3182: if (n == sz-1) {
3183: tmp0 = x[*idx++];
3184: sum1 -= *v1 * tmp0;
3185: v1++;
3186: }
3187: t[row] = sum1;
3188: sz = ii[row+1] - diag[row] - 1;
3189: idx = a->j + diag[row] + 1;
3190: v1 += 1;
3191: for (n = 0; n<sz-1; n+=2) {
3192: i1 = idx[0];
3193: i2 = idx[1];
3194: idx += 2;
3195: tmp0 = x[i1];
3196: tmp1 = x[i2];
3197: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3198: }
3199: if (n == sz-1) {
3200: tmp0 = x[*idx++];
3201: sum1 -= *v1 * tmp0;
3202: }
3203: /* in MatSOR_SeqAIJ this line would be
3204: *
3205: * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3206: *
3207: * but omega == 1, so this becomes
3208: *
3209: * x[row] = sum1*(*ibdiag++);
3210: *
3211: */
3212: x[row] = sum1*(*ibdiag);
3213: break;
3214: case 2:
3215: v2 = a->a + ii[row+1];
3216: sum1 = b[row];
3217: sum2 = b[row+1];
3218: for (n = 0; n<sz-1; n+=2) {
3219: i1 = idx[0];
3220: i2 = idx[1];
3221: idx += 2;
3222: tmp0 = x[i1];
3223: tmp1 = x[i2];
3224: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3225: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3226: }
3227: if (n == sz-1) {
3228: tmp0 = x[*idx++];
3229: sum1 -= v1[0] * tmp0;
3230: sum2 -= v2[0] * tmp0;
3231: v1++; v2++;
3232: }
3233: t[row] = sum1;
3234: t[row+1] = sum2;
3235: sz = ii[row+1] - diag[row] - 2;
3236: idx = a->j + diag[row] + 2;
3237: v1 += 2;
3238: v2 += 2;
3239: for (n = 0; n<sz-1; n+=2) {
3240: i1 = idx[0];
3241: i2 = idx[1];
3242: idx += 2;
3243: tmp0 = x[i1];
3244: tmp1 = x[i2];
3245: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3246: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3247: }
3248: if (n == sz-1) {
3249: tmp0 = x[*idx];
3250: sum1 -= v1[0] * tmp0;
3251: sum2 -= v2[0] * tmp0;
3252: }
3253: x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3254: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3255: break;
3256: case 3:
3257: v2 = a->a + ii[row+1];
3258: v3 = a->a + ii[row+2];
3259: sum1 = b[row];
3260: sum2 = b[row+1];
3261: sum3 = b[row+2];
3262: for (n = 0; n<sz-1; n+=2) {
3263: i1 = idx[0];
3264: i2 = idx[1];
3265: idx += 2;
3266: tmp0 = x[i1];
3267: tmp1 = x[i2];
3268: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3269: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3270: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3271: }
3272: if (n == sz-1) {
3273: tmp0 = x[*idx++];
3274: sum1 -= v1[0] * tmp0;
3275: sum2 -= v2[0] * tmp0;
3276: sum3 -= v3[0] * tmp0;
3277: v1++; v2++; v3++;
3278: }
3279: t[row] = sum1;
3280: t[row+1] = sum2;
3281: t[row+2] = sum3;
3282: sz = ii[row+1] - diag[row] - 3;
3283: idx = a->j + diag[row] + 3;
3284: v1 += 3;
3285: v2 += 3;
3286: v3 += 3;
3287: for (n = 0; n<sz-1; n+=2) {
3288: i1 = idx[0];
3289: i2 = idx[1];
3290: idx += 2;
3291: tmp0 = x[i1];
3292: tmp1 = x[i2];
3293: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3294: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3295: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3296: }
3297: if (n == sz-1) {
3298: tmp0 = x[*idx];
3299: sum1 -= v1[0] * tmp0;
3300: sum2 -= v2[0] * tmp0;
3301: sum3 -= v3[0] * tmp0;
3302: }
3303: x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3304: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3305: x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3306: break;
3307: case 4:
3308: v2 = a->a + ii[row+1];
3309: v3 = a->a + ii[row+2];
3310: v4 = a->a + ii[row+3];
3311: sum1 = b[row];
3312: sum2 = b[row+1];
3313: sum3 = b[row+2];
3314: sum4 = b[row+3];
3315: for (n = 0; n<sz-1; n+=2) {
3316: i1 = idx[0];
3317: i2 = idx[1];
3318: idx += 2;
3319: tmp0 = x[i1];
3320: tmp1 = x[i2];
3321: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3322: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3323: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3324: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3325: }
3326: if (n == sz-1) {
3327: tmp0 = x[*idx++];
3328: sum1 -= v1[0] * tmp0;
3329: sum2 -= v2[0] * tmp0;
3330: sum3 -= v3[0] * tmp0;
3331: sum4 -= v4[0] * tmp0;
3332: v1++; v2++; v3++; v4++;
3333: }
3334: t[row] = sum1;
3335: t[row+1] = sum2;
3336: t[row+2] = sum3;
3337: t[row+3] = sum4;
3338: sz = ii[row+1] - diag[row] - 4;
3339: idx = a->j + diag[row] + 4;
3340: v1 += 4;
3341: v2 += 4;
3342: v3 += 4;
3343: v4 += 4;
3344: for (n = 0; n<sz-1; n+=2) {
3345: i1 = idx[0];
3346: i2 = idx[1];
3347: idx += 2;
3348: tmp0 = x[i1];
3349: tmp1 = x[i2];
3350: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3351: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3352: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3353: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3354: }
3355: if (n == sz-1) {
3356: tmp0 = x[*idx];
3357: sum1 -= v1[0] * tmp0;
3358: sum2 -= v2[0] * tmp0;
3359: sum3 -= v3[0] * tmp0;
3360: sum4 -= v4[0] * tmp0;
3361: }
3362: x[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3363: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3364: x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3365: x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3366: break;
3367: case 5:
3368: v2 = a->a + ii[row+1];
3369: v3 = a->a + ii[row+2];
3370: v4 = a->a + ii[row+3];
3371: v5 = a->a + ii[row+4];
3372: sum1 = b[row];
3373: sum2 = b[row+1];
3374: sum3 = b[row+2];
3375: sum4 = b[row+3];
3376: sum5 = b[row+4];
3377: for (n = 0; n<sz-1; n+=2) {
3378: i1 = idx[0];
3379: i2 = idx[1];
3380: idx += 2;
3381: tmp0 = x[i1];
3382: tmp1 = x[i2];
3383: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3384: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3385: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3386: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3387: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3388: }
3389: if (n == sz-1) {
3390: tmp0 = x[*idx++];
3391: sum1 -= v1[0] * tmp0;
3392: sum2 -= v2[0] * tmp0;
3393: sum3 -= v3[0] * tmp0;
3394: sum4 -= v4[0] * tmp0;
3395: sum5 -= v5[0] * tmp0;
3396: v1++; v2++; v3++; v4++; v5++;
3397: }
3398: t[row] = sum1;
3399: t[row+1] = sum2;
3400: t[row+2] = sum3;
3401: t[row+3] = sum4;
3402: t[row+4] = sum5;
3403: sz = ii[row+1] - diag[row] - 5;
3404: idx = a->j + diag[row] + 5;
3405: v1 += 5;
3406: v2 += 5;
3407: v3 += 5;
3408: v4 += 5;
3409: v5 += 5;
3410: for (n = 0; n<sz-1; n+=2) {
3411: i1 = idx[0];
3412: i2 = idx[1];
3413: idx += 2;
3414: tmp0 = x[i1];
3415: tmp1 = x[i2];
3416: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3417: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3418: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3419: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3420: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3421: }
3422: if (n == sz-1) {
3423: tmp0 = x[*idx];
3424: sum1 -= v1[0] * tmp0;
3425: sum2 -= v2[0] * tmp0;
3426: sum3 -= v3[0] * tmp0;
3427: sum4 -= v4[0] * tmp0;
3428: sum5 -= v5[0] * tmp0;
3429: }
3430: x[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3431: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3432: x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3433: x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3434: x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3435: break;
3436: default:
3437: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3438: }
3439: }
3440: xb = t;
3441: PetscLogFlops(2.0*a->nz); /* undercounts diag inverse */
3442: } else xb = b;
3444: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3446: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3447: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3448: ibdiag -= sizes[i]*sizes[i];
3450: /* set RHS */
3451: if (xb == b) {
3452: /* whole (old way) */
3453: sz = ii[row+1] - ii[row];
3454: idx = a->j + ii[row];
3455: switch (sizes[i]) {
3456: case 5:
3457: v5 = a->a + ii[row-4];
3458: case 4: /* fall through */
3459: v4 = a->a + ii[row-3];
3460: case 3:
3461: v3 = a->a + ii[row-2];
3462: case 2:
3463: v2 = a->a + ii[row-1];
3464: case 1:
3465: v1 = a->a + ii[row];
3466: break;
3467: default:
3468: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3469: }
3470: } else {
3471: /* upper, no diag */
3472: sz = ii[row+1] - diag[row] - 1;
3473: idx = a->j + diag[row] + 1;
3474: switch (sizes[i]) {
3475: case 5:
3476: v5 = a->a + diag[row-4] + 5;
3477: case 4: /* fall through */
3478: v4 = a->a + diag[row-3] + 4;
3479: case 3:
3480: v3 = a->a + diag[row-2] + 3;
3481: case 2:
3482: v2 = a->a + diag[row-1] + 2;
3483: case 1:
3484: v1 = a->a + diag[row] + 1;
3485: }
3486: }
3487: /* set sum */
3488: switch (sizes[i]) {
3489: case 5:
3490: sum5 = xb[row-4];
3491: case 4: /* fall through */
3492: sum4 = xb[row-3];
3493: case 3:
3494: sum3 = xb[row-2];
3495: case 2:
3496: sum2 = xb[row-1];
3497: case 1:
3498: /* note that sum1 is associated with the last row */
3499: sum1 = xb[row];
3500: }
3501: /* do sums */
3502: for (n = 0; n<sz-1; n+=2) {
3503: i1 = idx[0];
3504: i2 = idx[1];
3505: idx += 2;
3506: tmp0 = x[i1];
3507: tmp1 = x[i2];
3508: switch (sizes[i]) {
3509: case 5:
3510: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3511: case 4: /* fall through */
3512: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3513: case 3:
3514: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3515: case 2:
3516: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3517: case 1:
3518: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3519: }
3520: }
3521: /* ragged edge */
3522: if (n == sz-1) {
3523: tmp0 = x[*idx];
3524: switch (sizes[i]) {
3525: case 5:
3526: sum5 -= *v5*tmp0;
3527: case 4: /* fall through */
3528: sum4 -= *v4*tmp0;
3529: case 3:
3530: sum3 -= *v3*tmp0;
3531: case 2:
3532: sum2 -= *v2*tmp0;
3533: case 1:
3534: sum1 -= *v1*tmp0;
3535: }
3536: }
3537: /* update */
3538: if (xb == b) {
3539: /* whole (old way) w/ diag */
3540: switch (sizes[i]) {
3541: case 5:
3542: x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3543: x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3544: x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3545: x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3546: x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3547: break;
3548: case 4:
3549: x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3550: x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3551: x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3552: x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3553: break;
3554: case 3:
3555: x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3556: x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3557: x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3558: break;
3559: case 2:
3560: x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3561: x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3562: break;
3563: case 1:
3564: x[row--] += sum1*(*ibdiag);
3565: break;
3566: }
3567: } else {
3568: /* no diag so set = */
3569: switch (sizes[i]) {
3570: case 5:
3571: x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3572: x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3573: x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3574: x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3575: x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3576: break;
3577: case 4:
3578: x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3579: x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3580: x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3581: x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3582: break;
3583: case 3:
3584: x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3585: x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3586: x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3587: break;
3588: case 2:
3589: x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3590: x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3591: break;
3592: case 1:
3593: x[row--] = sum1*(*ibdiag);
3594: break;
3595: }
3596: }
3597: }
3598: if (xb == b) {
3599: PetscLogFlops(2.0*a->nz);
3600: } else {
3601: PetscLogFlops(a->nz); /* assumes 1/2 in upper, undercounts diag inverse */
3602: }
3603: }
3604: }
3605: if (flag & SOR_EISENSTAT) {
3606: /*
3607: Apply (U + D)^-1 where D is now the block diagonal
3608: */
3609: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3610: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3611: ibdiag -= sizes[i]*sizes[i];
3612: sz = ii[row+1] - diag[row] - 1;
3613: v1 = a->a + diag[row] + 1;
3614: idx = a->j + diag[row] + 1;
3615: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3616: switch (sizes[i]) {
3617: case 1:
3619: sum1 = b[row];
3620: for (n = 0; n<sz-1; n+=2) {
3621: i1 = idx[0];
3622: i2 = idx[1];
3623: idx += 2;
3624: tmp0 = x[i1];
3625: tmp1 = x[i2];
3626: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3627: }
3629: if (n == sz-1) {
3630: tmp0 = x[*idx];
3631: sum1 -= *v1*tmp0;
3632: }
3633: x[row] = sum1*(*ibdiag);row--;
3634: break;
3636: case 2:
3638: sum1 = b[row];
3639: sum2 = b[row-1];
3640: /* note that sum1 is associated with the second of the two rows */
3641: v2 = a->a + diag[row-1] + 2;
3642: for (n = 0; n<sz-1; n+=2) {
3643: i1 = idx[0];
3644: i2 = idx[1];
3645: idx += 2;
3646: tmp0 = x[i1];
3647: tmp1 = x[i2];
3648: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3649: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3650: }
3652: if (n == sz-1) {
3653: tmp0 = x[*idx];
3654: sum1 -= *v1*tmp0;
3655: sum2 -= *v2*tmp0;
3656: }
3657: x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
3658: x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3659: row -= 2;
3660: break;
3661: case 3:
3663: sum1 = b[row];
3664: sum2 = b[row-1];
3665: sum3 = b[row-2];
3666: v2 = a->a + diag[row-1] + 2;
3667: v3 = a->a + diag[row-2] + 3;
3668: for (n = 0; n<sz-1; n+=2) {
3669: i1 = idx[0];
3670: i2 = idx[1];
3671: idx += 2;
3672: tmp0 = x[i1];
3673: tmp1 = x[i2];
3674: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3675: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3676: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3677: }
3679: if (n == sz-1) {
3680: tmp0 = x[*idx];
3681: sum1 -= *v1*tmp0;
3682: sum2 -= *v2*tmp0;
3683: sum3 -= *v3*tmp0;
3684: }
3685: x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3686: x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3687: x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3688: row -= 3;
3689: break;
3690: case 4:
3692: sum1 = b[row];
3693: sum2 = b[row-1];
3694: sum3 = b[row-2];
3695: sum4 = b[row-3];
3696: v2 = a->a + diag[row-1] + 2;
3697: v3 = a->a + diag[row-2] + 3;
3698: v4 = a->a + diag[row-3] + 4;
3699: for (n = 0; n<sz-1; n+=2) {
3700: i1 = idx[0];
3701: i2 = idx[1];
3702: idx += 2;
3703: tmp0 = x[i1];
3704: tmp1 = x[i2];
3705: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3706: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3707: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3708: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3709: }
3711: if (n == sz-1) {
3712: tmp0 = x[*idx];
3713: sum1 -= *v1*tmp0;
3714: sum2 -= *v2*tmp0;
3715: sum3 -= *v3*tmp0;
3716: sum4 -= *v4*tmp0;
3717: }
3718: x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3719: x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3720: x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3721: x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3722: row -= 4;
3723: break;
3724: case 5:
3726: sum1 = b[row];
3727: sum2 = b[row-1];
3728: sum3 = b[row-2];
3729: sum4 = b[row-3];
3730: sum5 = b[row-4];
3731: v2 = a->a + diag[row-1] + 2;
3732: v3 = a->a + diag[row-2] + 3;
3733: v4 = a->a + diag[row-3] + 4;
3734: v5 = a->a + diag[row-4] + 5;
3735: for (n = 0; n<sz-1; n+=2) {
3736: i1 = idx[0];
3737: i2 = idx[1];
3738: idx += 2;
3739: tmp0 = x[i1];
3740: tmp1 = x[i2];
3741: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3742: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3743: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3744: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3745: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3746: }
3748: if (n == sz-1) {
3749: tmp0 = x[*idx];
3750: sum1 -= *v1*tmp0;
3751: sum2 -= *v2*tmp0;
3752: sum3 -= *v3*tmp0;
3753: sum4 -= *v4*tmp0;
3754: sum5 -= *v5*tmp0;
3755: }
3756: x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3757: x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3758: x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3759: x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3760: x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3761: row -= 5;
3762: break;
3763: default:
3764: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3765: }
3766: }
3767: PetscLogFlops(a->nz);
3769: /*
3770: t = b - D x where D is the block diagonal
3771: */
3772: cnt = 0;
3773: for (i=0, row=0; i<m; i++) {
3774: switch (sizes[i]) {
3775: case 1:
3776: t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3777: break;
3778: case 2:
3779: x1 = x[row]; x2 = x[row+1];
3780: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3781: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3782: t[row] = b[row] - tmp1;
3783: t[row+1] = b[row+1] - tmp2; row += 2;
3784: cnt += 4;
3785: break;
3786: case 3:
3787: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2];
3788: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3789: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3790: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3791: t[row] = b[row] - tmp1;
3792: t[row+1] = b[row+1] - tmp2;
3793: t[row+2] = b[row+2] - tmp3; row += 3;
3794: cnt += 9;
3795: break;
3796: case 4:
3797: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3798: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3799: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3800: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3801: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3802: t[row] = b[row] - tmp1;
3803: t[row+1] = b[row+1] - tmp2;
3804: t[row+2] = b[row+2] - tmp3;
3805: t[row+3] = b[row+3] - tmp4; row += 4;
3806: cnt += 16;
3807: break;
3808: case 5:
3809: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3810: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3811: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3812: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3813: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3814: tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3815: t[row] = b[row] - tmp1;
3816: t[row+1] = b[row+1] - tmp2;
3817: t[row+2] = b[row+2] - tmp3;
3818: t[row+3] = b[row+3] - tmp4;
3819: t[row+4] = b[row+4] - tmp5;row += 5;
3820: cnt += 25;
3821: break;
3822: default:
3823: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3824: }
3825: }
3826: PetscLogFlops(m);
3830: /*
3831: Apply (L + D)^-1 where D is the block diagonal
3832: */
3833: for (i=0, row=0; i<m; i++) {
3834: sz = diag[row] - ii[row];
3835: v1 = a->a + ii[row];
3836: idx = a->j + ii[row];
3837: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3838: switch (sizes[i]) {
3839: case 1:
3841: sum1 = t[row];
3842: for (n = 0; n<sz-1; n+=2) {
3843: i1 = idx[0];
3844: i2 = idx[1];
3845: idx += 2;
3846: tmp0 = t[i1];
3847: tmp1 = t[i2];
3848: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3849: }
3851: if (n == sz-1) {
3852: tmp0 = t[*idx];
3853: sum1 -= *v1 * tmp0;
3854: }
3855: x[row] += t[row] = sum1*(*ibdiag++); row++;
3856: break;
3857: case 2:
3858: v2 = a->a + ii[row+1];
3859: sum1 = t[row];
3860: sum2 = t[row+1];
3861: for (n = 0; n<sz-1; n+=2) {
3862: i1 = idx[0];
3863: i2 = idx[1];
3864: idx += 2;
3865: tmp0 = t[i1];
3866: tmp1 = t[i2];
3867: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3868: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3869: }
3871: if (n == sz-1) {
3872: tmp0 = t[*idx];
3873: sum1 -= v1[0] * tmp0;
3874: sum2 -= v2[0] * tmp0;
3875: }
3876: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3877: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3878: ibdiag += 4; row += 2;
3879: break;
3880: case 3:
3881: v2 = a->a + ii[row+1];
3882: v3 = a->a + ii[row+2];
3883: sum1 = t[row];
3884: sum2 = t[row+1];
3885: sum3 = t[row+2];
3886: for (n = 0; n<sz-1; n+=2) {
3887: i1 = idx[0];
3888: i2 = idx[1];
3889: idx += 2;
3890: tmp0 = t[i1];
3891: tmp1 = t[i2];
3892: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3893: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3894: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3895: }
3897: if (n == sz-1) {
3898: tmp0 = t[*idx];
3899: sum1 -= v1[0] * tmp0;
3900: sum2 -= v2[0] * tmp0;
3901: sum3 -= v3[0] * tmp0;
3902: }
3903: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3904: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3905: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3906: ibdiag += 9; row += 3;
3907: break;
3908: case 4:
3909: v2 = a->a + ii[row+1];
3910: v3 = a->a + ii[row+2];
3911: v4 = a->a + ii[row+3];
3912: sum1 = t[row];
3913: sum2 = t[row+1];
3914: sum3 = t[row+2];
3915: sum4 = t[row+3];
3916: for (n = 0; n<sz-1; n+=2) {
3917: i1 = idx[0];
3918: i2 = idx[1];
3919: idx += 2;
3920: tmp0 = t[i1];
3921: tmp1 = t[i2];
3922: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3923: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3924: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3925: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3926: }
3928: if (n == sz-1) {
3929: tmp0 = t[*idx];
3930: sum1 -= v1[0] * tmp0;
3931: sum2 -= v2[0] * tmp0;
3932: sum3 -= v3[0] * tmp0;
3933: sum4 -= v4[0] * tmp0;
3934: }
3935: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3936: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3937: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3938: x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3939: ibdiag += 16; row += 4;
3940: break;
3941: case 5:
3942: v2 = a->a + ii[row+1];
3943: v3 = a->a + ii[row+2];
3944: v4 = a->a + ii[row+3];
3945: v5 = a->a + ii[row+4];
3946: sum1 = t[row];
3947: sum2 = t[row+1];
3948: sum3 = t[row+2];
3949: sum4 = t[row+3];
3950: sum5 = t[row+4];
3951: for (n = 0; n<sz-1; n+=2) {
3952: i1 = idx[0];
3953: i2 = idx[1];
3954: idx += 2;
3955: tmp0 = t[i1];
3956: tmp1 = t[i2];
3957: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3958: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3959: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3960: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3961: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3962: }
3964: if (n == sz-1) {
3965: tmp0 = t[*idx];
3966: sum1 -= v1[0] * tmp0;
3967: sum2 -= v2[0] * tmp0;
3968: sum3 -= v3[0] * tmp0;
3969: sum4 -= v4[0] * tmp0;
3970: sum5 -= v5[0] * tmp0;
3971: }
3972: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3973: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3974: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3975: x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3976: x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3977: ibdiag += 25; row += 5;
3978: break;
3979: default:
3980: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3981: }
3982: }
3983: PetscLogFlops(a->nz);
3984: }
3985: VecRestoreArray(xx,&x);
3986: VecRestoreArrayRead(bb,&b);
3987: return(0);
3988: }
3992: PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3993: {
3994: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3995: PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3996: const MatScalar *bdiag = a->inode.bdiag;
3997: const PetscScalar *b;
3998: PetscErrorCode ierr;
3999: PetscInt m = a->inode.node_count,cnt = 0,i,row;
4000: const PetscInt *sizes = a->inode.size;
4003: VecGetArray(xx,&x);
4004: VecGetArrayRead(bb,&b);
4005: cnt = 0;
4006: for (i=0, row=0; i<m; i++) {
4007: switch (sizes[i]) {
4008: case 1:
4009: x[row] = b[row]*bdiag[cnt++];row++;
4010: break;
4011: case 2:
4012: x1 = b[row]; x2 = b[row+1];
4013: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
4014: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
4015: x[row++] = tmp1;
4016: x[row++] = tmp2;
4017: cnt += 4;
4018: break;
4019: case 3:
4020: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2];
4021: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
4022: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
4023: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
4024: x[row++] = tmp1;
4025: x[row++] = tmp2;
4026: x[row++] = tmp3;
4027: cnt += 9;
4028: break;
4029: case 4:
4030: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
4031: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
4032: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
4033: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
4034: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
4035: x[row++] = tmp1;
4036: x[row++] = tmp2;
4037: x[row++] = tmp3;
4038: x[row++] = tmp4;
4039: cnt += 16;
4040: break;
4041: case 5:
4042: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
4043: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
4044: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
4045: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
4046: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4047: tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4048: x[row++] = tmp1;
4049: x[row++] = tmp2;
4050: x[row++] = tmp3;
4051: x[row++] = tmp4;
4052: x[row++] = tmp5;
4053: cnt += 25;
4054: break;
4055: default:
4056: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
4057: }
4058: }
4059: PetscLogFlops(2*cnt);
4060: VecRestoreArray(xx,&x);
4061: VecRestoreArrayRead(bb,&b);
4062: return(0);
4063: }
4065: /*
4066: samestructure indicates that the matrix has not changed its nonzero structure so we
4067: do not need to recompute the inodes
4068: */
4071: PetscErrorCode MatSeqAIJCheckInode(Mat A)
4072: {
4073: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4075: PetscInt i,j,m,nzx,nzy,*ns,node_count,blk_size;
4076: PetscBool flag;
4077: const PetscInt *idx,*idy,*ii;
4080: if (!a->inode.use) return(0);
4081: if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) return(0);
4083: m = A->rmap->n;
4084: if (a->inode.size) ns = a->inode.size;
4085: else {
4086: PetscMalloc1(m+1,&ns);
4087: }
4089: i = 0;
4090: node_count = 0;
4091: idx = a->j;
4092: ii = a->i;
4093: while (i < m) { /* For each row */
4094: nzx = ii[i+1] - ii[i]; /* Number of nonzeros */
4095: /* Limits the number of elements in a node to 'a->inode.limit' */
4096: for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4097: nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */
4098: if (nzy != nzx) break;
4099: idy += nzx; /* Same nonzero pattern */
4100: PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);
4101: if (!flag) break;
4102: }
4103: ns[node_count++] = blk_size;
4104: idx += blk_size*nzx;
4105: i = j;
4106: }
4107: /* If not enough inodes found,, do not use inode version of the routines */
4108: if (!m || node_count > .8*m) {
4109: PetscFree(ns);
4111: a->inode.node_count = 0;
4112: a->inode.size = NULL;
4113: a->inode.use = PETSC_FALSE;
4114: A->ops->mult = MatMult_SeqAIJ;
4115: A->ops->sor = MatSOR_SeqAIJ;
4116: A->ops->multadd = MatMultAdd_SeqAIJ;
4117: A->ops->getrowij = MatGetRowIJ_SeqAIJ;
4118: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ;
4119: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ;
4120: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ;
4121: A->ops->coloringpatch = 0;
4122: A->ops->multdiagonalblock = 0;
4124: PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4125: } else {
4126: if (!A->factortype) {
4127: A->ops->mult = MatMult_SeqAIJ_Inode;
4128: A->ops->sor = MatSOR_SeqAIJ_Inode;
4129: A->ops->multadd = MatMultAdd_SeqAIJ_Inode;
4130: A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4131: if (A->rmap->n == A->cmap->n) {
4132: A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4133: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4134: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4135: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4136: A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4137: }
4138: } else {
4139: A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4140: }
4141: a->inode.node_count = node_count;
4142: a->inode.size = ns;
4143: PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4144: }
4145: a->inode.checked = PETSC_TRUE;
4146: a->inode.mat_nonzerostate = A->nonzerostate;
4147: return(0);
4148: }
4152: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4153: {
4154: Mat B =*C;
4155: Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4157: PetscInt m=A->rmap->n;
4160: c->inode.use = a->inode.use;
4161: c->inode.limit = a->inode.limit;
4162: c->inode.max_limit = a->inode.max_limit;
4163: if (a->inode.size) {
4164: PetscMalloc1(m+1,&c->inode.size);
4165: c->inode.node_count = a->inode.node_count;
4166: PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));
4167: /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4168: if (!B->factortype) {
4169: B->ops->mult = MatMult_SeqAIJ_Inode;
4170: B->ops->sor = MatSOR_SeqAIJ_Inode;
4171: B->ops->multadd = MatMultAdd_SeqAIJ_Inode;
4172: B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4173: B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4174: B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4175: B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4176: B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4177: B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4178: } else {
4179: B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4180: }
4181: } else {
4182: c->inode.size = 0;
4183: c->inode.node_count = 0;
4184: }
4185: c->inode.ibdiagvalid = PETSC_FALSE;
4186: c->inode.ibdiag = 0;
4187: c->inode.bdiag = 0;
4188: return(0);
4189: }
4193: PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4194: {
4195: PetscInt k;
4196: const PetscInt *vi;
4199: vi = aj + ai[row];
4200: for (k=0; k<nzl; k++) cols[k] = vi[k];
4201: vi = aj + adiag[row];
4202: cols[nzl] = vi[0];
4203: vi = aj + adiag[row+1]+1;
4204: for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4205: return(0);
4206: }
4207: /*
4208: MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4209: Modified from MatSeqAIJCheckInode().
4211: Input Parameters:
4212: . Mat A - ILU or LU matrix factor
4214: */
4217: PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4218: {
4219: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4221: PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4222: PetscInt *cols1,*cols2,*ns;
4223: const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4224: PetscBool flag;
4227: if (!a->inode.use) return(0);
4228: if (a->inode.checked) return(0);
4230: m = A->rmap->n;
4231: if (a->inode.size) ns = a->inode.size;
4232: else {
4233: PetscMalloc1(m+1,&ns);
4234: }
4236: i = 0;
4237: node_count = 0;
4238: PetscMalloc2(m,&cols1,m,&cols2);
4239: while (i < m) { /* For each row */
4240: nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */
4241: nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4242: nzx = nzl1 + nzu1 + 1;
4243: MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4245: /* Limits the number of elements in a node to 'a->inode.limit' */
4246: for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4247: nzl2 = ai[j+1] - ai[j];
4248: nzu2 = adiag[j] - adiag[j+1] - 1;
4249: nzy = nzl2 + nzu2 + 1;
4250: if (nzy != nzx) break;
4251: MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
4252: PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);
4253: if (!flag) break;
4254: }
4255: ns[node_count++] = blk_size;
4256: i = j;
4257: }
4258: PetscFree2(cols1,cols2);
4259: /* If not enough inodes found,, do not use inode version of the routines */
4260: if (!m || node_count > .8*m) {
4261: PetscFree(ns);
4263: a->inode.node_count = 0;
4264: a->inode.size = NULL;
4265: a->inode.use = PETSC_FALSE;
4267: PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4268: } else {
4269: A->ops->mult = 0;
4270: A->ops->sor = 0;
4271: A->ops->multadd = 0;
4272: A->ops->getrowij = 0;
4273: A->ops->restorerowij = 0;
4274: A->ops->getcolumnij = 0;
4275: A->ops->restorecolumnij = 0;
4276: A->ops->coloringpatch = 0;
4277: A->ops->multdiagonalblock = 0;
4278: a->inode.node_count = node_count;
4279: a->inode.size = ns;
4281: PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4282: }
4283: a->inode.checked = PETSC_TRUE;
4284: return(0);
4285: }
4289: PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4290: {
4291: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4294: a->inode.ibdiagvalid = PETSC_FALSE;
4295: return(0);
4296: }
4298: /*
4299: This is really ugly. if inodes are used this replaces the
4300: permutations with ones that correspond to rows/cols of the matrix
4301: rather then inode blocks
4302: */
4305: PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4306: {
4310: PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));
4311: return(0);
4312: }
4316: PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4317: {
4318: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4320: PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4321: const PetscInt *ridx,*cidx;
4322: PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx;
4323: PetscInt nslim_col,*ns_col;
4324: IS ris = *rperm,cis = *cperm;
4327: if (!a->inode.size) return(0); /* no inodes so return */
4328: if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */
4330: Mat_CreateColInode(A,&nslim_col,&ns_col);
4331: PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);
4332: PetscMalloc2(m,&permr,n,&permc);
4334: ISGetIndices(ris,&ridx);
4335: ISGetIndices(cis,&cidx);
4337: /* Form the inode structure for the rows of permuted matric using inv perm*/
4338: for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4340: /* Construct the permutations for rows*/
4341: for (i=0,row = 0; i<nslim_row; ++i) {
4342: indx = ridx[i];
4343: start_val = tns[indx];
4344: end_val = tns[indx + 1];
4345: for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4346: }
4348: /* Form the inode structure for the columns of permuted matrix using inv perm*/
4349: for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4351: /* Construct permutations for columns */
4352: for (i=0,col=0; i<nslim_col; ++i) {
4353: indx = cidx[i];
4354: start_val = tns[indx];
4355: end_val = tns[indx + 1];
4356: for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4357: }
4359: ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);
4360: ISSetPermutation(*rperm);
4361: ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);
4362: ISSetPermutation(*cperm);
4364: ISRestoreIndices(ris,&ridx);
4365: ISRestoreIndices(cis,&cidx);
4367: PetscFree(ns_col);
4368: PetscFree2(permr,permc);
4369: ISDestroy(&cis);
4370: ISDestroy(&ris);
4371: PetscFree(tns);
4372: return(0);
4373: }
4377: /*@C
4378: MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4380: Not Collective
4382: Input Parameter:
4383: . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4385: Output Parameter:
4386: + node_count - no of inodes present in the matrix.
4387: . sizes - an array of size node_count,with sizes of each inode.
4388: - limit - the max size used to generate the inodes.
4390: Level: advanced
4392: Notes: This routine returns some internal storage information
4393: of the matrix, it is intended to be used by advanced users.
4394: It should be called after the matrix is assembled.
4395: The contents of the sizes[] array should not be changed.
4396: NULL may be passed for information not requested.
4398: .keywords: matrix, seqaij, get, inode
4400: .seealso: MatGetInfo()
4401: @*/
4402: PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4403: {
4404: PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4407: if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4408: PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);
4409: if (f) {
4410: (*f)(A,node_count,sizes,limit);
4411: }
4412: return(0);
4413: }
4417: PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4418: {
4419: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4422: if (node_count) *node_count = a->inode.node_count;
4423: if (sizes) *sizes = a->inode.size;
4424: if (limit) *limit = a->inode.limit;
4425: return(0);
4426: }