Actual source code: inode.c
petsc-3.5.4 2015-05-23
2: /*
3: This file provides high performance routines for the Inode format (compressed sparse row)
4: by taking advantage of rows with identical nonzero structure (I-nodes).
5: */
6: #include <../src/mat/impls/aij/seq/aij.h>
10: static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns)
11: {
12: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
14: PetscInt i,count,m,n,min_mn,*ns_row,*ns_col;
17: n = A->cmap->n;
18: m = A->rmap->n;
19: ns_row = a->inode.size;
21: min_mn = (m < n) ? m : n;
22: if (!ns) {
23: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
24: for (; count+1 < n; count++,i++) ;
25: if (count < n) {
26: i++;
27: }
28: *size = i;
29: return(0);
30: }
31: PetscMalloc1((n+1),&ns_col);
33: /* Use the same row structure wherever feasible. */
34: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
35: ns_col[i] = ns_row[i];
36: }
38: /* if m < n; pad up the remainder with inode_limit */
39: for (; count+1 < n; count++,i++) {
40: ns_col[i] = 1;
41: }
42: /* The last node is the odd ball. padd it up with the remaining rows; */
43: if (count < n) {
44: ns_col[i] = n - count;
45: i++;
46: } else if (count > n) {
47: /* Adjust for the over estimation */
48: ns_col[i-1] += n - count;
49: }
50: *size = i;
51: *ns = ns_col;
52: return(0);
53: }
56: /*
57: This builds symmetric version of nonzero structure,
58: */
61: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
62: {
63: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
65: PetscInt *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
66: PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
67: const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;
70: nslim_row = a->inode.node_count;
71: m = A->rmap->n;
72: n = A->cmap->n;
73: if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
75: /* Use the row_inode as column_inode */
76: nslim_col = nslim_row;
77: ns_col = ns_row;
79: /* allocate space for reformated inode structure */
80: PetscMalloc1((nslim_col+1),&tns);
81: PetscMalloc1((n+1),&tvc);
82: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
84: for (i1=0,col=0; i1<nslim_col; ++i1) {
85: nsz = ns_col[i1];
86: for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
87: }
88: /* allocate space for row pointers */
89: PetscMalloc1((nslim_row+1),&ia);
90: *iia = ia;
91: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
92: PetscMalloc1((nslim_row+1),&work);
94: /* determine the number of columns in each row */
95: ia[0] = oshift;
96: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
98: j = aj + ai[row] + ishift;
99: jmax = aj + ai[row+1] + ishift;
100: col = *j++ + ishift;
101: i2 = tvc[col];
102: while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
103: ia[i1+1]++;
104: ia[i2+1]++;
105: i2++; /* Start col of next node */
106: while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
107: i2 = tvc[col];
108: }
109: if (i2 == i1) ia[i2+1]++; /* now the diagonal element */
110: }
112: /* shift ia[i] to point to next row */
113: for (i1=1; i1<nslim_row+1; i1++) {
114: row = ia[i1-1];
115: ia[i1] += row;
116: work[i1-1] = row - oshift;
117: }
119: /* allocate space for column pointers */
120: nz = ia[nslim_row] + (!ishift);
121: PetscMalloc1(nz,&ja);
122: *jja = ja;
124: /* loop over lower triangular part putting into ja */
125: for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
126: j = aj + ai[row] + ishift;
127: jmax = aj + ai[row+1] + ishift;
128: col = *j++ + ishift;
129: i2 = tvc[col];
130: while (i2<i1 && j<jmax) {
131: ja[work[i2]++] = i1 + oshift;
132: ja[work[i1]++] = i2 + oshift;
133: ++i2;
134: while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
135: i2 = tvc[col];
136: }
137: if (i2 == i1) ja[work[i1]++] = i2 + oshift;
139: }
140: PetscFree(work);
141: PetscFree(tns);
142: PetscFree(tvc);
143: return(0);
144: }
146: /*
147: This builds nonsymmetric version of nonzero structure,
148: */
151: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
152: {
153: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
155: PetscInt *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
156: PetscInt *tns,*tvc,nsz,i1,i2;
157: const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;
160: nslim_row = a->inode.node_count;
161: n = A->cmap->n;
163: /* Create The column_inode for this matrix */
164: Mat_CreateColInode(A,&nslim_col,&ns_col);
166: /* allocate space for reformated column_inode structure */
167: PetscMalloc1((nslim_col +1),&tns);
168: PetscMalloc1((n +1),&tvc);
169: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
171: for (i1=0,col=0; i1<nslim_col; ++i1) {
172: nsz = ns_col[i1];
173: for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
174: }
175: /* allocate space for row pointers */
176: PetscMalloc1((nslim_row+1),&ia);
177: *iia = ia;
178: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
179: PetscMalloc1((nslim_row+1),&work);
181: /* determine the number of columns in each row */
182: ia[0] = oshift;
183: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
184: j = aj + ai[row] + ishift;
185: col = *j++ + ishift;
186: i2 = tvc[col];
187: nz = ai[row+1] - ai[row];
188: while (nz-- > 0) { /* off-diagonal elemets */
189: ia[i1+1]++;
190: i2++; /* Start col of next node */
191: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
192: if (nz > 0) i2 = tvc[col];
193: }
194: }
196: /* shift ia[i] to point to next row */
197: for (i1=1; i1<nslim_row+1; i1++) {
198: row = ia[i1-1];
199: ia[i1] += row;
200: work[i1-1] = row - oshift;
201: }
203: /* allocate space for column pointers */
204: nz = ia[nslim_row] + (!ishift);
205: PetscMalloc1(nz,&ja);
206: *jja = ja;
208: /* loop over matrix putting into ja */
209: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
210: j = aj + ai[row] + ishift;
211: col = *j++ + ishift;
212: i2 = tvc[col];
213: nz = ai[row+1] - ai[row];
214: while (nz-- > 0) {
215: ja[work[i1]++] = i2 + oshift;
216: ++i2;
217: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
218: if (nz > 0) i2 = tvc[col];
219: }
220: }
221: PetscFree(ns_col);
222: PetscFree(work);
223: PetscFree(tns);
224: PetscFree(tvc);
225: return(0);
226: }
230: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
231: {
232: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
236: *n = a->inode.node_count;
237: if (!ia) return(0);
238: if (!blockcompressed) {
239: MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
240: } else if (symmetric) {
241: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
242: } else {
243: MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
244: }
245: return(0);
246: }
250: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
251: {
255: if (!ia) return(0);
257: if (!blockcompressed) {
258: MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
259: } else {
260: PetscFree(*ia);
261: PetscFree(*ja);
262: }
263: return(0);
264: }
266: /* ----------------------------------------------------------- */
270: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
271: {
272: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
274: PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
275: PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
278: nslim_row = a->inode.node_count;
279: n = A->cmap->n;
281: /* Create The column_inode for this matrix */
282: Mat_CreateColInode(A,&nslim_col,&ns_col);
284: /* allocate space for reformated column_inode structure */
285: PetscMalloc1((nslim_col + 1),&tns);
286: PetscMalloc1((n + 1),&tvc);
287: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
289: for (i1=0,col=0; i1<nslim_col; ++i1) {
290: nsz = ns_col[i1];
291: for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
292: }
293: /* allocate space for column pointers */
294: PetscMalloc1((nslim_col+1),&ia);
295: *iia = ia;
296: PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));
297: PetscMalloc1((nslim_col+1),&work);
299: /* determine the number of columns in each row */
300: ia[0] = oshift;
301: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
302: j = aj + ai[row] + ishift;
303: col = *j++ + ishift;
304: i2 = tvc[col];
305: nz = ai[row+1] - ai[row];
306: while (nz-- > 0) { /* off-diagonal elemets */
307: /* ia[i1+1]++; */
308: ia[i2+1]++;
309: i2++;
310: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
311: if (nz > 0) i2 = tvc[col];
312: }
313: }
315: /* shift ia[i] to point to next col */
316: for (i1=1; i1<nslim_col+1; i1++) {
317: col = ia[i1-1];
318: ia[i1] += col;
319: work[i1-1] = col - oshift;
320: }
322: /* allocate space for column pointers */
323: nz = ia[nslim_col] + (!ishift);
324: PetscMalloc1(nz,&ja);
325: *jja = ja;
327: /* loop over matrix putting into ja */
328: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
329: j = aj + ai[row] + ishift;
330: col = *j++ + ishift;
331: i2 = tvc[col];
332: nz = ai[row+1] - ai[row];
333: while (nz-- > 0) {
334: /* ja[work[i1]++] = i2 + oshift; */
335: ja[work[i2]++] = i1 + oshift;
336: i2++;
337: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
338: if (nz > 0) i2 = tvc[col];
339: }
340: }
341: PetscFree(ns_col);
342: PetscFree(work);
343: PetscFree(tns);
344: PetscFree(tvc);
345: return(0);
346: }
350: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
351: {
355: Mat_CreateColInode(A,n,NULL);
356: if (!ia) return(0);
358: if (!blockcompressed) {
359: MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
360: } else if (symmetric) {
361: /* Since the indices are symmetric it does'nt matter */
362: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
363: } else {
364: MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
365: }
366: return(0);
367: }
371: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
372: {
376: if (!ia) return(0);
377: if (!blockcompressed) {
378: MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
379: } else {
380: PetscFree(*ia);
381: PetscFree(*ja);
382: }
383: return(0);
384: }
386: /* ----------------------------------------------------------- */
390: static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
391: {
392: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
393: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
394: PetscScalar *y;
395: const PetscScalar *x;
396: const MatScalar *v1,*v2,*v3,*v4,*v5;
397: PetscErrorCode ierr;
398: PetscInt i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
399: const PetscInt *idx,*ns,*ii;
401: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
402: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
403: #endif
406: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
407: node_max = a->inode.node_count;
408: ns = a->inode.size; /* Node Size array */
409: VecGetArrayRead(xx,&x);
410: VecGetArray(yy,&y);
411: idx = a->j;
412: v1 = a->a;
413: ii = a->i;
415: for (i = 0,row = 0; i< node_max; ++i) {
416: nsz = ns[i];
417: n = ii[1] - ii[0];
418: nonzerorow += (n>0)*nsz;
419: ii += nsz;
420: PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */
421: PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */
422: sz = n; /* No of non zeros in this row */
423: /* Switch on the size of Node */
424: switch (nsz) { /* Each loop in 'case' is unrolled */
425: case 1:
426: sum1 = 0.;
428: for (n = 0; n< sz-1; n+=2) {
429: i1 = idx[0]; /* The instructions are ordered to */
430: i2 = idx[1]; /* make the compiler's job easy */
431: idx += 2;
432: tmp0 = x[i1];
433: tmp1 = x[i2];
434: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
435: }
437: if (n == sz-1) { /* Take care of the last nonzero */
438: tmp0 = x[*idx++];
439: sum1 += *v1++ *tmp0;
440: }
441: y[row++]=sum1;
442: break;
443: case 2:
444: sum1 = 0.;
445: sum2 = 0.;
446: v2 = v1 + n;
448: for (n = 0; n< sz-1; n+=2) {
449: i1 = idx[0];
450: i2 = idx[1];
451: idx += 2;
452: tmp0 = x[i1];
453: tmp1 = x[i2];
454: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
455: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
456: }
457: if (n == sz-1) {
458: tmp0 = x[*idx++];
459: sum1 += *v1++ * tmp0;
460: sum2 += *v2++ * tmp0;
461: }
462: y[row++]=sum1;
463: y[row++]=sum2;
464: v1 =v2; /* Since the next block to be processed starts there*/
465: idx +=sz;
466: break;
467: case 3:
468: sum1 = 0.;
469: sum2 = 0.;
470: sum3 = 0.;
471: v2 = v1 + n;
472: v3 = v2 + n;
474: for (n = 0; n< sz-1; n+=2) {
475: i1 = idx[0];
476: i2 = idx[1];
477: idx += 2;
478: tmp0 = x[i1];
479: tmp1 = x[i2];
480: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
481: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
482: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
483: }
484: if (n == sz-1) {
485: tmp0 = x[*idx++];
486: sum1 += *v1++ * tmp0;
487: sum2 += *v2++ * tmp0;
488: sum3 += *v3++ * tmp0;
489: }
490: y[row++]=sum1;
491: y[row++]=sum2;
492: y[row++]=sum3;
493: v1 =v3; /* Since the next block to be processed starts there*/
494: idx +=2*sz;
495: break;
496: case 4:
497: sum1 = 0.;
498: sum2 = 0.;
499: sum3 = 0.;
500: sum4 = 0.;
501: v2 = v1 + n;
502: v3 = v2 + n;
503: v4 = v3 + n;
505: for (n = 0; n< sz-1; n+=2) {
506: i1 = idx[0];
507: i2 = idx[1];
508: idx += 2;
509: tmp0 = x[i1];
510: tmp1 = x[i2];
511: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
512: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
513: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
514: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
515: }
516: if (n == sz-1) {
517: tmp0 = x[*idx++];
518: sum1 += *v1++ * tmp0;
519: sum2 += *v2++ * tmp0;
520: sum3 += *v3++ * tmp0;
521: sum4 += *v4++ * tmp0;
522: }
523: y[row++]=sum1;
524: y[row++]=sum2;
525: y[row++]=sum3;
526: y[row++]=sum4;
527: v1 =v4; /* Since the next block to be processed starts there*/
528: idx +=3*sz;
529: break;
530: case 5:
531: sum1 = 0.;
532: sum2 = 0.;
533: sum3 = 0.;
534: sum4 = 0.;
535: sum5 = 0.;
536: v2 = v1 + n;
537: v3 = v2 + n;
538: v4 = v3 + n;
539: v5 = v4 + n;
541: for (n = 0; n<sz-1; n+=2) {
542: i1 = idx[0];
543: i2 = idx[1];
544: idx += 2;
545: tmp0 = x[i1];
546: tmp1 = x[i2];
547: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
548: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
549: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
550: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
551: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
552: }
553: if (n == sz-1) {
554: tmp0 = x[*idx++];
555: sum1 += *v1++ * tmp0;
556: sum2 += *v2++ * tmp0;
557: sum3 += *v3++ * tmp0;
558: sum4 += *v4++ * tmp0;
559: sum5 += *v5++ * tmp0;
560: }
561: y[row++]=sum1;
562: y[row++]=sum2;
563: y[row++]=sum3;
564: y[row++]=sum4;
565: y[row++]=sum5;
566: v1 =v5; /* Since the next block to be processed starts there */
567: idx +=4*sz;
568: break;
569: default:
570: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
571: }
572: }
573: VecRestoreArrayRead(xx,&x);
574: VecRestoreArray(yy,&y);
575: PetscLogFlops(2.0*a->nz - nonzerorow);
576: return(0);
577: }
578: /* ----------------------------------------------------------- */
579: /* Almost same code as the MatMult_SeqAIJ_Inode() */
582: static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
583: {
584: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
585: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
586: const MatScalar *v1,*v2,*v3,*v4,*v5;
587: const PetscScalar *x;
588: PetscScalar *y,*z,*zt;
589: PetscErrorCode ierr;
590: PetscInt i1,i2,n,i,row,node_max,nsz,sz;
591: const PetscInt *idx,*ns,*ii;
594: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
595: node_max = a->inode.node_count;
596: ns = a->inode.size; /* Node Size array */
598: VecGetArrayRead(xx,&x);
599: VecGetArray(yy,&y);
600: if (zz != yy) {
601: VecGetArray(zz,&z);
602: } else {
603: z = y;
604: }
605: zt = z;
607: idx = a->j;
608: v1 = a->a;
609: ii = a->i;
611: for (i = 0,row = 0; i< node_max; ++i) {
612: nsz = ns[i];
613: n = ii[1] - ii[0];
614: ii += nsz;
615: sz = n; /* No of non zeros in this row */
616: /* Switch on the size of Node */
617: switch (nsz) { /* Each loop in 'case' is unrolled */
618: case 1:
619: sum1 = *zt++;
621: for (n = 0; n< sz-1; n+=2) {
622: i1 = idx[0]; /* The instructions are ordered to */
623: i2 = idx[1]; /* make the compiler's job easy */
624: idx += 2;
625: tmp0 = x[i1];
626: tmp1 = x[i2];
627: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
628: }
630: if (n == sz-1) { /* Take care of the last nonzero */
631: tmp0 = x[*idx++];
632: sum1 += *v1++ * tmp0;
633: }
634: y[row++]=sum1;
635: break;
636: case 2:
637: sum1 = *zt++;
638: sum2 = *zt++;
639: v2 = v1 + n;
641: for (n = 0; n< sz-1; n+=2) {
642: i1 = idx[0];
643: i2 = idx[1];
644: idx += 2;
645: tmp0 = x[i1];
646: tmp1 = x[i2];
647: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
648: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
649: }
650: if (n == sz-1) {
651: tmp0 = x[*idx++];
652: sum1 += *v1++ * tmp0;
653: sum2 += *v2++ * tmp0;
654: }
655: y[row++]=sum1;
656: y[row++]=sum2;
657: v1 =v2; /* Since the next block to be processed starts there*/
658: idx +=sz;
659: break;
660: case 3:
661: sum1 = *zt++;
662: sum2 = *zt++;
663: sum3 = *zt++;
664: v2 = v1 + n;
665: v3 = v2 + n;
667: for (n = 0; n< sz-1; n+=2) {
668: i1 = idx[0];
669: i2 = idx[1];
670: idx += 2;
671: tmp0 = x[i1];
672: tmp1 = x[i2];
673: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
674: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
675: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
676: }
677: if (n == sz-1) {
678: tmp0 = x[*idx++];
679: sum1 += *v1++ * tmp0;
680: sum2 += *v2++ * tmp0;
681: sum3 += *v3++ * tmp0;
682: }
683: y[row++]=sum1;
684: y[row++]=sum2;
685: y[row++]=sum3;
686: v1 =v3; /* Since the next block to be processed starts there*/
687: idx +=2*sz;
688: break;
689: case 4:
690: sum1 = *zt++;
691: sum2 = *zt++;
692: sum3 = *zt++;
693: sum4 = *zt++;
694: v2 = v1 + n;
695: v3 = v2 + n;
696: v4 = v3 + n;
698: for (n = 0; n< sz-1; n+=2) {
699: i1 = idx[0];
700: i2 = idx[1];
701: idx += 2;
702: tmp0 = x[i1];
703: tmp1 = x[i2];
704: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
705: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
706: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
707: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
708: }
709: if (n == sz-1) {
710: tmp0 = x[*idx++];
711: sum1 += *v1++ * tmp0;
712: sum2 += *v2++ * tmp0;
713: sum3 += *v3++ * tmp0;
714: sum4 += *v4++ * tmp0;
715: }
716: y[row++]=sum1;
717: y[row++]=sum2;
718: y[row++]=sum3;
719: y[row++]=sum4;
720: v1 =v4; /* Since the next block to be processed starts there*/
721: idx +=3*sz;
722: break;
723: case 5:
724: sum1 = *zt++;
725: sum2 = *zt++;
726: sum3 = *zt++;
727: sum4 = *zt++;
728: sum5 = *zt++;
729: v2 = v1 + n;
730: v3 = v2 + n;
731: v4 = v3 + n;
732: v5 = v4 + n;
734: for (n = 0; n<sz-1; n+=2) {
735: i1 = idx[0];
736: i2 = idx[1];
737: idx += 2;
738: tmp0 = x[i1];
739: tmp1 = x[i2];
740: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
741: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
742: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
743: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
744: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
745: }
746: if (n == sz-1) {
747: tmp0 = x[*idx++];
748: sum1 += *v1++ * tmp0;
749: sum2 += *v2++ * tmp0;
750: sum3 += *v3++ * tmp0;
751: sum4 += *v4++ * tmp0;
752: sum5 += *v5++ * tmp0;
753: }
754: y[row++]=sum1;
755: y[row++]=sum2;
756: y[row++]=sum3;
757: y[row++]=sum4;
758: y[row++]=sum5;
759: v1 =v5; /* Since the next block to be processed starts there */
760: idx +=4*sz;
761: break;
762: default:
763: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
764: }
765: }
766: VecRestoreArrayRead(xx,&x);
767: VecRestoreArray(yy,&y);
768: if (zz != yy) {
769: VecRestoreArray(zz,&z);
770: }
771: PetscLogFlops(2.0*a->nz);
772: return(0);
773: }
775: /* ----------------------------------------------------------- */
778: PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
779: {
780: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
781: IS iscol = a->col,isrow = a->row;
782: PetscErrorCode ierr;
783: const PetscInt *r,*c,*rout,*cout;
784: PetscInt i,j,n = A->rmap->n,nz;
785: PetscInt node_max,*ns,row,nsz,aii,i0,i1;
786: const PetscInt *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
787: PetscScalar *x,*tmp,*tmps,tmp0,tmp1;
788: PetscScalar sum1,sum2,sum3,sum4,sum5;
789: const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
790: const PetscScalar *b;
793: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
794: node_max = a->inode.node_count;
795: ns = a->inode.size; /* Node Size array */
797: VecGetArrayRead(bb,&b);
798: VecGetArray(xx,&x);
799: tmp = a->solve_work;
801: ISGetIndices(isrow,&rout); r = rout;
802: ISGetIndices(iscol,&cout); c = cout + (n-1);
804: /* forward solve the lower triangular */
805: tmps = tmp;
806: aa = a_a;
807: aj = a_j;
808: ad = a->diag;
810: for (i = 0,row = 0; i< node_max; ++i) {
811: nsz = ns[i];
812: aii = ai[row];
813: v1 = aa + aii;
814: vi = aj + aii;
815: nz = ad[row]- aii;
816: if (i < node_max-1) {
817: /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
818: * but our indexing to determine it's size could. */
819: PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
820: /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
821: PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
822: /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
823: }
825: switch (nsz) { /* Each loop in 'case' is unrolled */
826: case 1:
827: sum1 = b[*r++];
828: for (j=0; j<nz-1; j+=2) {
829: i0 = vi[0];
830: i1 = vi[1];
831: vi +=2;
832: tmp0 = tmps[i0];
833: tmp1 = tmps[i1];
834: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
835: }
836: if (j == nz-1) {
837: tmp0 = tmps[*vi++];
838: sum1 -= *v1++ *tmp0;
839: }
840: tmp[row++]=sum1;
841: break;
842: case 2:
843: sum1 = b[*r++];
844: sum2 = b[*r++];
845: v2 = aa + ai[row+1];
847: for (j=0; j<nz-1; j+=2) {
848: i0 = vi[0];
849: i1 = vi[1];
850: vi +=2;
851: tmp0 = tmps[i0];
852: tmp1 = tmps[i1];
853: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
854: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
855: }
856: if (j == nz-1) {
857: tmp0 = tmps[*vi++];
858: sum1 -= *v1++ *tmp0;
859: sum2 -= *v2++ *tmp0;
860: }
861: sum2 -= *v2++ *sum1;
862: tmp[row++]=sum1;
863: tmp[row++]=sum2;
864: break;
865: case 3:
866: sum1 = b[*r++];
867: sum2 = b[*r++];
868: sum3 = b[*r++];
869: v2 = aa + ai[row+1];
870: v3 = aa + ai[row+2];
872: for (j=0; j<nz-1; j+=2) {
873: i0 = vi[0];
874: i1 = vi[1];
875: vi +=2;
876: tmp0 = tmps[i0];
877: tmp1 = tmps[i1];
878: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
879: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
880: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
881: }
882: if (j == nz-1) {
883: tmp0 = tmps[*vi++];
884: sum1 -= *v1++ *tmp0;
885: sum2 -= *v2++ *tmp0;
886: sum3 -= *v3++ *tmp0;
887: }
888: sum2 -= *v2++ * sum1;
889: sum3 -= *v3++ * sum1;
890: sum3 -= *v3++ * sum2;
892: tmp[row++]=sum1;
893: tmp[row++]=sum2;
894: tmp[row++]=sum3;
895: break;
897: case 4:
898: sum1 = b[*r++];
899: sum2 = b[*r++];
900: sum3 = b[*r++];
901: sum4 = b[*r++];
902: v2 = aa + ai[row+1];
903: v3 = aa + ai[row+2];
904: v4 = aa + ai[row+3];
906: for (j=0; j<nz-1; j+=2) {
907: i0 = vi[0];
908: i1 = vi[1];
909: vi +=2;
910: tmp0 = tmps[i0];
911: tmp1 = tmps[i1];
912: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
913: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
914: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
915: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
916: }
917: if (j == nz-1) {
918: tmp0 = tmps[*vi++];
919: sum1 -= *v1++ *tmp0;
920: sum2 -= *v2++ *tmp0;
921: sum3 -= *v3++ *tmp0;
922: sum4 -= *v4++ *tmp0;
923: }
924: sum2 -= *v2++ * sum1;
925: sum3 -= *v3++ * sum1;
926: sum4 -= *v4++ * sum1;
927: sum3 -= *v3++ * sum2;
928: sum4 -= *v4++ * sum2;
929: sum4 -= *v4++ * sum3;
931: tmp[row++]=sum1;
932: tmp[row++]=sum2;
933: tmp[row++]=sum3;
934: tmp[row++]=sum4;
935: break;
936: case 5:
937: sum1 = b[*r++];
938: sum2 = b[*r++];
939: sum3 = b[*r++];
940: sum4 = b[*r++];
941: sum5 = b[*r++];
942: v2 = aa + ai[row+1];
943: v3 = aa + ai[row+2];
944: v4 = aa + ai[row+3];
945: v5 = aa + ai[row+4];
947: for (j=0; j<nz-1; j+=2) {
948: i0 = vi[0];
949: i1 = vi[1];
950: vi +=2;
951: tmp0 = tmps[i0];
952: tmp1 = tmps[i1];
953: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
954: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
955: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
956: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
957: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
958: }
959: if (j == nz-1) {
960: tmp0 = tmps[*vi++];
961: sum1 -= *v1++ *tmp0;
962: sum2 -= *v2++ *tmp0;
963: sum3 -= *v3++ *tmp0;
964: sum4 -= *v4++ *tmp0;
965: sum5 -= *v5++ *tmp0;
966: }
968: sum2 -= *v2++ * sum1;
969: sum3 -= *v3++ * sum1;
970: sum4 -= *v4++ * sum1;
971: sum5 -= *v5++ * sum1;
972: sum3 -= *v3++ * sum2;
973: sum4 -= *v4++ * sum2;
974: sum5 -= *v5++ * sum2;
975: sum4 -= *v4++ * sum3;
976: sum5 -= *v5++ * sum3;
977: sum5 -= *v5++ * sum4;
979: tmp[row++]=sum1;
980: tmp[row++]=sum2;
981: tmp[row++]=sum3;
982: tmp[row++]=sum4;
983: tmp[row++]=sum5;
984: break;
985: default:
986: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
987: }
988: }
989: /* backward solve the upper triangular */
990: for (i=node_max -1,row = n-1; i>=0; i--) {
991: nsz = ns[i];
992: aii = ai[row+1] -1;
993: v1 = aa + aii;
994: vi = aj + aii;
995: nz = aii- ad[row];
996: switch (nsz) { /* Each loop in 'case' is unrolled */
997: case 1:
998: sum1 = tmp[row];
1000: for (j=nz; j>1; j-=2) {
1001: vi -=2;
1002: i0 = vi[2];
1003: i1 = vi[1];
1004: tmp0 = tmps[i0];
1005: tmp1 = tmps[i1];
1006: v1 -= 2;
1007: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1008: }
1009: if (j==1) {
1010: tmp0 = tmps[*vi--];
1011: sum1 -= *v1-- * tmp0;
1012: }
1013: x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1014: break;
1015: case 2:
1016: sum1 = tmp[row];
1017: sum2 = tmp[row -1];
1018: v2 = aa + ai[row]-1;
1019: for (j=nz; j>1; j-=2) {
1020: vi -=2;
1021: i0 = vi[2];
1022: i1 = vi[1];
1023: tmp0 = tmps[i0];
1024: tmp1 = tmps[i1];
1025: v1 -= 2;
1026: v2 -= 2;
1027: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1028: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1029: }
1030: if (j==1) {
1031: tmp0 = tmps[*vi--];
1032: sum1 -= *v1-- * tmp0;
1033: sum2 -= *v2-- * tmp0;
1034: }
1036: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1037: sum2 -= *v2-- * tmp0;
1038: x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1039: break;
1040: case 3:
1041: sum1 = tmp[row];
1042: sum2 = tmp[row -1];
1043: sum3 = tmp[row -2];
1044: v2 = aa + ai[row]-1;
1045: v3 = aa + ai[row -1]-1;
1046: for (j=nz; j>1; j-=2) {
1047: vi -=2;
1048: i0 = vi[2];
1049: i1 = vi[1];
1050: tmp0 = tmps[i0];
1051: tmp1 = tmps[i1];
1052: v1 -= 2;
1053: v2 -= 2;
1054: v3 -= 2;
1055: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1056: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1057: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1058: }
1059: if (j==1) {
1060: tmp0 = tmps[*vi--];
1061: sum1 -= *v1-- * tmp0;
1062: sum2 -= *v2-- * tmp0;
1063: sum3 -= *v3-- * tmp0;
1064: }
1065: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1066: sum2 -= *v2-- * tmp0;
1067: sum3 -= *v3-- * tmp0;
1068: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1069: sum3 -= *v3-- * tmp0;
1070: x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1072: break;
1073: case 4:
1074: sum1 = tmp[row];
1075: sum2 = tmp[row -1];
1076: sum3 = tmp[row -2];
1077: sum4 = tmp[row -3];
1078: v2 = aa + ai[row]-1;
1079: v3 = aa + ai[row -1]-1;
1080: v4 = aa + ai[row -2]-1;
1082: for (j=nz; j>1; j-=2) {
1083: vi -=2;
1084: i0 = vi[2];
1085: i1 = vi[1];
1086: tmp0 = tmps[i0];
1087: tmp1 = tmps[i1];
1088: v1 -= 2;
1089: v2 -= 2;
1090: v3 -= 2;
1091: v4 -= 2;
1092: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1093: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1094: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1095: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1096: }
1097: if (j==1) {
1098: tmp0 = tmps[*vi--];
1099: sum1 -= *v1-- * tmp0;
1100: sum2 -= *v2-- * tmp0;
1101: sum3 -= *v3-- * tmp0;
1102: sum4 -= *v4-- * tmp0;
1103: }
1105: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1106: sum2 -= *v2-- * tmp0;
1107: sum3 -= *v3-- * tmp0;
1108: sum4 -= *v4-- * tmp0;
1109: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1110: sum3 -= *v3-- * tmp0;
1111: sum4 -= *v4-- * tmp0;
1112: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1113: sum4 -= *v4-- * tmp0;
1114: x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1115: break;
1116: case 5:
1117: sum1 = tmp[row];
1118: sum2 = tmp[row -1];
1119: sum3 = tmp[row -2];
1120: sum4 = tmp[row -3];
1121: sum5 = tmp[row -4];
1122: v2 = aa + ai[row]-1;
1123: v3 = aa + ai[row -1]-1;
1124: v4 = aa + ai[row -2]-1;
1125: v5 = aa + ai[row -3]-1;
1126: for (j=nz; j>1; j-=2) {
1127: vi -= 2;
1128: i0 = vi[2];
1129: i1 = vi[1];
1130: tmp0 = tmps[i0];
1131: tmp1 = tmps[i1];
1132: v1 -= 2;
1133: v2 -= 2;
1134: v3 -= 2;
1135: v4 -= 2;
1136: v5 -= 2;
1137: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1138: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1139: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1140: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1141: sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1142: }
1143: if (j==1) {
1144: tmp0 = tmps[*vi--];
1145: sum1 -= *v1-- * tmp0;
1146: sum2 -= *v2-- * tmp0;
1147: sum3 -= *v3-- * tmp0;
1148: sum4 -= *v4-- * tmp0;
1149: sum5 -= *v5-- * tmp0;
1150: }
1152: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1153: sum2 -= *v2-- * tmp0;
1154: sum3 -= *v3-- * tmp0;
1155: sum4 -= *v4-- * tmp0;
1156: sum5 -= *v5-- * tmp0;
1157: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1158: sum3 -= *v3-- * tmp0;
1159: sum4 -= *v4-- * tmp0;
1160: sum5 -= *v5-- * tmp0;
1161: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1162: sum4 -= *v4-- * tmp0;
1163: sum5 -= *v5-- * tmp0;
1164: tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1165: sum5 -= *v5-- * tmp0;
1166: x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1167: break;
1168: default:
1169: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1170: }
1171: }
1172: ISRestoreIndices(isrow,&rout);
1173: ISRestoreIndices(iscol,&cout);
1174: VecRestoreArrayRead(bb,&b);
1175: VecRestoreArray(xx,&x);
1176: PetscLogFlops(2.0*a->nz - A->cmap->n);
1177: return(0);
1178: }
1182: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1183: {
1184: Mat C =B;
1185: Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1186: IS isrow = b->row,isicol = b->icol;
1187: PetscErrorCode ierr;
1188: const PetscInt *r,*ic,*ics;
1189: const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1190: PetscInt i,j,k,nz,nzL,row,*pj;
1191: const PetscInt *ajtmp,*bjtmp;
1192: MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1193: const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1194: FactorShiftCtx sctx;
1195: const PetscInt *ddiag;
1196: PetscReal rs;
1197: MatScalar d;
1198: PetscInt inod,nodesz,node_max,col;
1199: const PetscInt *ns;
1200: PetscInt *tmp_vec1,*tmp_vec2,*nsmap;
1203: /* MatPivotSetUp(): initialize shift context sctx */
1204: PetscMemzero(&sctx,sizeof(FactorShiftCtx));
1206: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1207: ddiag = a->diag;
1208: sctx.shift_top = info->zeropivot;
1209: for (i=0; i<n; i++) {
1210: /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1211: d = (aa)[ddiag[i]];
1212: rs = -PetscAbsScalar(d) - PetscRealPart(d);
1213: v = aa+ai[i];
1214: nz = ai[i+1] - ai[i];
1215: for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1216: if (rs>sctx.shift_top) sctx.shift_top = rs;
1217: }
1218: sctx.shift_top *= 1.1;
1219: sctx.nshift_max = 5;
1220: sctx.shift_lo = 0.;
1221: sctx.shift_hi = 1.;
1222: }
1224: ISGetIndices(isrow,&r);
1225: ISGetIndices(isicol,&ic);
1227: PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);
1228: ics = ic;
1230: node_max = a->inode.node_count;
1231: ns = a->inode.size;
1232: if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1234: /* If max inode size > 4, split it into two inodes.*/
1235: /* also map the inode sizes according to the ordering */
1236: PetscMalloc1((n+1),&tmp_vec1);
1237: for (i=0,j=0; i<node_max; ++i,++j) {
1238: if (ns[i] > 4) {
1239: tmp_vec1[j] = 4;
1240: ++j;
1241: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1242: } else {
1243: tmp_vec1[j] = ns[i];
1244: }
1245: }
1246: /* Use the correct node_max */
1247: node_max = j;
1249: /* Now reorder the inode info based on mat re-ordering info */
1250: /* First create a row -> inode_size_array_index map */
1251: PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);
1252: PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);
1253: for (i=0,row=0; i<node_max; i++) {
1254: nodesz = tmp_vec1[i];
1255: for (j=0; j<nodesz; j++,row++) {
1256: nsmap[row] = i;
1257: }
1258: }
1259: /* Using nsmap, create a reordered ns structure */
1260: for (i=0,j=0; i< node_max; i++) {
1261: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1262: tmp_vec2[i] = nodesz;
1263: j += nodesz;
1264: }
1265: PetscFree(nsmap);
1266: PetscFree(tmp_vec1);
1268: /* Now use the correct ns */
1269: ns = tmp_vec2;
1271: do {
1272: sctx.newshift = PETSC_FALSE;
1273: /* Now loop over each block-row, and do the factorization */
1274: for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1275: nodesz = ns[inod];
1277: switch (nodesz) {
1278: case 1:
1279: /*----------*/
1280: /* zero rtmp1 */
1281: /* L part */
1282: nz = bi[i+1] - bi[i];
1283: bjtmp = bj + bi[i];
1284: for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1286: /* U part */
1287: nz = bdiag[i]-bdiag[i+1];
1288: bjtmp = bj + bdiag[i+1]+1;
1289: for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1291: /* load in initial (unfactored row) */
1292: nz = ai[r[i]+1] - ai[r[i]];
1293: ajtmp = aj + ai[r[i]];
1294: v = aa + ai[r[i]];
1295: for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1297: /* ZeropivotApply() */
1298: rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1300: /* elimination */
1301: bjtmp = bj + bi[i];
1302: row = *bjtmp++;
1303: nzL = bi[i+1] - bi[i];
1304: for (k=0; k < nzL; k++) {
1305: pc = rtmp1 + row;
1306: if (*pc != 0.0) {
1307: pv = b->a + bdiag[row];
1308: mul1 = *pc * (*pv);
1309: *pc = mul1;
1310: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1311: pv = b->a + bdiag[row+1]+1;
1312: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1313: for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1314: PetscLogFlops(1+2*nz);
1315: }
1316: row = *bjtmp++;
1317: }
1319: /* finished row so stick it into b->a */
1320: rs = 0.0;
1321: /* L part */
1322: pv = b->a + bi[i];
1323: pj = b->j + bi[i];
1324: nz = bi[i+1] - bi[i];
1325: for (j=0; j<nz; j++) {
1326: pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1327: }
1329: /* U part */
1330: pv = b->a + bdiag[i+1]+1;
1331: pj = b->j + bdiag[i+1]+1;
1332: nz = bdiag[i] - bdiag[i+1]-1;
1333: for (j=0; j<nz; j++) {
1334: pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1335: }
1337: /* Check zero pivot */
1338: sctx.rs = rs;
1339: sctx.pv = rtmp1[i];
1340: MatPivotCheck(A,info,&sctx,i);
1341: if (sctx.newshift) break;
1343: /* Mark diagonal and invert diagonal for simplier triangular solves */
1344: pv = b->a + bdiag[i];
1345: *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1346: break;
1348: case 2:
1349: /*----------*/
1350: /* zero rtmp1 and rtmp2 */
1351: /* L part */
1352: nz = bi[i+1] - bi[i];
1353: bjtmp = bj + bi[i];
1354: for (j=0; j<nz; j++) {
1355: col = bjtmp[j];
1356: rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1357: }
1359: /* U part */
1360: nz = bdiag[i]-bdiag[i+1];
1361: bjtmp = bj + bdiag[i+1]+1;
1362: for (j=0; j<nz; j++) {
1363: col = bjtmp[j];
1364: rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1365: }
1367: /* load in initial (unfactored row) */
1368: nz = ai[r[i]+1] - ai[r[i]];
1369: ajtmp = aj + ai[r[i]];
1370: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1371: for (j=0; j<nz; j++) {
1372: col = ics[ajtmp[j]];
1373: rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1374: }
1375: /* ZeropivotApply(): shift the diagonal of the matrix */
1376: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1378: /* elimination */
1379: bjtmp = bj + bi[i];
1380: row = *bjtmp++; /* pivot row */
1381: nzL = bi[i+1] - bi[i];
1382: for (k=0; k < nzL; k++) {
1383: pc1 = rtmp1 + row;
1384: pc2 = rtmp2 + row;
1385: if (*pc1 != 0.0 || *pc2 != 0.0) {
1386: pv = b->a + bdiag[row];
1387: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1388: *pc1 = mul1; *pc2 = mul2;
1390: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1391: pv = b->a + bdiag[row+1]+1;
1392: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1393: for (j=0; j<nz; j++) {
1394: col = pj[j];
1395: rtmp1[col] -= mul1 * pv[j];
1396: rtmp2[col] -= mul2 * pv[j];
1397: }
1398: PetscLogFlops(2+4*nz);
1399: }
1400: row = *bjtmp++;
1401: }
1403: /* finished row i; check zero pivot, then stick row i into b->a */
1404: rs = 0.0;
1405: /* L part */
1406: pc1 = b->a + bi[i];
1407: pj = b->j + bi[i];
1408: nz = bi[i+1] - bi[i];
1409: for (j=0; j<nz; j++) {
1410: col = pj[j];
1411: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1412: }
1413: /* U part */
1414: pc1 = b->a + bdiag[i+1]+1;
1415: pj = b->j + bdiag[i+1]+1;
1416: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1417: for (j=0; j<nz; j++) {
1418: col = pj[j];
1419: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1420: }
1422: sctx.rs = rs;
1423: sctx.pv = rtmp1[i];
1424: MatPivotCheck(A,info,&sctx,i);
1425: if (sctx.newshift) break;
1426: pc1 = b->a + bdiag[i]; /* Mark diagonal */
1427: *pc1 = 1.0/sctx.pv;
1429: /* Now take care of diagonal 2x2 block. */
1430: pc2 = rtmp2 + i;
1431: if (*pc2 != 0.0) {
1432: mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1433: *pc2 = mul1; /* insert L entry */
1434: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1435: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1436: for (j=0; j<nz; j++) {
1437: col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1438: }
1439: PetscLogFlops(1+2*nz);
1440: }
1442: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1443: rs = 0.0;
1444: /* L part */
1445: pc2 = b->a + bi[i+1];
1446: pj = b->j + bi[i+1];
1447: nz = bi[i+2] - bi[i+1];
1448: for (j=0; j<nz; j++) {
1449: col = pj[j];
1450: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1451: }
1452: /* U part */
1453: pc2 = b->a + bdiag[i+2]+1;
1454: pj = b->j + bdiag[i+2]+1;
1455: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1456: for (j=0; j<nz; j++) {
1457: col = pj[j];
1458: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1459: }
1461: sctx.rs = rs;
1462: sctx.pv = rtmp2[i+1];
1463: MatPivotCheck(A,info,&sctx,i+1);
1464: if (sctx.newshift) break;
1465: pc2 = b->a + bdiag[i+1];
1466: *pc2 = 1.0/sctx.pv;
1467: break;
1469: case 3:
1470: /*----------*/
1471: /* zero rtmp */
1472: /* L part */
1473: nz = bi[i+1] - bi[i];
1474: bjtmp = bj + bi[i];
1475: for (j=0; j<nz; j++) {
1476: col = bjtmp[j];
1477: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1478: }
1480: /* U part */
1481: nz = bdiag[i]-bdiag[i+1];
1482: bjtmp = bj + bdiag[i+1]+1;
1483: for (j=0; j<nz; j++) {
1484: col = bjtmp[j];
1485: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1486: }
1488: /* load in initial (unfactored row) */
1489: nz = ai[r[i]+1] - ai[r[i]];
1490: ajtmp = aj + ai[r[i]];
1491: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1492: for (j=0; j<nz; j++) {
1493: col = ics[ajtmp[j]];
1494: rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1495: }
1496: /* ZeropivotApply(): shift the diagonal of the matrix */
1497: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1499: /* elimination */
1500: bjtmp = bj + bi[i];
1501: row = *bjtmp++; /* pivot row */
1502: nzL = bi[i+1] - bi[i];
1503: for (k=0; k < nzL; k++) {
1504: pc1 = rtmp1 + row;
1505: pc2 = rtmp2 + row;
1506: pc3 = rtmp3 + row;
1507: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1508: pv = b->a + bdiag[row];
1509: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1510: *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1512: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1513: pv = b->a + bdiag[row+1]+1;
1514: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1515: for (j=0; j<nz; j++) {
1516: col = pj[j];
1517: rtmp1[col] -= mul1 * pv[j];
1518: rtmp2[col] -= mul2 * pv[j];
1519: rtmp3[col] -= mul3 * pv[j];
1520: }
1521: PetscLogFlops(3+6*nz);
1522: }
1523: row = *bjtmp++;
1524: }
1526: /* finished row i; check zero pivot, then stick row i into b->a */
1527: rs = 0.0;
1528: /* L part */
1529: pc1 = b->a + bi[i];
1530: pj = b->j + bi[i];
1531: nz = bi[i+1] - bi[i];
1532: for (j=0; j<nz; j++) {
1533: col = pj[j];
1534: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1535: }
1536: /* U part */
1537: pc1 = b->a + bdiag[i+1]+1;
1538: pj = b->j + bdiag[i+1]+1;
1539: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1540: for (j=0; j<nz; j++) {
1541: col = pj[j];
1542: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1543: }
1545: sctx.rs = rs;
1546: sctx.pv = rtmp1[i];
1547: MatPivotCheck(A,info,&sctx,i);
1548: if (sctx.newshift) break;
1549: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1550: *pc1 = 1.0/sctx.pv;
1552: /* Now take care of 1st column of diagonal 3x3 block. */
1553: pc2 = rtmp2 + i;
1554: pc3 = rtmp3 + i;
1555: if (*pc2 != 0.0 || *pc3 != 0.0) {
1556: mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1557: mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1558: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1559: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1560: for (j=0; j<nz; j++) {
1561: col = pj[j];
1562: rtmp2[col] -= mul2 * rtmp1[col];
1563: rtmp3[col] -= mul3 * rtmp1[col];
1564: }
1565: PetscLogFlops(2+4*nz);
1566: }
1568: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1569: rs = 0.0;
1570: /* L part */
1571: pc2 = b->a + bi[i+1];
1572: pj = b->j + bi[i+1];
1573: nz = bi[i+2] - bi[i+1];
1574: for (j=0; j<nz; j++) {
1575: col = pj[j];
1576: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1577: }
1578: /* U part */
1579: pc2 = b->a + bdiag[i+2]+1;
1580: pj = b->j + bdiag[i+2]+1;
1581: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1582: for (j=0; j<nz; j++) {
1583: col = pj[j];
1584: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1585: }
1587: sctx.rs = rs;
1588: sctx.pv = rtmp2[i+1];
1589: MatPivotCheck(A,info,&sctx,i+1);
1590: if (sctx.newshift) break;
1591: pc2 = b->a + bdiag[i+1];
1592: *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1594: /* Now take care of 2nd column of diagonal 3x3 block. */
1595: pc3 = rtmp3 + i+1;
1596: if (*pc3 != 0.0) {
1597: mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1598: pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */
1599: nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1600: for (j=0; j<nz; j++) {
1601: col = pj[j];
1602: rtmp3[col] -= mul3 * rtmp2[col];
1603: }
1604: PetscLogFlops(1+2*nz);
1605: }
1607: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1608: rs = 0.0;
1609: /* L part */
1610: pc3 = b->a + bi[i+2];
1611: pj = b->j + bi[i+2];
1612: nz = bi[i+3] - bi[i+2];
1613: for (j=0; j<nz; j++) {
1614: col = pj[j];
1615: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1616: }
1617: /* U part */
1618: pc3 = b->a + bdiag[i+3]+1;
1619: pj = b->j + bdiag[i+3]+1;
1620: nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1621: for (j=0; j<nz; j++) {
1622: col = pj[j];
1623: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1624: }
1626: sctx.rs = rs;
1627: sctx.pv = rtmp3[i+2];
1628: MatPivotCheck(A,info,&sctx,i+2);
1629: if (sctx.newshift) break;
1630: pc3 = b->a + bdiag[i+2];
1631: *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1632: break;
1633: case 4:
1634: /*----------*/
1635: /* zero rtmp */
1636: /* L part */
1637: nz = bi[i+1] - bi[i];
1638: bjtmp = bj + bi[i];
1639: for (j=0; j<nz; j++) {
1640: col = bjtmp[j];
1641: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1642: }
1644: /* U part */
1645: nz = bdiag[i]-bdiag[i+1];
1646: bjtmp = bj + bdiag[i+1]+1;
1647: for (j=0; j<nz; j++) {
1648: col = bjtmp[j];
1649: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1650: }
1652: /* load in initial (unfactored row) */
1653: nz = ai[r[i]+1] - ai[r[i]];
1654: ajtmp = aj + ai[r[i]];
1655: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1656: for (j=0; j<nz; j++) {
1657: col = ics[ajtmp[j]];
1658: rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1659: }
1660: /* ZeropivotApply(): shift the diagonal of the matrix */
1661: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1663: /* elimination */
1664: bjtmp = bj + bi[i];
1665: row = *bjtmp++; /* pivot row */
1666: nzL = bi[i+1] - bi[i];
1667: for (k=0; k < nzL; k++) {
1668: pc1 = rtmp1 + row;
1669: pc2 = rtmp2 + row;
1670: pc3 = rtmp3 + row;
1671: pc4 = rtmp4 + row;
1672: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1673: pv = b->a + bdiag[row];
1674: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1675: *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1677: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1678: pv = b->a + bdiag[row+1]+1;
1679: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1680: for (j=0; j<nz; j++) {
1681: col = pj[j];
1682: rtmp1[col] -= mul1 * pv[j];
1683: rtmp2[col] -= mul2 * pv[j];
1684: rtmp3[col] -= mul3 * pv[j];
1685: rtmp4[col] -= mul4 * pv[j];
1686: }
1687: PetscLogFlops(4+8*nz);
1688: }
1689: row = *bjtmp++;
1690: }
1692: /* finished row i; check zero pivot, then stick row i into b->a */
1693: rs = 0.0;
1694: /* L part */
1695: pc1 = b->a + bi[i];
1696: pj = b->j + bi[i];
1697: nz = bi[i+1] - bi[i];
1698: for (j=0; j<nz; j++) {
1699: col = pj[j];
1700: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1701: }
1702: /* U part */
1703: pc1 = b->a + bdiag[i+1]+1;
1704: pj = b->j + bdiag[i+1]+1;
1705: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1706: for (j=0; j<nz; j++) {
1707: col = pj[j];
1708: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1709: }
1711: sctx.rs = rs;
1712: sctx.pv = rtmp1[i];
1713: MatPivotCheck(A,info,&sctx,i);
1714: if (sctx.newshift) break;
1715: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1716: *pc1 = 1.0/sctx.pv;
1718: /* Now take care of 1st column of diagonal 4x4 block. */
1719: pc2 = rtmp2 + i;
1720: pc3 = rtmp3 + i;
1721: pc4 = rtmp4 + i;
1722: if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1723: mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1724: mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1725: mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1726: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1727: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1728: for (j=0; j<nz; j++) {
1729: col = pj[j];
1730: rtmp2[col] -= mul2 * rtmp1[col];
1731: rtmp3[col] -= mul3 * rtmp1[col];
1732: rtmp4[col] -= mul4 * rtmp1[col];
1733: }
1734: PetscLogFlops(3+6*nz);
1735: }
1737: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1738: rs = 0.0;
1739: /* L part */
1740: pc2 = b->a + bi[i+1];
1741: pj = b->j + bi[i+1];
1742: nz = bi[i+2] - bi[i+1];
1743: for (j=0; j<nz; j++) {
1744: col = pj[j];
1745: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1746: }
1747: /* U part */
1748: pc2 = b->a + bdiag[i+2]+1;
1749: pj = b->j + bdiag[i+2]+1;
1750: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1751: for (j=0; j<nz; j++) {
1752: col = pj[j];
1753: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1754: }
1756: sctx.rs = rs;
1757: sctx.pv = rtmp2[i+1];
1758: MatPivotCheck(A,info,&sctx,i+1);
1759: if (sctx.newshift) break;
1760: pc2 = b->a + bdiag[i+1];
1761: *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1763: /* Now take care of 2nd column of diagonal 4x4 block. */
1764: pc3 = rtmp3 + i+1;
1765: pc4 = rtmp4 + i+1;
1766: if (*pc3 != 0.0 || *pc4 != 0.0) {
1767: mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1768: mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1769: pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */
1770: nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1771: for (j=0; j<nz; j++) {
1772: col = pj[j];
1773: rtmp3[col] -= mul3 * rtmp2[col];
1774: rtmp4[col] -= mul4 * rtmp2[col];
1775: }
1776: PetscLogFlops(4*nz);
1777: }
1779: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1780: rs = 0.0;
1781: /* L part */
1782: pc3 = b->a + bi[i+2];
1783: pj = b->j + bi[i+2];
1784: nz = bi[i+3] - bi[i+2];
1785: for (j=0; j<nz; j++) {
1786: col = pj[j];
1787: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1788: }
1789: /* U part */
1790: pc3 = b->a + bdiag[i+3]+1;
1791: pj = b->j + bdiag[i+3]+1;
1792: nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1793: for (j=0; j<nz; j++) {
1794: col = pj[j];
1795: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1796: }
1798: sctx.rs = rs;
1799: sctx.pv = rtmp3[i+2];
1800: MatPivotCheck(A,info,&sctx,i+2);
1801: if (sctx.newshift) break;
1802: pc3 = b->a + bdiag[i+2];
1803: *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1805: /* Now take care of 3rd column of diagonal 4x4 block. */
1806: pc4 = rtmp4 + i+2;
1807: if (*pc4 != 0.0) {
1808: mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1809: pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */
1810: nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1811: for (j=0; j<nz; j++) {
1812: col = pj[j];
1813: rtmp4[col] -= mul4 * rtmp3[col];
1814: }
1815: PetscLogFlops(1+2*nz);
1816: }
1818: /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1819: rs = 0.0;
1820: /* L part */
1821: pc4 = b->a + bi[i+3];
1822: pj = b->j + bi[i+3];
1823: nz = bi[i+4] - bi[i+3];
1824: for (j=0; j<nz; j++) {
1825: col = pj[j];
1826: pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1827: }
1828: /* U part */
1829: pc4 = b->a + bdiag[i+4]+1;
1830: pj = b->j + bdiag[i+4]+1;
1831: nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1832: for (j=0; j<nz; j++) {
1833: col = pj[j];
1834: pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1835: }
1837: sctx.rs = rs;
1838: sctx.pv = rtmp4[i+3];
1839: MatPivotCheck(A,info,&sctx,i+3);
1840: if (sctx.newshift) break;
1841: pc4 = b->a + bdiag[i+3];
1842: *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1843: break;
1845: default:
1846: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1847: }
1848: if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1849: i += nodesz; /* Update the row */
1850: }
1852: /* MatPivotRefine() */
1853: if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1854: /*
1855: * if no shift in this attempt & shifting & started shifting & can refine,
1856: * then try lower shift
1857: */
1858: sctx.shift_hi = sctx.shift_fraction;
1859: sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1860: sctx.shift_amount = sctx.shift_fraction * sctx.shift_top;
1861: sctx.newshift = PETSC_TRUE;
1862: sctx.nshift++;
1863: }
1864: } while (sctx.newshift);
1866: PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);
1867: PetscFree(tmp_vec2);
1868: ISRestoreIndices(isicol,&ic);
1869: ISRestoreIndices(isrow,&r);
1871: if (b->inode.size) {
1872: C->ops->solve = MatSolve_SeqAIJ_Inode;
1873: } else {
1874: C->ops->solve = MatSolve_SeqAIJ;
1875: }
1876: C->ops->solveadd = MatSolveAdd_SeqAIJ;
1877: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ;
1878: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1879: C->ops->matsolve = MatMatSolve_SeqAIJ;
1880: C->assembled = PETSC_TRUE;
1881: C->preallocated = PETSC_TRUE;
1883: PetscLogFlops(C->cmap->n);
1885: /* MatShiftView(A,info,&sctx) */
1886: if (sctx.nshift) {
1887: if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1888: PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
1889: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1890: PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
1891: } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1892: PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);
1893: }
1894: }
1895: return(0);
1896: }
1900: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1901: {
1902: Mat C = B;
1903: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1904: IS iscol = b->col,isrow = b->row,isicol = b->icol;
1905: PetscErrorCode ierr;
1906: const PetscInt *r,*ic,*c,*ics;
1907: PetscInt n = A->rmap->n,*bi = b->i;
1908: PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1909: PetscInt i,j,idx,*bd = b->diag,node_max,nodesz;
1910: PetscInt *ai = a->i,*aj = a->j;
1911: PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1912: PetscScalar mul1,mul2,mul3,tmp;
1913: MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1914: const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1915: PetscReal rs=0.0;
1916: FactorShiftCtx sctx;
1919: sctx.shift_top = 0;
1920: sctx.nshift_max = 0;
1921: sctx.shift_lo = 0;
1922: sctx.shift_hi = 0;
1923: sctx.shift_fraction = 0;
1925: /* if both shift schemes are chosen by user, only use info->shiftpd */
1926: if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1927: sctx.shift_top = 0;
1928: for (i=0; i<n; i++) {
1929: /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1930: rs = 0.0;
1931: ajtmp = aj + ai[i];
1932: rtmp1 = aa + ai[i];
1933: nz = ai[i+1] - ai[i];
1934: for (j=0; j<nz; j++) {
1935: if (*ajtmp != i) {
1936: rs += PetscAbsScalar(*rtmp1++);
1937: } else {
1938: rs -= PetscRealPart(*rtmp1++);
1939: }
1940: ajtmp++;
1941: }
1942: if (rs>sctx.shift_top) sctx.shift_top = rs;
1943: }
1944: if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1945: sctx.shift_top *= 1.1;
1946: sctx.nshift_max = 5;
1947: sctx.shift_lo = 0.;
1948: sctx.shift_hi = 1.;
1949: }
1950: sctx.shift_amount = 0;
1951: sctx.nshift = 0;
1953: ISGetIndices(isrow,&r);
1954: ISGetIndices(iscol,&c);
1955: ISGetIndices(isicol,&ic);
1956: PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);
1957: ics = ic;
1959: node_max = a->inode.node_count;
1960: ns = a->inode.size;
1961: if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1963: /* If max inode size > 3, split it into two inodes.*/
1964: /* also map the inode sizes according to the ordering */
1965: PetscMalloc1((n+1),&tmp_vec1);
1966: for (i=0,j=0; i<node_max; ++i,++j) {
1967: if (ns[i]>3) {
1968: tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */
1969: ++j;
1970: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1971: } else {
1972: tmp_vec1[j] = ns[i];
1973: }
1974: }
1975: /* Use the correct node_max */
1976: node_max = j;
1978: /* Now reorder the inode info based on mat re-ordering info */
1979: /* First create a row -> inode_size_array_index map */
1980: PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);
1981: PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);
1982: for (i=0,row=0; i<node_max; i++) {
1983: nodesz = tmp_vec1[i];
1984: for (j=0; j<nodesz; j++,row++) {
1985: nsmap[row] = i;
1986: }
1987: }
1988: /* Using nsmap, create a reordered ns structure */
1989: for (i=0,j=0; i< node_max; i++) {
1990: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1991: tmp_vec2[i] = nodesz;
1992: j += nodesz;
1993: }
1994: PetscFree(nsmap);
1995: PetscFree(tmp_vec1);
1996: /* Now use the correct ns */
1997: ns = tmp_vec2;
1999: do {
2000: sctx.newshift = PETSC_FALSE;
2001: /* Now loop over each block-row, and do the factorization */
2002: for (i=0,row=0; i<node_max; i++) {
2003: nodesz = ns[i];
2004: nz = bi[row+1] - bi[row];
2005: bjtmp = bj + bi[row];
2007: switch (nodesz) {
2008: case 1:
2009: for (j=0; j<nz; j++) {
2010: idx = bjtmp[j];
2011: rtmp11[idx] = 0.0;
2012: }
2014: /* load in initial (unfactored row) */
2015: idx = r[row];
2016: nz_tmp = ai[idx+1] - ai[idx];
2017: ajtmp = aj + ai[idx];
2018: v1 = aa + ai[idx];
2020: for (j=0; j<nz_tmp; j++) {
2021: idx = ics[ajtmp[j]];
2022: rtmp11[idx] = v1[j];
2023: }
2024: rtmp11[ics[r[row]]] += sctx.shift_amount;
2026: prow = *bjtmp++;
2027: while (prow < row) {
2028: pc1 = rtmp11 + prow;
2029: if (*pc1 != 0.0) {
2030: pv = ba + bd[prow];
2031: pj = nbj + bd[prow];
2032: mul1 = *pc1 * *pv++;
2033: *pc1 = mul1;
2034: nz_tmp = bi[prow+1] - bd[prow] - 1;
2035: PetscLogFlops(1+2*nz_tmp);
2036: for (j=0; j<nz_tmp; j++) {
2037: tmp = pv[j];
2038: idx = pj[j];
2039: rtmp11[idx] -= mul1 * tmp;
2040: }
2041: }
2042: prow = *bjtmp++;
2043: }
2044: pj = bj + bi[row];
2045: pc1 = ba + bi[row];
2047: sctx.pv = rtmp11[row];
2048: rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2049: rs = 0.0;
2050: for (j=0; j<nz; j++) {
2051: idx = pj[j];
2052: pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2053: if (idx != row) rs += PetscAbsScalar(pc1[j]);
2054: }
2055: sctx.rs = rs;
2056: MatPivotCheck(A,info,&sctx,row);
2057: if (sctx.newshift) goto endofwhile;
2058: break;
2060: case 2:
2061: for (j=0; j<nz; j++) {
2062: idx = bjtmp[j];
2063: rtmp11[idx] = 0.0;
2064: rtmp22[idx] = 0.0;
2065: }
2067: /* load in initial (unfactored row) */
2068: idx = r[row];
2069: nz_tmp = ai[idx+1] - ai[idx];
2070: ajtmp = aj + ai[idx];
2071: v1 = aa + ai[idx];
2072: v2 = aa + ai[idx+1];
2073: for (j=0; j<nz_tmp; j++) {
2074: idx = ics[ajtmp[j]];
2075: rtmp11[idx] = v1[j];
2076: rtmp22[idx] = v2[j];
2077: }
2078: rtmp11[ics[r[row]]] += sctx.shift_amount;
2079: rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2081: prow = *bjtmp++;
2082: while (prow < row) {
2083: pc1 = rtmp11 + prow;
2084: pc2 = rtmp22 + prow;
2085: if (*pc1 != 0.0 || *pc2 != 0.0) {
2086: pv = ba + bd[prow];
2087: pj = nbj + bd[prow];
2088: mul1 = *pc1 * *pv;
2089: mul2 = *pc2 * *pv;
2090: ++pv;
2091: *pc1 = mul1;
2092: *pc2 = mul2;
2094: nz_tmp = bi[prow+1] - bd[prow] - 1;
2095: for (j=0; j<nz_tmp; j++) {
2096: tmp = pv[j];
2097: idx = pj[j];
2098: rtmp11[idx] -= mul1 * tmp;
2099: rtmp22[idx] -= mul2 * tmp;
2100: }
2101: PetscLogFlops(2+4*nz_tmp);
2102: }
2103: prow = *bjtmp++;
2104: }
2106: /* Now take care of diagonal 2x2 block. Note: prow = row here */
2107: pc1 = rtmp11 + prow;
2108: pc2 = rtmp22 + prow;
2110: sctx.pv = *pc1;
2111: pj = bj + bi[prow];
2112: rs = 0.0;
2113: for (j=0; j<nz; j++) {
2114: idx = pj[j];
2115: if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2116: }
2117: sctx.rs = rs;
2118: MatPivotCheck(A,info,&sctx,row);
2119: if (sctx.newshift) goto endofwhile;
2121: if (*pc2 != 0.0) {
2122: pj = nbj + bd[prow];
2123: mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2124: *pc2 = mul2;
2125: nz_tmp = bi[prow+1] - bd[prow] - 1;
2126: for (j=0; j<nz_tmp; j++) {
2127: idx = pj[j];
2128: tmp = rtmp11[idx];
2129: rtmp22[idx] -= mul2 * tmp;
2130: }
2131: PetscLogFlops(1+2*nz_tmp);
2132: }
2134: pj = bj + bi[row];
2135: pc1 = ba + bi[row];
2136: pc2 = ba + bi[row+1];
2138: sctx.pv = rtmp22[row+1];
2139: rs = 0.0;
2140: rtmp11[row] = 1.0/rtmp11[row];
2141: rtmp22[row+1] = 1.0/rtmp22[row+1];
2142: /* copy row entries from dense representation to sparse */
2143: for (j=0; j<nz; j++) {
2144: idx = pj[j];
2145: pc1[j] = rtmp11[idx];
2146: pc2[j] = rtmp22[idx];
2147: if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2148: }
2149: sctx.rs = rs;
2150: MatPivotCheck(A,info,&sctx,row+1);
2151: if (sctx.newshift) goto endofwhile;
2152: break;
2154: case 3:
2155: for (j=0; j<nz; j++) {
2156: idx = bjtmp[j];
2157: rtmp11[idx] = 0.0;
2158: rtmp22[idx] = 0.0;
2159: rtmp33[idx] = 0.0;
2160: }
2161: /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2162: idx = r[row];
2163: nz_tmp = ai[idx+1] - ai[idx];
2164: ajtmp = aj + ai[idx];
2165: v1 = aa + ai[idx];
2166: v2 = aa + ai[idx+1];
2167: v3 = aa + ai[idx+2];
2168: for (j=0; j<nz_tmp; j++) {
2169: idx = ics[ajtmp[j]];
2170: rtmp11[idx] = v1[j];
2171: rtmp22[idx] = v2[j];
2172: rtmp33[idx] = v3[j];
2173: }
2174: rtmp11[ics[r[row]]] += sctx.shift_amount;
2175: rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2176: rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2178: /* loop over all pivot row blocks above this row block */
2179: prow = *bjtmp++;
2180: while (prow < row) {
2181: pc1 = rtmp11 + prow;
2182: pc2 = rtmp22 + prow;
2183: pc3 = rtmp33 + prow;
2184: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2185: pv = ba + bd[prow];
2186: pj = nbj + bd[prow];
2187: mul1 = *pc1 * *pv;
2188: mul2 = *pc2 * *pv;
2189: mul3 = *pc3 * *pv;
2190: ++pv;
2191: *pc1 = mul1;
2192: *pc2 = mul2;
2193: *pc3 = mul3;
2195: nz_tmp = bi[prow+1] - bd[prow] - 1;
2196: /* update this row based on pivot row */
2197: for (j=0; j<nz_tmp; j++) {
2198: tmp = pv[j];
2199: idx = pj[j];
2200: rtmp11[idx] -= mul1 * tmp;
2201: rtmp22[idx] -= mul2 * tmp;
2202: rtmp33[idx] -= mul3 * tmp;
2203: }
2204: PetscLogFlops(3+6*nz_tmp);
2205: }
2206: prow = *bjtmp++;
2207: }
2209: /* Now take care of diagonal 3x3 block in this set of rows */
2210: /* note: prow = row here */
2211: pc1 = rtmp11 + prow;
2212: pc2 = rtmp22 + prow;
2213: pc3 = rtmp33 + prow;
2215: sctx.pv = *pc1;
2216: pj = bj + bi[prow];
2217: rs = 0.0;
2218: for (j=0; j<nz; j++) {
2219: idx = pj[j];
2220: if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2221: }
2222: sctx.rs = rs;
2223: MatPivotCheck(A,info,&sctx,row);
2224: if (sctx.newshift) goto endofwhile;
2226: if (*pc2 != 0.0 || *pc3 != 0.0) {
2227: mul2 = (*pc2)/(*pc1);
2228: mul3 = (*pc3)/(*pc1);
2229: *pc2 = mul2;
2230: *pc3 = mul3;
2231: nz_tmp = bi[prow+1] - bd[prow] - 1;
2232: pj = nbj + bd[prow];
2233: for (j=0; j<nz_tmp; j++) {
2234: idx = pj[j];
2235: tmp = rtmp11[idx];
2236: rtmp22[idx] -= mul2 * tmp;
2237: rtmp33[idx] -= mul3 * tmp;
2238: }
2239: PetscLogFlops(2+4*nz_tmp);
2240: }
2241: ++prow;
2243: pc2 = rtmp22 + prow;
2244: pc3 = rtmp33 + prow;
2245: sctx.pv = *pc2;
2246: pj = bj + bi[prow];
2247: rs = 0.0;
2248: for (j=0; j<nz; j++) {
2249: idx = pj[j];
2250: if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2251: }
2252: sctx.rs = rs;
2253: MatPivotCheck(A,info,&sctx,row+1);
2254: if (sctx.newshift) goto endofwhile;
2256: if (*pc3 != 0.0) {
2257: mul3 = (*pc3)/(*pc2);
2258: *pc3 = mul3;
2259: pj = nbj + bd[prow];
2260: nz_tmp = bi[prow+1] - bd[prow] - 1;
2261: for (j=0; j<nz_tmp; j++) {
2262: idx = pj[j];
2263: tmp = rtmp22[idx];
2264: rtmp33[idx] -= mul3 * tmp;
2265: }
2266: PetscLogFlops(1+2*nz_tmp);
2267: }
2269: pj = bj + bi[row];
2270: pc1 = ba + bi[row];
2271: pc2 = ba + bi[row+1];
2272: pc3 = ba + bi[row+2];
2274: sctx.pv = rtmp33[row+2];
2275: rs = 0.0;
2276: rtmp11[row] = 1.0/rtmp11[row];
2277: rtmp22[row+1] = 1.0/rtmp22[row+1];
2278: rtmp33[row+2] = 1.0/rtmp33[row+2];
2279: /* copy row entries from dense representation to sparse */
2280: for (j=0; j<nz; j++) {
2281: idx = pj[j];
2282: pc1[j] = rtmp11[idx];
2283: pc2[j] = rtmp22[idx];
2284: pc3[j] = rtmp33[idx];
2285: if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2286: }
2288: sctx.rs = rs;
2289: MatPivotCheck(A,info,&sctx,row+2);
2290: if (sctx.newshift) goto endofwhile;
2291: break;
2293: default:
2294: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2295: }
2296: row += nodesz; /* Update the row */
2297: }
2298: endofwhile:;
2299: } while (sctx.newshift);
2300: PetscFree3(rtmp11,rtmp22,rtmp33);
2301: PetscFree(tmp_vec2);
2302: ISRestoreIndices(isicol,&ic);
2303: ISRestoreIndices(isrow,&r);
2304: ISRestoreIndices(iscol,&c);
2306: (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2307: /* do not set solve add, since MatSolve_Inode + Add is faster */
2308: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace;
2309: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2310: C->assembled = PETSC_TRUE;
2311: C->preallocated = PETSC_TRUE;
2312: if (sctx.nshift) {
2313: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2314: PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
2315: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2316: PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
2317: }
2318: }
2319: PetscLogFlops(C->cmap->n);
2320: Mat_CheckInode(C,PETSC_FALSE);
2321: return(0);
2322: }
2325: /* ----------------------------------------------------------- */
2328: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2329: {
2330: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2331: IS iscol = a->col,isrow = a->row;
2332: PetscErrorCode ierr;
2333: const PetscInt *r,*c,*rout,*cout;
2334: PetscInt i,j,n = A->rmap->n;
2335: PetscInt node_max,row,nsz,aii,i0,i1,nz;
2336: const PetscInt *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2337: PetscScalar *x,*tmp,*tmps,tmp0,tmp1;
2338: PetscScalar sum1,sum2,sum3,sum4,sum5;
2339: const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2340: const PetscScalar *b;
2343: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2344: node_max = a->inode.node_count;
2345: ns = a->inode.size; /* Node Size array */
2347: VecGetArrayRead(bb,&b);
2348: VecGetArray(xx,&x);
2349: tmp = a->solve_work;
2351: ISGetIndices(isrow,&rout); r = rout;
2352: ISGetIndices(iscol,&cout); c = cout;
2354: /* forward solve the lower triangular */
2355: tmps = tmp;
2356: aa = a_a;
2357: aj = a_j;
2358: ad = a->diag;
2360: for (i = 0,row = 0; i< node_max; ++i) {
2361: nsz = ns[i];
2362: aii = ai[row];
2363: v1 = aa + aii;
2364: vi = aj + aii;
2365: nz = ai[row+1]- ai[row];
2367: if (i < node_max-1) {
2368: /* Prefetch the indices for the next block */
2369: PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2370: /* Prefetch the data for the next block */
2371: PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2372: }
2374: switch (nsz) { /* Each loop in 'case' is unrolled */
2375: case 1:
2376: sum1 = b[r[row]];
2377: for (j=0; j<nz-1; j+=2) {
2378: i0 = vi[j];
2379: i1 = vi[j+1];
2380: tmp0 = tmps[i0];
2381: tmp1 = tmps[i1];
2382: sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2383: }
2384: if (j == nz-1) {
2385: tmp0 = tmps[vi[j]];
2386: sum1 -= v1[j]*tmp0;
2387: }
2388: tmp[row++]=sum1;
2389: break;
2390: case 2:
2391: sum1 = b[r[row]];
2392: sum2 = b[r[row+1]];
2393: v2 = aa + ai[row+1];
2395: for (j=0; j<nz-1; j+=2) {
2396: i0 = vi[j];
2397: i1 = vi[j+1];
2398: tmp0 = tmps[i0];
2399: tmp1 = tmps[i1];
2400: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2401: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2402: }
2403: if (j == nz-1) {
2404: tmp0 = tmps[vi[j]];
2405: sum1 -= v1[j] *tmp0;
2406: sum2 -= v2[j] *tmp0;
2407: }
2408: sum2 -= v2[nz] * sum1;
2409: tmp[row++]=sum1;
2410: tmp[row++]=sum2;
2411: break;
2412: case 3:
2413: sum1 = b[r[row]];
2414: sum2 = b[r[row+1]];
2415: sum3 = b[r[row+2]];
2416: v2 = aa + ai[row+1];
2417: v3 = aa + ai[row+2];
2419: for (j=0; j<nz-1; j+=2) {
2420: i0 = vi[j];
2421: i1 = vi[j+1];
2422: tmp0 = tmps[i0];
2423: tmp1 = tmps[i1];
2424: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2425: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2426: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2427: }
2428: if (j == nz-1) {
2429: tmp0 = tmps[vi[j]];
2430: sum1 -= v1[j] *tmp0;
2431: sum2 -= v2[j] *tmp0;
2432: sum3 -= v3[j] *tmp0;
2433: }
2434: sum2 -= v2[nz] * sum1;
2435: sum3 -= v3[nz] * sum1;
2436: sum3 -= v3[nz+1] * sum2;
2437: tmp[row++]=sum1;
2438: tmp[row++]=sum2;
2439: tmp[row++]=sum3;
2440: break;
2442: case 4:
2443: sum1 = b[r[row]];
2444: sum2 = b[r[row+1]];
2445: sum3 = b[r[row+2]];
2446: sum4 = b[r[row+3]];
2447: v2 = aa + ai[row+1];
2448: v3 = aa + ai[row+2];
2449: v4 = aa + ai[row+3];
2451: for (j=0; j<nz-1; j+=2) {
2452: i0 = vi[j];
2453: i1 = vi[j+1];
2454: tmp0 = tmps[i0];
2455: tmp1 = tmps[i1];
2456: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2457: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2458: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2459: sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2460: }
2461: if (j == nz-1) {
2462: tmp0 = tmps[vi[j]];
2463: sum1 -= v1[j] *tmp0;
2464: sum2 -= v2[j] *tmp0;
2465: sum3 -= v3[j] *tmp0;
2466: sum4 -= v4[j] *tmp0;
2467: }
2468: sum2 -= v2[nz] * sum1;
2469: sum3 -= v3[nz] * sum1;
2470: sum4 -= v4[nz] * sum1;
2471: sum3 -= v3[nz+1] * sum2;
2472: sum4 -= v4[nz+1] * sum2;
2473: sum4 -= v4[nz+2] * sum3;
2475: tmp[row++]=sum1;
2476: tmp[row++]=sum2;
2477: tmp[row++]=sum3;
2478: tmp[row++]=sum4;
2479: break;
2480: case 5:
2481: sum1 = b[r[row]];
2482: sum2 = b[r[row+1]];
2483: sum3 = b[r[row+2]];
2484: sum4 = b[r[row+3]];
2485: sum5 = b[r[row+4]];
2486: v2 = aa + ai[row+1];
2487: v3 = aa + ai[row+2];
2488: v4 = aa + ai[row+3];
2489: v5 = aa + ai[row+4];
2491: for (j=0; j<nz-1; j+=2) {
2492: i0 = vi[j];
2493: i1 = vi[j+1];
2494: tmp0 = tmps[i0];
2495: tmp1 = tmps[i1];
2496: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2497: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2498: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2499: sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2500: sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2501: }
2502: if (j == nz-1) {
2503: tmp0 = tmps[vi[j]];
2504: sum1 -= v1[j] *tmp0;
2505: sum2 -= v2[j] *tmp0;
2506: sum3 -= v3[j] *tmp0;
2507: sum4 -= v4[j] *tmp0;
2508: sum5 -= v5[j] *tmp0;
2509: }
2511: sum2 -= v2[nz] * sum1;
2512: sum3 -= v3[nz] * sum1;
2513: sum4 -= v4[nz] * sum1;
2514: sum5 -= v5[nz] * sum1;
2515: sum3 -= v3[nz+1] * sum2;
2516: sum4 -= v4[nz+1] * sum2;
2517: sum5 -= v5[nz+1] * sum2;
2518: sum4 -= v4[nz+2] * sum3;
2519: sum5 -= v5[nz+2] * sum3;
2520: sum5 -= v5[nz+3] * sum4;
2522: tmp[row++]=sum1;
2523: tmp[row++]=sum2;
2524: tmp[row++]=sum3;
2525: tmp[row++]=sum4;
2526: tmp[row++]=sum5;
2527: break;
2528: default:
2529: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2530: }
2531: }
2532: /* backward solve the upper triangular */
2533: for (i=node_max -1,row = n-1; i>=0; i--) {
2534: nsz = ns[i];
2535: aii = ad[row+1] + 1;
2536: v1 = aa + aii;
2537: vi = aj + aii;
2538: nz = ad[row]- ad[row+1] - 1;
2540: if (i > 0) {
2541: /* Prefetch the indices for the next block */
2542: PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2543: /* Prefetch the data for the next block */
2544: PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2545: }
2547: switch (nsz) { /* Each loop in 'case' is unrolled */
2548: case 1:
2549: sum1 = tmp[row];
2551: for (j=0; j<nz-1; j+=2) {
2552: i0 = vi[j];
2553: i1 = vi[j+1];
2554: tmp0 = tmps[i0];
2555: tmp1 = tmps[i1];
2556: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2557: }
2558: if (j == nz-1) {
2559: tmp0 = tmps[vi[j]];
2560: sum1 -= v1[j]*tmp0;
2561: }
2562: x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2563: break;
2564: case 2:
2565: sum1 = tmp[row];
2566: sum2 = tmp[row-1];
2567: v2 = aa + ad[row] + 1;
2568: for (j=0; j<nz-1; j+=2) {
2569: i0 = vi[j];
2570: i1 = vi[j+1];
2571: tmp0 = tmps[i0];
2572: tmp1 = tmps[i1];
2573: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2574: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2575: }
2576: if (j == nz-1) {
2577: tmp0 = tmps[vi[j]];
2578: sum1 -= v1[j]* tmp0;
2579: sum2 -= v2[j+1]* tmp0;
2580: }
2582: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2583: sum2 -= v2[0] * tmp0;
2584: x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2585: break;
2586: case 3:
2587: sum1 = tmp[row];
2588: sum2 = tmp[row -1];
2589: sum3 = tmp[row -2];
2590: v2 = aa + ad[row] + 1;
2591: v3 = aa + ad[row -1] + 1;
2592: for (j=0; j<nz-1; j+=2) {
2593: i0 = vi[j];
2594: i1 = vi[j+1];
2595: tmp0 = tmps[i0];
2596: tmp1 = tmps[i1];
2597: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2598: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2599: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2600: }
2601: if (j== nz-1) {
2602: tmp0 = tmps[vi[j]];
2603: sum1 -= v1[j] * tmp0;
2604: sum2 -= v2[j+1] * tmp0;
2605: sum3 -= v3[j+2] * tmp0;
2606: }
2607: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2608: sum2 -= v2[0]* tmp0;
2609: sum3 -= v3[1] * tmp0;
2610: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2611: sum3 -= v3[0]* tmp0;
2612: x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2614: break;
2615: case 4:
2616: sum1 = tmp[row];
2617: sum2 = tmp[row -1];
2618: sum3 = tmp[row -2];
2619: sum4 = tmp[row -3];
2620: v2 = aa + ad[row]+1;
2621: v3 = aa + ad[row -1]+1;
2622: v4 = aa + ad[row -2]+1;
2624: for (j=0; j<nz-1; j+=2) {
2625: i0 = vi[j];
2626: i1 = vi[j+1];
2627: tmp0 = tmps[i0];
2628: tmp1 = tmps[i1];
2629: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2630: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2631: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2632: sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2633: }
2634: if (j== nz-1) {
2635: tmp0 = tmps[vi[j]];
2636: sum1 -= v1[j] * tmp0;
2637: sum2 -= v2[j+1] * tmp0;
2638: sum3 -= v3[j+2] * tmp0;
2639: sum4 -= v4[j+3] * tmp0;
2640: }
2642: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2643: sum2 -= v2[0] * tmp0;
2644: sum3 -= v3[1] * tmp0;
2645: sum4 -= v4[2] * tmp0;
2646: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2647: sum3 -= v3[0] * tmp0;
2648: sum4 -= v4[1] * tmp0;
2649: tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2650: sum4 -= v4[0] * tmp0;
2651: x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2652: break;
2653: case 5:
2654: sum1 = tmp[row];
2655: sum2 = tmp[row -1];
2656: sum3 = tmp[row -2];
2657: sum4 = tmp[row -3];
2658: sum5 = tmp[row -4];
2659: v2 = aa + ad[row]+1;
2660: v3 = aa + ad[row -1]+1;
2661: v4 = aa + ad[row -2]+1;
2662: v5 = aa + ad[row -3]+1;
2663: for (j=0; j<nz-1; j+=2) {
2664: i0 = vi[j];
2665: i1 = vi[j+1];
2666: tmp0 = tmps[i0];
2667: tmp1 = tmps[i1];
2668: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2669: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2670: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2671: sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2672: sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2673: }
2674: if (j==nz-1) {
2675: tmp0 = tmps[vi[j]];
2676: sum1 -= v1[j] * tmp0;
2677: sum2 -= v2[j+1] * tmp0;
2678: sum3 -= v3[j+2] * tmp0;
2679: sum4 -= v4[j+3] * tmp0;
2680: sum5 -= v5[j+4] * tmp0;
2681: }
2683: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2684: sum2 -= v2[0] * tmp0;
2685: sum3 -= v3[1] * tmp0;
2686: sum4 -= v4[2] * tmp0;
2687: sum5 -= v5[3] * tmp0;
2688: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2689: sum3 -= v3[0] * tmp0;
2690: sum4 -= v4[1] * tmp0;
2691: sum5 -= v5[2] * tmp0;
2692: tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2693: sum4 -= v4[0] * tmp0;
2694: sum5 -= v5[1] * tmp0;
2695: tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2696: sum5 -= v5[0] * tmp0;
2697: x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2698: break;
2699: default:
2700: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2701: }
2702: }
2703: ISRestoreIndices(isrow,&rout);
2704: ISRestoreIndices(iscol,&cout);
2705: VecRestoreArrayRead(bb,&b);
2706: VecRestoreArray(xx,&x);
2707: PetscLogFlops(2.0*a->nz - A->cmap->n);
2708: return(0);
2709: }
2712: /*
2713: Makes a longer coloring[] array and calls the usual code with that
2714: */
2717: PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2718: {
2719: Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data;
2720: PetscErrorCode ierr;
2721: PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2722: PetscInt *colorused,i;
2723: ISColoringValue *newcolor;
2726: PetscMalloc1((n+1),&newcolor);
2727: /* loop over inodes, marking a color for each column*/
2728: row = 0;
2729: for (i=0; i<m; i++) {
2730: for (j=0; j<ns[i]; j++) {
2731: newcolor[row++] = coloring[i] + j*ncolors;
2732: }
2733: }
2735: /* eliminate unneeded colors */
2736: PetscCalloc1(5*ncolors,&colorused);
2737: for (i=0; i<n; i++) {
2738: colorused[newcolor[i]] = 1;
2739: }
2741: for (i=1; i<5*ncolors; i++) {
2742: colorused[i] += colorused[i-1];
2743: }
2744: ncolors = colorused[5*ncolors-1];
2745: for (i=0; i<n; i++) {
2746: newcolor[i] = colorused[newcolor[i]]-1;
2747: }
2748: PetscFree(colorused);
2749: ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,iscoloring);
2750: PetscFree(coloring);
2751: return(0);
2752: }
2754: #include <petsc-private/kernels/blockinvert.h>
2758: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2759: {
2760: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2761: PetscScalar sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2762: MatScalar *ibdiag,*bdiag,work[25],*t;
2763: PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2764: const MatScalar *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2765: const PetscScalar *xb, *b;
2766: PetscReal zeropivot = 1.0e-15, shift = 0.0;
2767: PetscErrorCode ierr;
2768: PetscInt n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2769: PetscInt sz,k,ipvt[5];
2770: const PetscInt *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;
2773: if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2774: if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2776: if (!a->inode.ibdiagvalid) {
2777: if (!a->inode.ibdiag) {
2778: /* calculate space needed for diagonal blocks */
2779: for (i=0; i<m; i++) {
2780: cnt += sizes[i]*sizes[i];
2781: }
2782: a->inode.bdiagsize = cnt;
2784: PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);
2785: }
2787: /* copy over the diagonal blocks and invert them */
2788: ibdiag = a->inode.ibdiag;
2789: bdiag = a->inode.bdiag;
2790: cnt = 0;
2791: for (i=0, row = 0; i<m; i++) {
2792: for (j=0; j<sizes[i]; j++) {
2793: for (k=0; k<sizes[i]; k++) {
2794: bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2795: }
2796: }
2797: PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));
2799: switch (sizes[i]) {
2800: case 1:
2801: /* Create matrix data structure */
2802: if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2803: ibdiag[cnt] = 1.0/ibdiag[cnt];
2804: break;
2805: case 2:
2806: PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift);
2807: break;
2808: case 3:
2809: PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift);
2810: break;
2811: case 4:
2812: PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift);
2813: break;
2814: case 5:
2815: PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);
2816: break;
2817: default:
2818: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2819: }
2820: cnt += sizes[i]*sizes[i];
2821: row += sizes[i];
2822: }
2823: a->inode.ibdiagvalid = PETSC_TRUE;
2824: }
2825: ibdiag = a->inode.ibdiag;
2826: bdiag = a->inode.bdiag;
2827: t = a->inode.ssor_work;
2829: VecGetArray(xx,&x);
2830: VecGetArrayRead(bb,&b);
2831: /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2832: if (flag & SOR_ZERO_INITIAL_GUESS) {
2833: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2835: for (i=0, row=0; i<m; i++) {
2836: sz = diag[row] - ii[row];
2837: v1 = a->a + ii[row];
2838: idx = a->j + ii[row];
2840: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2841: switch (sizes[i]) {
2842: case 1:
2844: sum1 = b[row];
2845: for (n = 0; n<sz-1; n+=2) {
2846: i1 = idx[0];
2847: i2 = idx[1];
2848: idx += 2;
2849: tmp0 = x[i1];
2850: tmp1 = x[i2];
2851: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2852: }
2854: if (n == sz-1) {
2855: tmp0 = x[*idx];
2856: sum1 -= *v1 * tmp0;
2857: }
2858: t[row] = sum1;
2859: x[row++] = sum1*(*ibdiag++);
2860: break;
2861: case 2:
2862: v2 = a->a + ii[row+1];
2863: sum1 = b[row];
2864: sum2 = b[row+1];
2865: for (n = 0; n<sz-1; n+=2) {
2866: i1 = idx[0];
2867: i2 = idx[1];
2868: idx += 2;
2869: tmp0 = x[i1];
2870: tmp1 = x[i2];
2871: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2872: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2873: }
2875: if (n == sz-1) {
2876: tmp0 = x[*idx];
2877: sum1 -= v1[0] * tmp0;
2878: sum2 -= v2[0] * tmp0;
2879: }
2880: t[row] = sum1;
2881: t[row+1] = sum2;
2882: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2883: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2884: ibdiag += 4;
2885: break;
2886: case 3:
2887: v2 = a->a + ii[row+1];
2888: v3 = a->a + ii[row+2];
2889: sum1 = b[row];
2890: sum2 = b[row+1];
2891: sum3 = b[row+2];
2892: for (n = 0; n<sz-1; n+=2) {
2893: i1 = idx[0];
2894: i2 = idx[1];
2895: idx += 2;
2896: tmp0 = x[i1];
2897: tmp1 = x[i2];
2898: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2899: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2900: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2901: }
2903: if (n == sz-1) {
2904: tmp0 = x[*idx];
2905: sum1 -= v1[0] * tmp0;
2906: sum2 -= v2[0] * tmp0;
2907: sum3 -= v3[0] * tmp0;
2908: }
2909: t[row] = sum1;
2910: t[row+1] = sum2;
2911: t[row+2] = sum3;
2912: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2913: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2914: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2915: ibdiag += 9;
2916: break;
2917: case 4:
2918: v2 = a->a + ii[row+1];
2919: v3 = a->a + ii[row+2];
2920: v4 = a->a + ii[row+3];
2921: sum1 = b[row];
2922: sum2 = b[row+1];
2923: sum3 = b[row+2];
2924: sum4 = b[row+3];
2925: for (n = 0; n<sz-1; n+=2) {
2926: i1 = idx[0];
2927: i2 = idx[1];
2928: idx += 2;
2929: tmp0 = x[i1];
2930: tmp1 = x[i2];
2931: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2932: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2933: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2934: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2935: }
2937: if (n == sz-1) {
2938: tmp0 = x[*idx];
2939: sum1 -= v1[0] * tmp0;
2940: sum2 -= v2[0] * tmp0;
2941: sum3 -= v3[0] * tmp0;
2942: sum4 -= v4[0] * tmp0;
2943: }
2944: t[row] = sum1;
2945: t[row+1] = sum2;
2946: t[row+2] = sum3;
2947: t[row+3] = sum4;
2948: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2949: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2950: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2951: x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2952: ibdiag += 16;
2953: break;
2954: case 5:
2955: v2 = a->a + ii[row+1];
2956: v3 = a->a + ii[row+2];
2957: v4 = a->a + ii[row+3];
2958: v5 = a->a + ii[row+4];
2959: sum1 = b[row];
2960: sum2 = b[row+1];
2961: sum3 = b[row+2];
2962: sum4 = b[row+3];
2963: sum5 = b[row+4];
2964: for (n = 0; n<sz-1; n+=2) {
2965: i1 = idx[0];
2966: i2 = idx[1];
2967: idx += 2;
2968: tmp0 = x[i1];
2969: tmp1 = x[i2];
2970: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2971: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2972: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2973: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2974: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2975: }
2977: if (n == sz-1) {
2978: tmp0 = x[*idx];
2979: sum1 -= v1[0] * tmp0;
2980: sum2 -= v2[0] * tmp0;
2981: sum3 -= v3[0] * tmp0;
2982: sum4 -= v4[0] * tmp0;
2983: sum5 -= v5[0] * tmp0;
2984: }
2985: t[row] = sum1;
2986: t[row+1] = sum2;
2987: t[row+2] = sum3;
2988: t[row+3] = sum4;
2989: t[row+4] = sum5;
2990: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2991: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2992: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2993: x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2994: x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2995: ibdiag += 25;
2996: break;
2997: default:
2998: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2999: }
3000: }
3002: xb = t;
3003: PetscLogFlops(a->nz);
3004: } else xb = b;
3005: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3007: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3008: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3009: ibdiag -= sizes[i]*sizes[i];
3010: sz = ii[row+1] - diag[row] - 1;
3011: v1 = a->a + diag[row] + 1;
3012: idx = a->j + diag[row] + 1;
3014: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3015: switch (sizes[i]) {
3016: case 1:
3018: sum1 = xb[row];
3019: for (n = 0; n<sz-1; n+=2) {
3020: i1 = idx[0];
3021: i2 = idx[1];
3022: idx += 2;
3023: tmp0 = x[i1];
3024: tmp1 = x[i2];
3025: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3026: }
3028: if (n == sz-1) {
3029: tmp0 = x[*idx];
3030: sum1 -= *v1*tmp0;
3031: }
3032: x[row--] = sum1*(*ibdiag);
3033: break;
3035: case 2:
3037: sum1 = xb[row];
3038: sum2 = xb[row-1];
3039: /* note that sum1 is associated with the second of the two rows */
3040: v2 = a->a + diag[row-1] + 2;
3041: for (n = 0; n<sz-1; n+=2) {
3042: i1 = idx[0];
3043: i2 = idx[1];
3044: idx += 2;
3045: tmp0 = x[i1];
3046: tmp1 = x[i2];
3047: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3048: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3049: }
3051: if (n == sz-1) {
3052: tmp0 = x[*idx];
3053: sum1 -= *v1*tmp0;
3054: sum2 -= *v2*tmp0;
3055: }
3056: x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3057: x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3058: break;
3059: case 3:
3061: sum1 = xb[row];
3062: sum2 = xb[row-1];
3063: sum3 = xb[row-2];
3064: v2 = a->a + diag[row-1] + 2;
3065: v3 = a->a + diag[row-2] + 3;
3066: for (n = 0; n<sz-1; n+=2) {
3067: i1 = idx[0];
3068: i2 = idx[1];
3069: idx += 2;
3070: tmp0 = x[i1];
3071: tmp1 = x[i2];
3072: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3073: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3074: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3075: }
3077: if (n == sz-1) {
3078: tmp0 = x[*idx];
3079: sum1 -= *v1*tmp0;
3080: sum2 -= *v2*tmp0;
3081: sum3 -= *v3*tmp0;
3082: }
3083: x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3084: x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3085: x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3086: break;
3087: case 4:
3089: sum1 = xb[row];
3090: sum2 = xb[row-1];
3091: sum3 = xb[row-2];
3092: sum4 = xb[row-3];
3093: v2 = a->a + diag[row-1] + 2;
3094: v3 = a->a + diag[row-2] + 3;
3095: v4 = a->a + diag[row-3] + 4;
3096: for (n = 0; n<sz-1; n+=2) {
3097: i1 = idx[0];
3098: i2 = idx[1];
3099: idx += 2;
3100: tmp0 = x[i1];
3101: tmp1 = x[i2];
3102: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3103: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3104: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3105: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3106: }
3108: if (n == sz-1) {
3109: tmp0 = x[*idx];
3110: sum1 -= *v1*tmp0;
3111: sum2 -= *v2*tmp0;
3112: sum3 -= *v3*tmp0;
3113: sum4 -= *v4*tmp0;
3114: }
3115: x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3116: x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3117: x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3118: x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3119: break;
3120: case 5:
3122: sum1 = xb[row];
3123: sum2 = xb[row-1];
3124: sum3 = xb[row-2];
3125: sum4 = xb[row-3];
3126: sum5 = xb[row-4];
3127: v2 = a->a + diag[row-1] + 2;
3128: v3 = a->a + diag[row-2] + 3;
3129: v4 = a->a + diag[row-3] + 4;
3130: v5 = a->a + diag[row-4] + 5;
3131: for (n = 0; n<sz-1; n+=2) {
3132: i1 = idx[0];
3133: i2 = idx[1];
3134: idx += 2;
3135: tmp0 = x[i1];
3136: tmp1 = x[i2];
3137: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3138: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3139: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3140: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3141: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3142: }
3144: if (n == sz-1) {
3145: tmp0 = x[*idx];
3146: sum1 -= *v1*tmp0;
3147: sum2 -= *v2*tmp0;
3148: sum3 -= *v3*tmp0;
3149: sum4 -= *v4*tmp0;
3150: sum5 -= *v5*tmp0;
3151: }
3152: x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3153: x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3154: x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3155: x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3156: x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3157: break;
3158: default:
3159: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3160: }
3161: }
3163: PetscLogFlops(a->nz);
3164: }
3165: its--;
3166: }
3167: while (its--) {
3169: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3170: for (i=0, row=0, ibdiag = a->inode.ibdiag;
3171: i<m;
3172: row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {
3174: sz = diag[row] - ii[row];
3175: v1 = a->a + ii[row];
3176: idx = a->j + ii[row];
3177: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3178: switch (sizes[i]) {
3179: case 1:
3180: sum1 = b[row];
3181: for (n = 0; n<sz-1; n+=2) {
3182: i1 = idx[0];
3183: i2 = idx[1];
3184: idx += 2;
3185: tmp0 = x[i1];
3186: tmp1 = x[i2];
3187: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3188: }
3189: if (n == sz-1) {
3190: tmp0 = x[*idx++];
3191: sum1 -= *v1 * tmp0;
3192: v1++;
3193: }
3194: t[row] = sum1;
3195: sz = ii[row+1] - diag[row] - 1;
3196: idx = a->j + diag[row] + 1;
3197: v1 += 1;
3198: for (n = 0; n<sz-1; n+=2) {
3199: i1 = idx[0];
3200: i2 = idx[1];
3201: idx += 2;
3202: tmp0 = x[i1];
3203: tmp1 = x[i2];
3204: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3205: }
3206: if (n == sz-1) {
3207: tmp0 = x[*idx++];
3208: sum1 -= *v1 * tmp0;
3209: }
3210: /* in MatSOR_SeqAIJ this line would be
3211: *
3212: * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3213: *
3214: * but omega == 1, so this becomes
3215: *
3216: * x[row] = sum1*(*ibdiag++);
3217: *
3218: */
3219: x[row] = sum1*(*ibdiag);
3220: break;
3221: case 2:
3222: v2 = a->a + ii[row+1];
3223: sum1 = b[row];
3224: sum2 = b[row+1];
3225: for (n = 0; n<sz-1; n+=2) {
3226: i1 = idx[0];
3227: i2 = idx[1];
3228: idx += 2;
3229: tmp0 = x[i1];
3230: tmp1 = x[i2];
3231: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3232: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3233: }
3234: if (n == sz-1) {
3235: tmp0 = x[*idx++];
3236: sum1 -= v1[0] * tmp0;
3237: sum2 -= v2[0] * tmp0;
3238: v1++; v2++;
3239: }
3240: t[row] = sum1;
3241: t[row+1] = sum2;
3242: sz = ii[row+1] - diag[row] - 2;
3243: idx = a->j + diag[row] + 2;
3244: v1 += 2;
3245: v2 += 2;
3246: for (n = 0; n<sz-1; n+=2) {
3247: i1 = idx[0];
3248: i2 = idx[1];
3249: idx += 2;
3250: tmp0 = x[i1];
3251: tmp1 = x[i2];
3252: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3253: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3254: }
3255: if (n == sz-1) {
3256: tmp0 = x[*idx];
3257: sum1 -= v1[0] * tmp0;
3258: sum2 -= v2[0] * tmp0;
3259: }
3260: x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3261: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3262: break;
3263: case 3:
3264: v2 = a->a + ii[row+1];
3265: v3 = a->a + ii[row+2];
3266: sum1 = b[row];
3267: sum2 = b[row+1];
3268: sum3 = b[row+2];
3269: for (n = 0; n<sz-1; n+=2) {
3270: i1 = idx[0];
3271: i2 = idx[1];
3272: idx += 2;
3273: tmp0 = x[i1];
3274: tmp1 = x[i2];
3275: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3276: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3277: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3278: }
3279: if (n == sz-1) {
3280: tmp0 = x[*idx++];
3281: sum1 -= v1[0] * tmp0;
3282: sum2 -= v2[0] * tmp0;
3283: sum3 -= v3[0] * tmp0;
3284: v1++; v2++; v3++;
3285: }
3286: t[row] = sum1;
3287: t[row+1] = sum2;
3288: t[row+2] = sum3;
3289: sz = ii[row+1] - diag[row] - 3;
3290: idx = a->j + diag[row] + 3;
3291: v1 += 3;
3292: v2 += 3;
3293: v3 += 3;
3294: for (n = 0; n<sz-1; n+=2) {
3295: i1 = idx[0];
3296: i2 = idx[1];
3297: idx += 2;
3298: tmp0 = x[i1];
3299: tmp1 = x[i2];
3300: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3301: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3302: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3303: }
3304: if (n == sz-1) {
3305: tmp0 = x[*idx];
3306: sum1 -= v1[0] * tmp0;
3307: sum2 -= v2[0] * tmp0;
3308: sum3 -= v3[0] * tmp0;
3309: }
3310: x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3311: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3312: x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3313: break;
3314: case 4:
3315: v2 = a->a + ii[row+1];
3316: v3 = a->a + ii[row+2];
3317: v4 = a->a + ii[row+3];
3318: sum1 = b[row];
3319: sum2 = b[row+1];
3320: sum3 = b[row+2];
3321: sum4 = b[row+3];
3322: for (n = 0; n<sz-1; n+=2) {
3323: i1 = idx[0];
3324: i2 = idx[1];
3325: idx += 2;
3326: tmp0 = x[i1];
3327: tmp1 = x[i2];
3328: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3329: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3330: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3331: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3332: }
3333: if (n == sz-1) {
3334: tmp0 = x[*idx++];
3335: sum1 -= v1[0] * tmp0;
3336: sum2 -= v2[0] * tmp0;
3337: sum3 -= v3[0] * tmp0;
3338: sum4 -= v4[0] * tmp0;
3339: v1++; v2++; v3++; v4++;
3340: }
3341: t[row] = sum1;
3342: t[row+1] = sum2;
3343: t[row+2] = sum3;
3344: t[row+3] = sum4;
3345: sz = ii[row+1] - diag[row] - 4;
3346: idx = a->j + diag[row] + 4;
3347: v1 += 4;
3348: v2 += 4;
3349: v3 += 4;
3350: v4 += 4;
3351: for (n = 0; n<sz-1; n+=2) {
3352: i1 = idx[0];
3353: i2 = idx[1];
3354: idx += 2;
3355: tmp0 = x[i1];
3356: tmp1 = x[i2];
3357: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3358: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3359: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3360: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3361: }
3362: if (n == sz-1) {
3363: tmp0 = x[*idx];
3364: sum1 -= v1[0] * tmp0;
3365: sum2 -= v2[0] * tmp0;
3366: sum3 -= v3[0] * tmp0;
3367: sum4 -= v4[0] * tmp0;
3368: }
3369: x[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3370: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3371: x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3372: x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3373: break;
3374: case 5:
3375: v2 = a->a + ii[row+1];
3376: v3 = a->a + ii[row+2];
3377: v4 = a->a + ii[row+3];
3378: v5 = a->a + ii[row+4];
3379: sum1 = b[row];
3380: sum2 = b[row+1];
3381: sum3 = b[row+2];
3382: sum4 = b[row+3];
3383: sum5 = b[row+4];
3384: for (n = 0; n<sz-1; n+=2) {
3385: i1 = idx[0];
3386: i2 = idx[1];
3387: idx += 2;
3388: tmp0 = x[i1];
3389: tmp1 = x[i2];
3390: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3391: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3392: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3393: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3394: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3395: }
3396: if (n == sz-1) {
3397: tmp0 = x[*idx++];
3398: sum1 -= v1[0] * tmp0;
3399: sum2 -= v2[0] * tmp0;
3400: sum3 -= v3[0] * tmp0;
3401: sum4 -= v4[0] * tmp0;
3402: sum5 -= v5[0] * tmp0;
3403: v1++; v2++; v3++; v4++; v5++;
3404: }
3405: t[row] = sum1;
3406: t[row+1] = sum2;
3407: t[row+2] = sum3;
3408: t[row+3] = sum4;
3409: t[row+4] = sum5;
3410: sz = ii[row+1] - diag[row] - 5;
3411: idx = a->j + diag[row] + 5;
3412: v1 += 5;
3413: v2 += 5;
3414: v3 += 5;
3415: v4 += 5;
3416: v5 += 5;
3417: for (n = 0; n<sz-1; n+=2) {
3418: i1 = idx[0];
3419: i2 = idx[1];
3420: idx += 2;
3421: tmp0 = x[i1];
3422: tmp1 = x[i2];
3423: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3424: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3425: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3426: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3427: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3428: }
3429: if (n == sz-1) {
3430: tmp0 = x[*idx];
3431: sum1 -= v1[0] * tmp0;
3432: sum2 -= v2[0] * tmp0;
3433: sum3 -= v3[0] * tmp0;
3434: sum4 -= v4[0] * tmp0;
3435: sum5 -= v5[0] * tmp0;
3436: }
3437: x[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3438: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3439: x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3440: x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3441: x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3442: break;
3443: default:
3444: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3445: }
3446: }
3447: xb = t;
3448: PetscLogFlops(2.0*a->nz); /* undercounts diag inverse */
3449: } else xb = b;
3451: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3453: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3454: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3455: ibdiag -= sizes[i]*sizes[i];
3457: /* set RHS */
3458: if (xb == b) {
3459: /* whole (old way) */
3460: sz = ii[row+1] - ii[row];
3461: idx = a->j + ii[row];
3462: switch (sizes[i]) {
3463: case 5:
3464: v5 = a->a + ii[row-4];
3465: case 4: /* fall through */
3466: v4 = a->a + ii[row-3];
3467: case 3:
3468: v3 = a->a + ii[row-2];
3469: case 2:
3470: v2 = a->a + ii[row-1];
3471: case 1:
3472: v1 = a->a + ii[row];
3473: break;
3474: default:
3475: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3476: }
3477: } else {
3478: /* upper, no diag */
3479: sz = ii[row+1] - diag[row] - 1;
3480: idx = a->j + diag[row] + 1;
3481: switch (sizes[i]) {
3482: case 5:
3483: v5 = a->a + diag[row-4] + 5;
3484: case 4: /* fall through */
3485: v4 = a->a + diag[row-3] + 4;
3486: case 3:
3487: v3 = a->a + diag[row-2] + 3;
3488: case 2:
3489: v2 = a->a + diag[row-1] + 2;
3490: case 1:
3491: v1 = a->a + diag[row] + 1;
3492: }
3493: }
3494: /* set sum */
3495: switch (sizes[i]) {
3496: case 5:
3497: sum5 = xb[row-4];
3498: case 4: /* fall through */
3499: sum4 = xb[row-3];
3500: case 3:
3501: sum3 = xb[row-2];
3502: case 2:
3503: sum2 = xb[row-1];
3504: case 1:
3505: /* note that sum1 is associated with the last row */
3506: sum1 = xb[row];
3507: }
3508: /* do sums */
3509: for (n = 0; n<sz-1; n+=2) {
3510: i1 = idx[0];
3511: i2 = idx[1];
3512: idx += 2;
3513: tmp0 = x[i1];
3514: tmp1 = x[i2];
3515: switch (sizes[i]) {
3516: case 5:
3517: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3518: case 4: /* fall through */
3519: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3520: case 3:
3521: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3522: case 2:
3523: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3524: case 1:
3525: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3526: }
3527: }
3528: /* ragged edge */
3529: if (n == sz-1) {
3530: tmp0 = x[*idx];
3531: switch (sizes[i]) {
3532: case 5:
3533: sum5 -= *v5*tmp0;
3534: case 4: /* fall through */
3535: sum4 -= *v4*tmp0;
3536: case 3:
3537: sum3 -= *v3*tmp0;
3538: case 2:
3539: sum2 -= *v2*tmp0;
3540: case 1:
3541: sum1 -= *v1*tmp0;
3542: }
3543: }
3544: /* update */
3545: if (xb == b) {
3546: /* whole (old way) w/ diag */
3547: switch (sizes[i]) {
3548: case 5:
3549: x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3550: x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3551: x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3552: x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3553: x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3554: break;
3555: case 4:
3556: x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3557: x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3558: x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3559: x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3560: break;
3561: case 3:
3562: x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3563: x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3564: x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3565: break;
3566: case 2:
3567: x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3568: x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3569: break;
3570: case 1:
3571: x[row--] += sum1*(*ibdiag);
3572: break;
3573: }
3574: } else {
3575: /* no diag so set = */
3576: switch (sizes[i]) {
3577: case 5:
3578: x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3579: x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3580: x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3581: x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3582: x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3583: break;
3584: case 4:
3585: x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3586: x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3587: x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3588: x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3589: break;
3590: case 3:
3591: x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3592: x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3593: x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3594: break;
3595: case 2:
3596: x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3597: x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3598: break;
3599: case 1:
3600: x[row--] = sum1*(*ibdiag);
3601: break;
3602: }
3603: }
3604: }
3605: if (xb == b) {
3606: PetscLogFlops(2.0*a->nz);
3607: } else {
3608: PetscLogFlops(a->nz); /* assumes 1/2 in upper, undercounts diag inverse */
3609: }
3610: }
3611: }
3612: if (flag & SOR_EISENSTAT) {
3613: /*
3614: Apply (U + D)^-1 where D is now the block diagonal
3615: */
3616: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3617: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3618: ibdiag -= sizes[i]*sizes[i];
3619: sz = ii[row+1] - diag[row] - 1;
3620: v1 = a->a + diag[row] + 1;
3621: idx = a->j + diag[row] + 1;
3622: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3623: switch (sizes[i]) {
3624: case 1:
3626: sum1 = b[row];
3627: for (n = 0; n<sz-1; n+=2) {
3628: i1 = idx[0];
3629: i2 = idx[1];
3630: idx += 2;
3631: tmp0 = x[i1];
3632: tmp1 = x[i2];
3633: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3634: }
3636: if (n == sz-1) {
3637: tmp0 = x[*idx];
3638: sum1 -= *v1*tmp0;
3639: }
3640: x[row] = sum1*(*ibdiag);row--;
3641: break;
3643: case 2:
3645: sum1 = b[row];
3646: sum2 = b[row-1];
3647: /* note that sum1 is associated with the second of the two rows */
3648: v2 = a->a + diag[row-1] + 2;
3649: for (n = 0; n<sz-1; n+=2) {
3650: i1 = idx[0];
3651: i2 = idx[1];
3652: idx += 2;
3653: tmp0 = x[i1];
3654: tmp1 = x[i2];
3655: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3656: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3657: }
3659: if (n == sz-1) {
3660: tmp0 = x[*idx];
3661: sum1 -= *v1*tmp0;
3662: sum2 -= *v2*tmp0;
3663: }
3664: x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
3665: x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3666: row -= 2;
3667: break;
3668: case 3:
3670: sum1 = b[row];
3671: sum2 = b[row-1];
3672: sum3 = b[row-2];
3673: v2 = a->a + diag[row-1] + 2;
3674: v3 = a->a + diag[row-2] + 3;
3675: for (n = 0; n<sz-1; n+=2) {
3676: i1 = idx[0];
3677: i2 = idx[1];
3678: idx += 2;
3679: tmp0 = x[i1];
3680: tmp1 = x[i2];
3681: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3682: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3683: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3684: }
3686: if (n == sz-1) {
3687: tmp0 = x[*idx];
3688: sum1 -= *v1*tmp0;
3689: sum2 -= *v2*tmp0;
3690: sum3 -= *v3*tmp0;
3691: }
3692: x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3693: x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3694: x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3695: row -= 3;
3696: break;
3697: case 4:
3699: sum1 = b[row];
3700: sum2 = b[row-1];
3701: sum3 = b[row-2];
3702: sum4 = b[row-3];
3703: v2 = a->a + diag[row-1] + 2;
3704: v3 = a->a + diag[row-2] + 3;
3705: v4 = a->a + diag[row-3] + 4;
3706: for (n = 0; n<sz-1; n+=2) {
3707: i1 = idx[0];
3708: i2 = idx[1];
3709: idx += 2;
3710: tmp0 = x[i1];
3711: tmp1 = x[i2];
3712: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3713: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3714: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3715: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3716: }
3718: if (n == sz-1) {
3719: tmp0 = x[*idx];
3720: sum1 -= *v1*tmp0;
3721: sum2 -= *v2*tmp0;
3722: sum3 -= *v3*tmp0;
3723: sum4 -= *v4*tmp0;
3724: }
3725: x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3726: x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3727: x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3728: x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3729: row -= 4;
3730: break;
3731: case 5:
3733: sum1 = b[row];
3734: sum2 = b[row-1];
3735: sum3 = b[row-2];
3736: sum4 = b[row-3];
3737: sum5 = b[row-4];
3738: v2 = a->a + diag[row-1] + 2;
3739: v3 = a->a + diag[row-2] + 3;
3740: v4 = a->a + diag[row-3] + 4;
3741: v5 = a->a + diag[row-4] + 5;
3742: for (n = 0; n<sz-1; n+=2) {
3743: i1 = idx[0];
3744: i2 = idx[1];
3745: idx += 2;
3746: tmp0 = x[i1];
3747: tmp1 = x[i2];
3748: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3749: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3750: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3751: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3752: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3753: }
3755: if (n == sz-1) {
3756: tmp0 = x[*idx];
3757: sum1 -= *v1*tmp0;
3758: sum2 -= *v2*tmp0;
3759: sum3 -= *v3*tmp0;
3760: sum4 -= *v4*tmp0;
3761: sum5 -= *v5*tmp0;
3762: }
3763: x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3764: x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3765: x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3766: x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3767: x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3768: row -= 5;
3769: break;
3770: default:
3771: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3772: }
3773: }
3774: PetscLogFlops(a->nz);
3776: /*
3777: t = b - D x where D is the block diagonal
3778: */
3779: cnt = 0;
3780: for (i=0, row=0; i<m; i++) {
3781: switch (sizes[i]) {
3782: case 1:
3783: t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3784: break;
3785: case 2:
3786: x1 = x[row]; x2 = x[row+1];
3787: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3788: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3789: t[row] = b[row] - tmp1;
3790: t[row+1] = b[row+1] - tmp2; row += 2;
3791: cnt += 4;
3792: break;
3793: case 3:
3794: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2];
3795: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3796: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3797: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3798: t[row] = b[row] - tmp1;
3799: t[row+1] = b[row+1] - tmp2;
3800: t[row+2] = b[row+2] - tmp3; row += 3;
3801: cnt += 9;
3802: break;
3803: case 4:
3804: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3805: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3806: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3807: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3808: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3809: t[row] = b[row] - tmp1;
3810: t[row+1] = b[row+1] - tmp2;
3811: t[row+2] = b[row+2] - tmp3;
3812: t[row+3] = b[row+3] - tmp4; row += 4;
3813: cnt += 16;
3814: break;
3815: case 5:
3816: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3817: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3818: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3819: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3820: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3821: tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3822: t[row] = b[row] - tmp1;
3823: t[row+1] = b[row+1] - tmp2;
3824: t[row+2] = b[row+2] - tmp3;
3825: t[row+3] = b[row+3] - tmp4;
3826: t[row+4] = b[row+4] - tmp5;row += 5;
3827: cnt += 25;
3828: break;
3829: default:
3830: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3831: }
3832: }
3833: PetscLogFlops(m);
3837: /*
3838: Apply (L + D)^-1 where D is the block diagonal
3839: */
3840: for (i=0, row=0; i<m; i++) {
3841: sz = diag[row] - ii[row];
3842: v1 = a->a + ii[row];
3843: idx = a->j + ii[row];
3844: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3845: switch (sizes[i]) {
3846: case 1:
3848: sum1 = t[row];
3849: for (n = 0; n<sz-1; n+=2) {
3850: i1 = idx[0];
3851: i2 = idx[1];
3852: idx += 2;
3853: tmp0 = t[i1];
3854: tmp1 = t[i2];
3855: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3856: }
3858: if (n == sz-1) {
3859: tmp0 = t[*idx];
3860: sum1 -= *v1 * tmp0;
3861: }
3862: x[row] += t[row] = sum1*(*ibdiag++); row++;
3863: break;
3864: case 2:
3865: v2 = a->a + ii[row+1];
3866: sum1 = t[row];
3867: sum2 = t[row+1];
3868: for (n = 0; n<sz-1; n+=2) {
3869: i1 = idx[0];
3870: i2 = idx[1];
3871: idx += 2;
3872: tmp0 = t[i1];
3873: tmp1 = t[i2];
3874: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3875: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3876: }
3878: if (n == sz-1) {
3879: tmp0 = t[*idx];
3880: sum1 -= v1[0] * tmp0;
3881: sum2 -= v2[0] * tmp0;
3882: }
3883: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3884: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3885: ibdiag += 4; row += 2;
3886: break;
3887: case 3:
3888: v2 = a->a + ii[row+1];
3889: v3 = a->a + ii[row+2];
3890: sum1 = t[row];
3891: sum2 = t[row+1];
3892: sum3 = t[row+2];
3893: for (n = 0; n<sz-1; n+=2) {
3894: i1 = idx[0];
3895: i2 = idx[1];
3896: idx += 2;
3897: tmp0 = t[i1];
3898: tmp1 = t[i2];
3899: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3900: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3901: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3902: }
3904: if (n == sz-1) {
3905: tmp0 = t[*idx];
3906: sum1 -= v1[0] * tmp0;
3907: sum2 -= v2[0] * tmp0;
3908: sum3 -= v3[0] * tmp0;
3909: }
3910: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3911: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3912: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3913: ibdiag += 9; row += 3;
3914: break;
3915: case 4:
3916: v2 = a->a + ii[row+1];
3917: v3 = a->a + ii[row+2];
3918: v4 = a->a + ii[row+3];
3919: sum1 = t[row];
3920: sum2 = t[row+1];
3921: sum3 = t[row+2];
3922: sum4 = t[row+3];
3923: for (n = 0; n<sz-1; n+=2) {
3924: i1 = idx[0];
3925: i2 = idx[1];
3926: idx += 2;
3927: tmp0 = t[i1];
3928: tmp1 = t[i2];
3929: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3930: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3931: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3932: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3933: }
3935: if (n == sz-1) {
3936: tmp0 = t[*idx];
3937: sum1 -= v1[0] * tmp0;
3938: sum2 -= v2[0] * tmp0;
3939: sum3 -= v3[0] * tmp0;
3940: sum4 -= v4[0] * tmp0;
3941: }
3942: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3943: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3944: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3945: x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3946: ibdiag += 16; row += 4;
3947: break;
3948: case 5:
3949: v2 = a->a + ii[row+1];
3950: v3 = a->a + ii[row+2];
3951: v4 = a->a + ii[row+3];
3952: v5 = a->a + ii[row+4];
3953: sum1 = t[row];
3954: sum2 = t[row+1];
3955: sum3 = t[row+2];
3956: sum4 = t[row+3];
3957: sum5 = t[row+4];
3958: for (n = 0; n<sz-1; n+=2) {
3959: i1 = idx[0];
3960: i2 = idx[1];
3961: idx += 2;
3962: tmp0 = t[i1];
3963: tmp1 = t[i2];
3964: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3965: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3966: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3967: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3968: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3969: }
3971: if (n == sz-1) {
3972: tmp0 = t[*idx];
3973: sum1 -= v1[0] * tmp0;
3974: sum2 -= v2[0] * tmp0;
3975: sum3 -= v3[0] * tmp0;
3976: sum4 -= v4[0] * tmp0;
3977: sum5 -= v5[0] * tmp0;
3978: }
3979: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3980: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3981: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3982: x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3983: x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3984: ibdiag += 25; row += 5;
3985: break;
3986: default:
3987: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3988: }
3989: }
3990: PetscLogFlops(a->nz);
3991: }
3992: VecRestoreArray(xx,&x);
3993: VecRestoreArrayRead(bb,&b);
3994: return(0);
3995: }
3999: PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
4000: {
4001: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4002: PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
4003: const MatScalar *bdiag = a->inode.bdiag;
4004: const PetscScalar *b;
4005: PetscErrorCode ierr;
4006: PetscInt m = a->inode.node_count,cnt = 0,i,row;
4007: const PetscInt *sizes = a->inode.size;
4010: VecGetArray(xx,&x);
4011: VecGetArrayRead(bb,&b);
4012: cnt = 0;
4013: for (i=0, row=0; i<m; i++) {
4014: switch (sizes[i]) {
4015: case 1:
4016: x[row] = b[row]*bdiag[cnt++];row++;
4017: break;
4018: case 2:
4019: x1 = b[row]; x2 = b[row+1];
4020: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
4021: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
4022: x[row++] = tmp1;
4023: x[row++] = tmp2;
4024: cnt += 4;
4025: break;
4026: case 3:
4027: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2];
4028: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
4029: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
4030: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
4031: x[row++] = tmp1;
4032: x[row++] = tmp2;
4033: x[row++] = tmp3;
4034: cnt += 9;
4035: break;
4036: case 4:
4037: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
4038: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
4039: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
4040: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
4041: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
4042: x[row++] = tmp1;
4043: x[row++] = tmp2;
4044: x[row++] = tmp3;
4045: x[row++] = tmp4;
4046: cnt += 16;
4047: break;
4048: case 5:
4049: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
4050: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
4051: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
4052: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
4053: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4054: tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4055: x[row++] = tmp1;
4056: x[row++] = tmp2;
4057: x[row++] = tmp3;
4058: x[row++] = tmp4;
4059: x[row++] = tmp5;
4060: cnt += 25;
4061: break;
4062: default:
4063: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
4064: }
4065: }
4066: PetscLogFlops(2*cnt);
4067: VecRestoreArray(xx,&x);
4068: VecRestoreArrayRead(bb,&b);
4069: return(0);
4070: }
4072: /*
4073: samestructure indicates that the matrix has not changed its nonzero structure so we
4074: do not need to recompute the inodes
4075: */
4078: PetscErrorCode Mat_CheckInode(Mat A,PetscBool samestructure)
4079: {
4080: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4082: PetscInt i,j,m,nzx,nzy,*ns,node_count,blk_size;
4083: PetscBool flag;
4084: const PetscInt *idx,*idy,*ii;
4087: if (!a->inode.use) return(0);
4088: if (a->inode.checked && samestructure) return(0);
4091: m = A->rmap->n;
4092: if (a->inode.size) ns = a->inode.size;
4093: else {
4094: PetscMalloc1((m+1),&ns);
4095: }
4097: i = 0;
4098: node_count = 0;
4099: idx = a->j;
4100: ii = a->i;
4101: while (i < m) { /* For each row */
4102: nzx = ii[i+1] - ii[i]; /* Number of nonzeros */
4103: /* Limits the number of elements in a node to 'a->inode.limit' */
4104: for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4105: nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */
4106: if (nzy != nzx) break;
4107: idy += nzx; /* Same nonzero pattern */
4108: PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);
4109: if (!flag) break;
4110: }
4111: ns[node_count++] = blk_size;
4112: idx += blk_size*nzx;
4113: i = j;
4114: }
4115: /* If not enough inodes found,, do not use inode version of the routines */
4116: if (!m || node_count > .8*m) {
4117: PetscFree(ns);
4119: a->inode.node_count = 0;
4120: a->inode.size = NULL;
4121: a->inode.use = PETSC_FALSE;
4122: A->ops->mult = MatMult_SeqAIJ;
4123: A->ops->sor = MatSOR_SeqAIJ;
4124: A->ops->multadd = MatMultAdd_SeqAIJ;
4125: A->ops->getrowij = MatGetRowIJ_SeqAIJ;
4126: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ;
4127: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ;
4128: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ;
4129: A->ops->coloringpatch = 0;
4130: A->ops->multdiagonalblock = 0;
4132: PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4133: } else {
4134: if (!A->factortype) {
4135: A->ops->mult = MatMult_SeqAIJ_Inode;
4136: A->ops->sor = MatSOR_SeqAIJ_Inode;
4137: A->ops->multadd = MatMultAdd_SeqAIJ_Inode;
4138: A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4139: if (A->rmap->n == A->cmap->n) {
4140: A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4141: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4142: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4143: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4144: A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4145: }
4146: } else {
4147: A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4148: }
4149: a->inode.node_count = node_count;
4150: a->inode.size = ns;
4151: PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4152: }
4153: a->inode.checked = PETSC_TRUE;
4154: return(0);
4155: }
4159: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4160: {
4161: Mat B =*C;
4162: Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4164: PetscInt m=A->rmap->n;
4167: c->inode.use = a->inode.use;
4168: c->inode.limit = a->inode.limit;
4169: c->inode.max_limit = a->inode.max_limit;
4170: if (a->inode.size) {
4171: PetscMalloc1((m+1),&c->inode.size);
4172: c->inode.node_count = a->inode.node_count;
4173: PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));
4174: /* note the table of functions below should match that in Mat_CheckInode() */
4175: if (!B->factortype) {
4176: B->ops->mult = MatMult_SeqAIJ_Inode;
4177: B->ops->sor = MatSOR_SeqAIJ_Inode;
4178: B->ops->multadd = MatMultAdd_SeqAIJ_Inode;
4179: B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4180: B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4181: B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4182: B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4183: B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4184: B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4185: } else {
4186: B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4187: }
4188: } else {
4189: c->inode.size = 0;
4190: c->inode.node_count = 0;
4191: }
4192: c->inode.ibdiagvalid = PETSC_FALSE;
4193: c->inode.ibdiag = 0;
4194: c->inode.bdiag = 0;
4195: return(0);
4196: }
4200: PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4201: {
4202: PetscInt k;
4203: const PetscInt *vi;
4206: vi = aj + ai[row];
4207: for (k=0; k<nzl; k++) cols[k] = vi[k];
4208: vi = aj + adiag[row];
4209: cols[nzl] = vi[0];
4210: vi = aj + adiag[row+1]+1;
4211: for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4212: return(0);
4213: }
4214: /*
4215: Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix.
4216: Modified from Mat_CheckInode().
4218: Input Parameters:
4219: . Mat A - ILU or LU matrix factor
4221: */
4224: PetscErrorCode Mat_CheckInode_FactorLU(Mat A)
4225: {
4226: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4228: PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4229: PetscInt *cols1,*cols2,*ns;
4230: const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4231: PetscBool flag;
4234: if (!a->inode.use) return(0);
4235: if (a->inode.checked) return(0);
4237: m = A->rmap->n;
4238: if (a->inode.size) ns = a->inode.size;
4239: else {
4240: PetscMalloc1((m+1),&ns);
4241: }
4243: i = 0;
4244: node_count = 0;
4245: PetscMalloc2(m,&cols1,m,&cols2);
4246: while (i < m) { /* For each row */
4247: nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */
4248: nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4249: nzx = nzl1 + nzu1 + 1;
4250: MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4252: /* Limits the number of elements in a node to 'a->inode.limit' */
4253: for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4254: nzl2 = ai[j+1] - ai[j];
4255: nzu2 = adiag[j] - adiag[j+1] - 1;
4256: nzy = nzl2 + nzu2 + 1;
4257: if (nzy != nzx) break;
4258: MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
4259: PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);
4260: if (!flag) break;
4261: }
4262: ns[node_count++] = blk_size;
4263: i = j;
4264: }
4265: PetscFree2(cols1,cols2);
4266: /* If not enough inodes found,, do not use inode version of the routines */
4267: if (!m || node_count > .8*m) {
4268: PetscFree(ns);
4270: a->inode.node_count = 0;
4271: a->inode.size = NULL;
4272: a->inode.use = PETSC_FALSE;
4274: PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4275: } else {
4276: A->ops->mult = 0;
4277: A->ops->sor = 0;
4278: A->ops->multadd = 0;
4279: A->ops->getrowij = 0;
4280: A->ops->restorerowij = 0;
4281: A->ops->getcolumnij = 0;
4282: A->ops->restorecolumnij = 0;
4283: A->ops->coloringpatch = 0;
4284: A->ops->multdiagonalblock = 0;
4285: a->inode.node_count = node_count;
4286: a->inode.size = ns;
4288: PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4289: }
4290: a->inode.checked = PETSC_TRUE;
4291: return(0);
4292: }
4296: PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4297: {
4298: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4301: a->inode.ibdiagvalid = PETSC_FALSE;
4302: return(0);
4303: }
4305: /*
4306: This is really ugly. if inodes are used this replaces the
4307: permutations with ones that correspond to rows/cols of the matrix
4308: rather then inode blocks
4309: */
4312: PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4313: {
4317: PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));
4318: return(0);
4319: }
4323: PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4324: {
4325: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4327: PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4328: const PetscInt *ridx,*cidx;
4329: PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx;
4330: PetscInt nslim_col,*ns_col;
4331: IS ris = *rperm,cis = *cperm;
4334: if (!a->inode.size) return(0); /* no inodes so return */
4335: if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */
4337: Mat_CreateColInode(A,&nslim_col,&ns_col);
4338: PetscMalloc1((((nslim_row>nslim_col) ? nslim_row : nslim_col)+1),&tns);
4339: PetscMalloc2(m,&permr,n,&permc);
4341: ISGetIndices(ris,&ridx);
4342: ISGetIndices(cis,&cidx);
4344: /* Form the inode structure for the rows of permuted matric using inv perm*/
4345: for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4347: /* Construct the permutations for rows*/
4348: for (i=0,row = 0; i<nslim_row; ++i) {
4349: indx = ridx[i];
4350: start_val = tns[indx];
4351: end_val = tns[indx + 1];
4352: for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4353: }
4355: /* Form the inode structure for the columns of permuted matrix using inv perm*/
4356: for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4358: /* Construct permutations for columns */
4359: for (i=0,col=0; i<nslim_col; ++i) {
4360: indx = cidx[i];
4361: start_val = tns[indx];
4362: end_val = tns[indx + 1];
4363: for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4364: }
4366: ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);
4367: ISSetPermutation(*rperm);
4368: ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);
4369: ISSetPermutation(*cperm);
4371: ISRestoreIndices(ris,&ridx);
4372: ISRestoreIndices(cis,&cidx);
4374: PetscFree(ns_col);
4375: PetscFree2(permr,permc);
4376: ISDestroy(&cis);
4377: ISDestroy(&ris);
4378: PetscFree(tns);
4379: return(0);
4380: }
4384: /*@C
4385: MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4387: Not Collective
4389: Input Parameter:
4390: . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4392: Output Parameter:
4393: + node_count - no of inodes present in the matrix.
4394: . sizes - an array of size node_count,with sizes of each inode.
4395: - limit - the max size used to generate the inodes.
4397: Level: advanced
4399: Notes: This routine returns some internal storage information
4400: of the matrix, it is intended to be used by advanced users.
4401: It should be called after the matrix is assembled.
4402: The contents of the sizes[] array should not be changed.
4403: NULL may be passed for information not requested.
4405: .keywords: matrix, seqaij, get, inode
4407: .seealso: MatGetInfo()
4408: @*/
4409: PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4410: {
4411: PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4414: if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4415: PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);
4416: if (f) {
4417: (*f)(A,node_count,sizes,limit);
4418: }
4419: return(0);
4420: }
4424: PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4425: {
4426: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4429: if (node_count) *node_count = a->inode.node_count;
4430: if (sizes) *sizes = a->inode.size;
4431: if (limit) *limit = a->inode.limit;
4432: return(0);
4433: }