Actual source code: inode.c
petsc-3.7.7 2017-09-25
2: /*
3: This file provides high performance routines for the Inode format (compressed sparse row)
4: by taking advantage of rows with identical nonzero structure (I-nodes).
5: */
6: #include <../src/mat/impls/aij/seq/aij.h>
10: static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns)
11: {
12: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
14: PetscInt i,count,m,n,min_mn,*ns_row,*ns_col;
17: n = A->cmap->n;
18: m = A->rmap->n;
19: ns_row = a->inode.size;
21: min_mn = (m < n) ? m : n;
22: if (!ns) {
23: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
24: for (; count+1 < n; count++,i++) ;
25: if (count < n) {
26: i++;
27: }
28: *size = i;
29: return(0);
30: }
31: PetscMalloc1(n+1,&ns_col);
33: /* Use the same row structure wherever feasible. */
34: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
35: ns_col[i] = ns_row[i];
36: }
38: /* if m < n; pad up the remainder with inode_limit */
39: for (; count+1 < n; count++,i++) {
40: ns_col[i] = 1;
41: }
42: /* The last node is the odd ball. padd it up with the remaining rows; */
43: if (count < n) {
44: ns_col[i] = n - count;
45: i++;
46: } else if (count > n) {
47: /* Adjust for the over estimation */
48: ns_col[i-1] += n - count;
49: }
50: *size = i;
51: *ns = ns_col;
52: return(0);
53: }
56: /*
57: This builds symmetric version of nonzero structure,
58: */
61: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
62: {
63: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
65: PetscInt *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
66: PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
67: const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;
70: nslim_row = a->inode.node_count;
71: m = A->rmap->n;
72: n = A->cmap->n;
73: if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
75: /* Use the row_inode as column_inode */
76: nslim_col = nslim_row;
77: ns_col = ns_row;
79: /* allocate space for reformated inode structure */
80: PetscMalloc1(nslim_col+1,&tns);
81: PetscMalloc1(n+1,&tvc);
82: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
84: for (i1=0,col=0; i1<nslim_col; ++i1) {
85: nsz = ns_col[i1];
86: for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
87: }
88: /* allocate space for row pointers */
89: PetscMalloc1(nslim_row+1,&ia);
90: *iia = ia;
91: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
92: PetscMalloc1(nslim_row+1,&work);
94: /* determine the number of columns in each row */
95: ia[0] = oshift;
96: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
98: j = aj + ai[row] + ishift;
99: jmax = aj + ai[row+1] + ishift;
100: if (j==jmax) continue; /* empty row */
101: col = *j++ + ishift;
102: i2 = tvc[col];
103: while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
104: ia[i1+1]++;
105: ia[i2+1]++;
106: i2++; /* Start col of next node */
107: while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
108: i2 = tvc[col];
109: }
110: if (i2 == i1) ia[i2+1]++; /* now the diagonal element */
111: }
113: /* shift ia[i] to point to next row */
114: for (i1=1; i1<nslim_row+1; i1++) {
115: row = ia[i1-1];
116: ia[i1] += row;
117: work[i1-1] = row - oshift;
118: }
120: /* allocate space for column pointers */
121: nz = ia[nslim_row] + (!ishift);
122: PetscMalloc1(nz,&ja);
123: *jja = ja;
125: /* loop over lower triangular part putting into ja */
126: for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
127: j = aj + ai[row] + ishift;
128: jmax = aj + ai[row+1] + ishift;
129: if (j==jmax) continue; /* empty row */
130: col = *j++ + ishift;
131: i2 = tvc[col];
132: while (i2<i1 && j<jmax) {
133: ja[work[i2]++] = i1 + oshift;
134: ja[work[i1]++] = i2 + oshift;
135: ++i2;
136: while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
137: i2 = tvc[col];
138: }
139: if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141: }
142: PetscFree(work);
143: PetscFree(tns);
144: PetscFree(tvc);
145: return(0);
146: }
148: /*
149: This builds nonsymmetric version of nonzero structure,
150: */
153: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
154: {
155: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
157: PetscInt *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
158: PetscInt *tns,*tvc,nsz,i1,i2;
159: const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;
162: nslim_row = a->inode.node_count;
163: n = A->cmap->n;
165: /* Create The column_inode for this matrix */
166: Mat_CreateColInode(A,&nslim_col,&ns_col);
168: /* allocate space for reformated column_inode structure */
169: PetscMalloc1(nslim_col +1,&tns);
170: PetscMalloc1(n + 1,&tvc);
171: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
173: for (i1=0,col=0; i1<nslim_col; ++i1) {
174: nsz = ns_col[i1];
175: for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
176: }
177: /* allocate space for row pointers */
178: PetscMalloc1(nslim_row+1,&ia);
179: *iia = ia;
180: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
181: PetscMalloc1(nslim_row+1,&work);
183: /* determine the number of columns in each row */
184: ia[0] = oshift;
185: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
186: j = aj + ai[row] + ishift;
187: nz = ai[row+1] - ai[row];
188: if (!nz) continue; /* empty row */
189: col = *j++ + ishift;
190: i2 = tvc[col];
191: while (nz-- > 0) { /* off-diagonal elemets */
192: ia[i1+1]++;
193: i2++; /* Start col of next node */
194: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
195: if (nz > 0) i2 = tvc[col];
196: }
197: }
199: /* shift ia[i] to point to next row */
200: for (i1=1; i1<nslim_row+1; i1++) {
201: row = ia[i1-1];
202: ia[i1] += row;
203: work[i1-1] = row - oshift;
204: }
206: /* allocate space for column pointers */
207: nz = ia[nslim_row] + (!ishift);
208: PetscMalloc1(nz,&ja);
209: *jja = ja;
211: /* loop over matrix putting into ja */
212: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213: j = aj + ai[row] + ishift;
214: nz = ai[row+1] - ai[row];
215: if (!nz) continue; /* empty row */
216: col = *j++ + ishift;
217: i2 = tvc[col];
218: while (nz-- > 0) {
219: ja[work[i1]++] = i2 + oshift;
220: ++i2;
221: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
222: if (nz > 0) i2 = tvc[col];
223: }
224: }
225: PetscFree(ns_col);
226: PetscFree(work);
227: PetscFree(tns);
228: PetscFree(tvc);
229: return(0);
230: }
234: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
235: {
236: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
240: *n = a->inode.node_count;
241: if (!ia) return(0);
242: if (!blockcompressed) {
243: MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
244: } else if (symmetric) {
245: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
246: } else {
247: MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
248: }
249: return(0);
250: }
254: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
255: {
259: if (!ia) return(0);
261: if (!blockcompressed) {
262: MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
263: } else {
264: PetscFree(*ia);
265: PetscFree(*ja);
266: }
267: return(0);
268: }
270: /* ----------------------------------------------------------- */
274: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
275: {
276: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
278: PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
279: PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
282: nslim_row = a->inode.node_count;
283: n = A->cmap->n;
285: /* Create The column_inode for this matrix */
286: Mat_CreateColInode(A,&nslim_col,&ns_col);
288: /* allocate space for reformated column_inode structure */
289: PetscMalloc1(nslim_col + 1,&tns);
290: PetscMalloc1(n + 1,&tvc);
291: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
293: for (i1=0,col=0; i1<nslim_col; ++i1) {
294: nsz = ns_col[i1];
295: for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
296: }
297: /* allocate space for column pointers */
298: PetscMalloc1(nslim_col+1,&ia);
299: *iia = ia;
300: PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));
301: PetscMalloc1(nslim_col+1,&work);
303: /* determine the number of columns in each row */
304: ia[0] = oshift;
305: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
306: j = aj + ai[row] + ishift;
307: col = *j++ + ishift;
308: i2 = tvc[col];
309: nz = ai[row+1] - ai[row];
310: while (nz-- > 0) { /* off-diagonal elemets */
311: /* ia[i1+1]++; */
312: ia[i2+1]++;
313: i2++;
314: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
315: if (nz > 0) i2 = tvc[col];
316: }
317: }
319: /* shift ia[i] to point to next col */
320: for (i1=1; i1<nslim_col+1; i1++) {
321: col = ia[i1-1];
322: ia[i1] += col;
323: work[i1-1] = col - oshift;
324: }
326: /* allocate space for column pointers */
327: nz = ia[nslim_col] + (!ishift);
328: PetscMalloc1(nz,&ja);
329: *jja = ja;
331: /* loop over matrix putting into ja */
332: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
333: j = aj + ai[row] + ishift;
334: col = *j++ + ishift;
335: i2 = tvc[col];
336: nz = ai[row+1] - ai[row];
337: while (nz-- > 0) {
338: /* ja[work[i1]++] = i2 + oshift; */
339: ja[work[i2]++] = i1 + oshift;
340: i2++;
341: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
342: if (nz > 0) i2 = tvc[col];
343: }
344: }
345: PetscFree(ns_col);
346: PetscFree(work);
347: PetscFree(tns);
348: PetscFree(tvc);
349: return(0);
350: }
354: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
355: {
359: Mat_CreateColInode(A,n,NULL);
360: if (!ia) return(0);
362: if (!blockcompressed) {
363: MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
364: } else if (symmetric) {
365: /* Since the indices are symmetric it does'nt matter */
366: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
367: } else {
368: MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
369: }
370: return(0);
371: }
375: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
376: {
380: if (!ia) return(0);
381: if (!blockcompressed) {
382: MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
383: } else {
384: PetscFree(*ia);
385: PetscFree(*ja);
386: }
387: return(0);
388: }
390: /* ----------------------------------------------------------- */
394: static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
395: {
396: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
397: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
398: PetscScalar *y;
399: const PetscScalar *x;
400: const MatScalar *v1,*v2,*v3,*v4,*v5;
401: PetscErrorCode ierr;
402: PetscInt i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
403: const PetscInt *idx,*ns,*ii;
405: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
406: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
407: #endif
410: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
411: node_max = a->inode.node_count;
412: ns = a->inode.size; /* Node Size array */
413: VecGetArrayRead(xx,&x);
414: VecGetArray(yy,&y);
415: idx = a->j;
416: v1 = a->a;
417: ii = a->i;
419: for (i = 0,row = 0; i< node_max; ++i) {
420: nsz = ns[i];
421: n = ii[1] - ii[0];
422: nonzerorow += (n>0)*nsz;
423: ii += nsz;
424: PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */
425: PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */
426: sz = n; /* No of non zeros in this row */
427: /* Switch on the size of Node */
428: switch (nsz) { /* Each loop in 'case' is unrolled */
429: case 1:
430: sum1 = 0.;
432: for (n = 0; n< sz-1; n+=2) {
433: i1 = idx[0]; /* The instructions are ordered to */
434: i2 = idx[1]; /* make the compiler's job easy */
435: idx += 2;
436: tmp0 = x[i1];
437: tmp1 = x[i2];
438: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
439: }
441: if (n == sz-1) { /* Take care of the last nonzero */
442: tmp0 = x[*idx++];
443: sum1 += *v1++ *tmp0;
444: }
445: y[row++]=sum1;
446: break;
447: case 2:
448: sum1 = 0.;
449: sum2 = 0.;
450: v2 = v1 + n;
452: for (n = 0; n< sz-1; n+=2) {
453: i1 = idx[0];
454: i2 = idx[1];
455: idx += 2;
456: tmp0 = x[i1];
457: tmp1 = x[i2];
458: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
459: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
460: }
461: if (n == sz-1) {
462: tmp0 = x[*idx++];
463: sum1 += *v1++ * tmp0;
464: sum2 += *v2++ * tmp0;
465: }
466: y[row++]=sum1;
467: y[row++]=sum2;
468: v1 =v2; /* Since the next block to be processed starts there*/
469: idx +=sz;
470: break;
471: case 3:
472: sum1 = 0.;
473: sum2 = 0.;
474: sum3 = 0.;
475: v2 = v1 + n;
476: v3 = v2 + n;
478: for (n = 0; n< sz-1; n+=2) {
479: i1 = idx[0];
480: i2 = idx[1];
481: idx += 2;
482: tmp0 = x[i1];
483: tmp1 = x[i2];
484: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
485: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
486: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
487: }
488: if (n == sz-1) {
489: tmp0 = x[*idx++];
490: sum1 += *v1++ * tmp0;
491: sum2 += *v2++ * tmp0;
492: sum3 += *v3++ * tmp0;
493: }
494: y[row++]=sum1;
495: y[row++]=sum2;
496: y[row++]=sum3;
497: v1 =v3; /* Since the next block to be processed starts there*/
498: idx +=2*sz;
499: break;
500: case 4:
501: sum1 = 0.;
502: sum2 = 0.;
503: sum3 = 0.;
504: sum4 = 0.;
505: v2 = v1 + n;
506: v3 = v2 + n;
507: v4 = v3 + n;
509: for (n = 0; n< sz-1; n+=2) {
510: i1 = idx[0];
511: i2 = idx[1];
512: idx += 2;
513: tmp0 = x[i1];
514: tmp1 = x[i2];
515: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
516: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
517: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
518: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
519: }
520: if (n == sz-1) {
521: tmp0 = x[*idx++];
522: sum1 += *v1++ * tmp0;
523: sum2 += *v2++ * tmp0;
524: sum3 += *v3++ * tmp0;
525: sum4 += *v4++ * tmp0;
526: }
527: y[row++]=sum1;
528: y[row++]=sum2;
529: y[row++]=sum3;
530: y[row++]=sum4;
531: v1 =v4; /* Since the next block to be processed starts there*/
532: idx +=3*sz;
533: break;
534: case 5:
535: sum1 = 0.;
536: sum2 = 0.;
537: sum3 = 0.;
538: sum4 = 0.;
539: sum5 = 0.;
540: v2 = v1 + n;
541: v3 = v2 + n;
542: v4 = v3 + n;
543: v5 = v4 + n;
545: for (n = 0; n<sz-1; n+=2) {
546: i1 = idx[0];
547: i2 = idx[1];
548: idx += 2;
549: tmp0 = x[i1];
550: tmp1 = x[i2];
551: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
552: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
553: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
554: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
555: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
556: }
557: if (n == sz-1) {
558: tmp0 = x[*idx++];
559: sum1 += *v1++ * tmp0;
560: sum2 += *v2++ * tmp0;
561: sum3 += *v3++ * tmp0;
562: sum4 += *v4++ * tmp0;
563: sum5 += *v5++ * tmp0;
564: }
565: y[row++]=sum1;
566: y[row++]=sum2;
567: y[row++]=sum3;
568: y[row++]=sum4;
569: y[row++]=sum5;
570: v1 =v5; /* Since the next block to be processed starts there */
571: idx +=4*sz;
572: break;
573: default:
574: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
575: }
576: }
577: VecRestoreArrayRead(xx,&x);
578: VecRestoreArray(yy,&y);
579: PetscLogFlops(2.0*a->nz - nonzerorow);
580: return(0);
581: }
582: /* ----------------------------------------------------------- */
583: /* Almost same code as the MatMult_SeqAIJ_Inode() */
586: static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
587: {
588: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
589: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
590: const MatScalar *v1,*v2,*v3,*v4,*v5;
591: const PetscScalar *x;
592: PetscScalar *y,*z,*zt;
593: PetscErrorCode ierr;
594: PetscInt i1,i2,n,i,row,node_max,nsz,sz;
595: const PetscInt *idx,*ns,*ii;
598: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
599: node_max = a->inode.node_count;
600: ns = a->inode.size; /* Node Size array */
602: VecGetArrayRead(xx,&x);
603: VecGetArrayPair(zz,yy,&z,&y);
604: zt = z;
606: idx = a->j;
607: v1 = a->a;
608: ii = a->i;
610: for (i = 0,row = 0; i< node_max; ++i) {
611: nsz = ns[i];
612: n = ii[1] - ii[0];
613: ii += nsz;
614: sz = n; /* No of non zeros in this row */
615: /* Switch on the size of Node */
616: switch (nsz) { /* Each loop in 'case' is unrolled */
617: case 1:
618: sum1 = *zt++;
620: for (n = 0; n< sz-1; n+=2) {
621: i1 = idx[0]; /* The instructions are ordered to */
622: i2 = idx[1]; /* make the compiler's job easy */
623: idx += 2;
624: tmp0 = x[i1];
625: tmp1 = x[i2];
626: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
627: }
629: if (n == sz-1) { /* Take care of the last nonzero */
630: tmp0 = x[*idx++];
631: sum1 += *v1++ * tmp0;
632: }
633: y[row++]=sum1;
634: break;
635: case 2:
636: sum1 = *zt++;
637: sum2 = *zt++;
638: v2 = v1 + n;
640: for (n = 0; n< sz-1; n+=2) {
641: i1 = idx[0];
642: i2 = idx[1];
643: idx += 2;
644: tmp0 = x[i1];
645: tmp1 = x[i2];
646: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
647: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
648: }
649: if (n == sz-1) {
650: tmp0 = x[*idx++];
651: sum1 += *v1++ * tmp0;
652: sum2 += *v2++ * tmp0;
653: }
654: y[row++]=sum1;
655: y[row++]=sum2;
656: v1 =v2; /* Since the next block to be processed starts there*/
657: idx +=sz;
658: break;
659: case 3:
660: sum1 = *zt++;
661: sum2 = *zt++;
662: sum3 = *zt++;
663: v2 = v1 + n;
664: v3 = v2 + n;
666: for (n = 0; n< sz-1; n+=2) {
667: i1 = idx[0];
668: i2 = idx[1];
669: idx += 2;
670: tmp0 = x[i1];
671: tmp1 = x[i2];
672: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
673: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
674: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
675: }
676: if (n == sz-1) {
677: tmp0 = x[*idx++];
678: sum1 += *v1++ * tmp0;
679: sum2 += *v2++ * tmp0;
680: sum3 += *v3++ * tmp0;
681: }
682: y[row++]=sum1;
683: y[row++]=sum2;
684: y[row++]=sum3;
685: v1 =v3; /* Since the next block to be processed starts there*/
686: idx +=2*sz;
687: break;
688: case 4:
689: sum1 = *zt++;
690: sum2 = *zt++;
691: sum3 = *zt++;
692: sum4 = *zt++;
693: v2 = v1 + n;
694: v3 = v2 + n;
695: v4 = v3 + n;
697: for (n = 0; n< sz-1; n+=2) {
698: i1 = idx[0];
699: i2 = idx[1];
700: idx += 2;
701: tmp0 = x[i1];
702: tmp1 = x[i2];
703: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
704: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
705: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
706: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
707: }
708: if (n == sz-1) {
709: tmp0 = x[*idx++];
710: sum1 += *v1++ * tmp0;
711: sum2 += *v2++ * tmp0;
712: sum3 += *v3++ * tmp0;
713: sum4 += *v4++ * tmp0;
714: }
715: y[row++]=sum1;
716: y[row++]=sum2;
717: y[row++]=sum3;
718: y[row++]=sum4;
719: v1 =v4; /* Since the next block to be processed starts there*/
720: idx +=3*sz;
721: break;
722: case 5:
723: sum1 = *zt++;
724: sum2 = *zt++;
725: sum3 = *zt++;
726: sum4 = *zt++;
727: sum5 = *zt++;
728: v2 = v1 + n;
729: v3 = v2 + n;
730: v4 = v3 + n;
731: v5 = v4 + n;
733: for (n = 0; n<sz-1; n+=2) {
734: i1 = idx[0];
735: i2 = idx[1];
736: idx += 2;
737: tmp0 = x[i1];
738: tmp1 = x[i2];
739: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
740: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
741: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
742: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
743: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
744: }
745: if (n == sz-1) {
746: tmp0 = x[*idx++];
747: sum1 += *v1++ * tmp0;
748: sum2 += *v2++ * tmp0;
749: sum3 += *v3++ * tmp0;
750: sum4 += *v4++ * tmp0;
751: sum5 += *v5++ * tmp0;
752: }
753: y[row++]=sum1;
754: y[row++]=sum2;
755: y[row++]=sum3;
756: y[row++]=sum4;
757: y[row++]=sum5;
758: v1 =v5; /* Since the next block to be processed starts there */
759: idx +=4*sz;
760: break;
761: default:
762: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
763: }
764: }
765: VecRestoreArrayRead(xx,&x);
766: VecRestoreArrayPair(zz,yy,&z,&y);
767: PetscLogFlops(2.0*a->nz);
768: return(0);
769: }
771: /* ----------------------------------------------------------- */
774: PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
775: {
776: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
777: IS iscol = a->col,isrow = a->row;
778: PetscErrorCode ierr;
779: const PetscInt *r,*c,*rout,*cout;
780: PetscInt i,j,n = A->rmap->n,nz;
781: PetscInt node_max,*ns,row,nsz,aii,i0,i1;
782: const PetscInt *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
783: PetscScalar *x,*tmp,*tmps,tmp0,tmp1;
784: PetscScalar sum1,sum2,sum3,sum4,sum5;
785: const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
786: const PetscScalar *b;
789: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
790: node_max = a->inode.node_count;
791: ns = a->inode.size; /* Node Size array */
793: VecGetArrayRead(bb,&b);
794: VecGetArray(xx,&x);
795: tmp = a->solve_work;
797: ISGetIndices(isrow,&rout); r = rout;
798: ISGetIndices(iscol,&cout); c = cout + (n-1);
800: /* forward solve the lower triangular */
801: tmps = tmp;
802: aa = a_a;
803: aj = a_j;
804: ad = a->diag;
806: for (i = 0,row = 0; i< node_max; ++i) {
807: nsz = ns[i];
808: aii = ai[row];
809: v1 = aa + aii;
810: vi = aj + aii;
811: nz = ad[row]- aii;
812: if (i < node_max-1) {
813: /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
814: * but our indexing to determine it's size could. */
815: PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
816: /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
817: PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
818: /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
819: }
821: switch (nsz) { /* Each loop in 'case' is unrolled */
822: case 1:
823: sum1 = b[*r++];
824: for (j=0; j<nz-1; j+=2) {
825: i0 = vi[0];
826: i1 = vi[1];
827: vi +=2;
828: tmp0 = tmps[i0];
829: tmp1 = tmps[i1];
830: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
831: }
832: if (j == nz-1) {
833: tmp0 = tmps[*vi++];
834: sum1 -= *v1++ *tmp0;
835: }
836: tmp[row++]=sum1;
837: break;
838: case 2:
839: sum1 = b[*r++];
840: sum2 = b[*r++];
841: v2 = aa + ai[row+1];
843: for (j=0; j<nz-1; j+=2) {
844: i0 = vi[0];
845: i1 = vi[1];
846: vi +=2;
847: tmp0 = tmps[i0];
848: tmp1 = tmps[i1];
849: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
850: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
851: }
852: if (j == nz-1) {
853: tmp0 = tmps[*vi++];
854: sum1 -= *v1++ *tmp0;
855: sum2 -= *v2++ *tmp0;
856: }
857: sum2 -= *v2++ *sum1;
858: tmp[row++]=sum1;
859: tmp[row++]=sum2;
860: break;
861: case 3:
862: sum1 = b[*r++];
863: sum2 = b[*r++];
864: sum3 = b[*r++];
865: v2 = aa + ai[row+1];
866: v3 = aa + ai[row+2];
868: for (j=0; j<nz-1; j+=2) {
869: i0 = vi[0];
870: i1 = vi[1];
871: vi +=2;
872: tmp0 = tmps[i0];
873: tmp1 = tmps[i1];
874: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
875: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
876: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
877: }
878: if (j == nz-1) {
879: tmp0 = tmps[*vi++];
880: sum1 -= *v1++ *tmp0;
881: sum2 -= *v2++ *tmp0;
882: sum3 -= *v3++ *tmp0;
883: }
884: sum2 -= *v2++ * sum1;
885: sum3 -= *v3++ * sum1;
886: sum3 -= *v3++ * sum2;
888: tmp[row++]=sum1;
889: tmp[row++]=sum2;
890: tmp[row++]=sum3;
891: break;
893: case 4:
894: sum1 = b[*r++];
895: sum2 = b[*r++];
896: sum3 = b[*r++];
897: sum4 = b[*r++];
898: v2 = aa + ai[row+1];
899: v3 = aa + ai[row+2];
900: v4 = aa + ai[row+3];
902: for (j=0; j<nz-1; j+=2) {
903: i0 = vi[0];
904: i1 = vi[1];
905: vi +=2;
906: tmp0 = tmps[i0];
907: tmp1 = tmps[i1];
908: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
909: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
910: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
911: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
912: }
913: if (j == nz-1) {
914: tmp0 = tmps[*vi++];
915: sum1 -= *v1++ *tmp0;
916: sum2 -= *v2++ *tmp0;
917: sum3 -= *v3++ *tmp0;
918: sum4 -= *v4++ *tmp0;
919: }
920: sum2 -= *v2++ * sum1;
921: sum3 -= *v3++ * sum1;
922: sum4 -= *v4++ * sum1;
923: sum3 -= *v3++ * sum2;
924: sum4 -= *v4++ * sum2;
925: sum4 -= *v4++ * sum3;
927: tmp[row++]=sum1;
928: tmp[row++]=sum2;
929: tmp[row++]=sum3;
930: tmp[row++]=sum4;
931: break;
932: case 5:
933: sum1 = b[*r++];
934: sum2 = b[*r++];
935: sum3 = b[*r++];
936: sum4 = b[*r++];
937: sum5 = b[*r++];
938: v2 = aa + ai[row+1];
939: v3 = aa + ai[row+2];
940: v4 = aa + ai[row+3];
941: v5 = aa + ai[row+4];
943: for (j=0; j<nz-1; j+=2) {
944: i0 = vi[0];
945: i1 = vi[1];
946: vi +=2;
947: tmp0 = tmps[i0];
948: tmp1 = tmps[i1];
949: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
950: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
951: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
952: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
953: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
954: }
955: if (j == nz-1) {
956: tmp0 = tmps[*vi++];
957: sum1 -= *v1++ *tmp0;
958: sum2 -= *v2++ *tmp0;
959: sum3 -= *v3++ *tmp0;
960: sum4 -= *v4++ *tmp0;
961: sum5 -= *v5++ *tmp0;
962: }
964: sum2 -= *v2++ * sum1;
965: sum3 -= *v3++ * sum1;
966: sum4 -= *v4++ * sum1;
967: sum5 -= *v5++ * sum1;
968: sum3 -= *v3++ * sum2;
969: sum4 -= *v4++ * sum2;
970: sum5 -= *v5++ * sum2;
971: sum4 -= *v4++ * sum3;
972: sum5 -= *v5++ * sum3;
973: sum5 -= *v5++ * sum4;
975: tmp[row++]=sum1;
976: tmp[row++]=sum2;
977: tmp[row++]=sum3;
978: tmp[row++]=sum4;
979: tmp[row++]=sum5;
980: break;
981: default:
982: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
983: }
984: }
985: /* backward solve the upper triangular */
986: for (i=node_max -1,row = n-1; i>=0; i--) {
987: nsz = ns[i];
988: aii = ai[row+1] -1;
989: v1 = aa + aii;
990: vi = aj + aii;
991: nz = aii- ad[row];
992: switch (nsz) { /* Each loop in 'case' is unrolled */
993: case 1:
994: sum1 = tmp[row];
996: for (j=nz; j>1; j-=2) {
997: vi -=2;
998: i0 = vi[2];
999: i1 = vi[1];
1000: tmp0 = tmps[i0];
1001: tmp1 = tmps[i1];
1002: v1 -= 2;
1003: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1004: }
1005: if (j==1) {
1006: tmp0 = tmps[*vi--];
1007: sum1 -= *v1-- * tmp0;
1008: }
1009: x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1010: break;
1011: case 2:
1012: sum1 = tmp[row];
1013: sum2 = tmp[row -1];
1014: v2 = aa + ai[row]-1;
1015: for (j=nz; j>1; j-=2) {
1016: vi -=2;
1017: i0 = vi[2];
1018: i1 = vi[1];
1019: tmp0 = tmps[i0];
1020: tmp1 = tmps[i1];
1021: v1 -= 2;
1022: v2 -= 2;
1023: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1024: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1025: }
1026: if (j==1) {
1027: tmp0 = tmps[*vi--];
1028: sum1 -= *v1-- * tmp0;
1029: sum2 -= *v2-- * tmp0;
1030: }
1032: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1033: sum2 -= *v2-- * tmp0;
1034: x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1035: break;
1036: case 3:
1037: sum1 = tmp[row];
1038: sum2 = tmp[row -1];
1039: sum3 = tmp[row -2];
1040: v2 = aa + ai[row]-1;
1041: v3 = aa + ai[row -1]-1;
1042: for (j=nz; j>1; j-=2) {
1043: vi -=2;
1044: i0 = vi[2];
1045: i1 = vi[1];
1046: tmp0 = tmps[i0];
1047: tmp1 = tmps[i1];
1048: v1 -= 2;
1049: v2 -= 2;
1050: v3 -= 2;
1051: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1052: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1053: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1054: }
1055: if (j==1) {
1056: tmp0 = tmps[*vi--];
1057: sum1 -= *v1-- * tmp0;
1058: sum2 -= *v2-- * tmp0;
1059: sum3 -= *v3-- * tmp0;
1060: }
1061: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1062: sum2 -= *v2-- * tmp0;
1063: sum3 -= *v3-- * tmp0;
1064: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1065: sum3 -= *v3-- * tmp0;
1066: x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1068: break;
1069: case 4:
1070: sum1 = tmp[row];
1071: sum2 = tmp[row -1];
1072: sum3 = tmp[row -2];
1073: sum4 = tmp[row -3];
1074: v2 = aa + ai[row]-1;
1075: v3 = aa + ai[row -1]-1;
1076: v4 = aa + ai[row -2]-1;
1078: for (j=nz; j>1; j-=2) {
1079: vi -=2;
1080: i0 = vi[2];
1081: i1 = vi[1];
1082: tmp0 = tmps[i0];
1083: tmp1 = tmps[i1];
1084: v1 -= 2;
1085: v2 -= 2;
1086: v3 -= 2;
1087: v4 -= 2;
1088: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1089: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1090: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1091: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1092: }
1093: if (j==1) {
1094: tmp0 = tmps[*vi--];
1095: sum1 -= *v1-- * tmp0;
1096: sum2 -= *v2-- * tmp0;
1097: sum3 -= *v3-- * tmp0;
1098: sum4 -= *v4-- * tmp0;
1099: }
1101: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1102: sum2 -= *v2-- * tmp0;
1103: sum3 -= *v3-- * tmp0;
1104: sum4 -= *v4-- * tmp0;
1105: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1106: sum3 -= *v3-- * tmp0;
1107: sum4 -= *v4-- * tmp0;
1108: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1109: sum4 -= *v4-- * tmp0;
1110: x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1111: break;
1112: case 5:
1113: sum1 = tmp[row];
1114: sum2 = tmp[row -1];
1115: sum3 = tmp[row -2];
1116: sum4 = tmp[row -3];
1117: sum5 = tmp[row -4];
1118: v2 = aa + ai[row]-1;
1119: v3 = aa + ai[row -1]-1;
1120: v4 = aa + ai[row -2]-1;
1121: v5 = aa + ai[row -3]-1;
1122: for (j=nz; j>1; j-=2) {
1123: vi -= 2;
1124: i0 = vi[2];
1125: i1 = vi[1];
1126: tmp0 = tmps[i0];
1127: tmp1 = tmps[i1];
1128: v1 -= 2;
1129: v2 -= 2;
1130: v3 -= 2;
1131: v4 -= 2;
1132: v5 -= 2;
1133: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1134: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1135: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1136: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1137: sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1138: }
1139: if (j==1) {
1140: tmp0 = tmps[*vi--];
1141: sum1 -= *v1-- * tmp0;
1142: sum2 -= *v2-- * tmp0;
1143: sum3 -= *v3-- * tmp0;
1144: sum4 -= *v4-- * tmp0;
1145: sum5 -= *v5-- * tmp0;
1146: }
1148: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1149: sum2 -= *v2-- * tmp0;
1150: sum3 -= *v3-- * tmp0;
1151: sum4 -= *v4-- * tmp0;
1152: sum5 -= *v5-- * tmp0;
1153: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1154: sum3 -= *v3-- * tmp0;
1155: sum4 -= *v4-- * tmp0;
1156: sum5 -= *v5-- * tmp0;
1157: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1158: sum4 -= *v4-- * tmp0;
1159: sum5 -= *v5-- * tmp0;
1160: tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1161: sum5 -= *v5-- * tmp0;
1162: x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1163: break;
1164: default:
1165: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1166: }
1167: }
1168: ISRestoreIndices(isrow,&rout);
1169: ISRestoreIndices(iscol,&cout);
1170: VecRestoreArrayRead(bb,&b);
1171: VecRestoreArray(xx,&x);
1172: PetscLogFlops(2.0*a->nz - A->cmap->n);
1173: return(0);
1174: }
1178: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1179: {
1180: Mat C =B;
1181: Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1182: IS isrow = b->row,isicol = b->icol;
1183: PetscErrorCode ierr;
1184: const PetscInt *r,*ic,*ics;
1185: const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1186: PetscInt i,j,k,nz,nzL,row,*pj;
1187: const PetscInt *ajtmp,*bjtmp;
1188: MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1189: const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1190: FactorShiftCtx sctx;
1191: const PetscInt *ddiag;
1192: PetscReal rs;
1193: MatScalar d;
1194: PetscInt inod,nodesz,node_max,col;
1195: const PetscInt *ns;
1196: PetscInt *tmp_vec1,*tmp_vec2,*nsmap;
1199: /* MatPivotSetUp(): initialize shift context sctx */
1200: PetscMemzero(&sctx,sizeof(FactorShiftCtx));
1202: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1203: ddiag = a->diag;
1204: sctx.shift_top = info->zeropivot;
1205: for (i=0; i<n; i++) {
1206: /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1207: d = (aa)[ddiag[i]];
1208: rs = -PetscAbsScalar(d) - PetscRealPart(d);
1209: v = aa+ai[i];
1210: nz = ai[i+1] - ai[i];
1211: for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1212: if (rs>sctx.shift_top) sctx.shift_top = rs;
1213: }
1214: sctx.shift_top *= 1.1;
1215: sctx.nshift_max = 5;
1216: sctx.shift_lo = 0.;
1217: sctx.shift_hi = 1.;
1218: }
1220: ISGetIndices(isrow,&r);
1221: ISGetIndices(isicol,&ic);
1223: PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);
1224: ics = ic;
1226: node_max = a->inode.node_count;
1227: ns = a->inode.size;
1228: if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1230: /* If max inode size > 4, split it into two inodes.*/
1231: /* also map the inode sizes according to the ordering */
1232: PetscMalloc1(n+1,&tmp_vec1);
1233: for (i=0,j=0; i<node_max; ++i,++j) {
1234: if (ns[i] > 4) {
1235: tmp_vec1[j] = 4;
1236: ++j;
1237: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1238: } else {
1239: tmp_vec1[j] = ns[i];
1240: }
1241: }
1242: /* Use the correct node_max */
1243: node_max = j;
1245: /* Now reorder the inode info based on mat re-ordering info */
1246: /* First create a row -> inode_size_array_index map */
1247: PetscMalloc1(n+1,&nsmap);
1248: PetscMalloc1(node_max+1,&tmp_vec2);
1249: for (i=0,row=0; i<node_max; i++) {
1250: nodesz = tmp_vec1[i];
1251: for (j=0; j<nodesz; j++,row++) {
1252: nsmap[row] = i;
1253: }
1254: }
1255: /* Using nsmap, create a reordered ns structure */
1256: for (i=0,j=0; i< node_max; i++) {
1257: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1258: tmp_vec2[i] = nodesz;
1259: j += nodesz;
1260: }
1261: PetscFree(nsmap);
1262: PetscFree(tmp_vec1);
1264: /* Now use the correct ns */
1265: ns = tmp_vec2;
1267: do {
1268: sctx.newshift = PETSC_FALSE;
1269: /* Now loop over each block-row, and do the factorization */
1270: for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1271: nodesz = ns[inod];
1273: switch (nodesz) {
1274: case 1:
1275: /*----------*/
1276: /* zero rtmp1 */
1277: /* L part */
1278: nz = bi[i+1] - bi[i];
1279: bjtmp = bj + bi[i];
1280: for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1282: /* U part */
1283: nz = bdiag[i]-bdiag[i+1];
1284: bjtmp = bj + bdiag[i+1]+1;
1285: for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1287: /* load in initial (unfactored row) */
1288: nz = ai[r[i]+1] - ai[r[i]];
1289: ajtmp = aj + ai[r[i]];
1290: v = aa + ai[r[i]];
1291: for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1293: /* ZeropivotApply() */
1294: rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1296: /* elimination */
1297: bjtmp = bj + bi[i];
1298: row = *bjtmp++;
1299: nzL = bi[i+1] - bi[i];
1300: for (k=0; k < nzL; k++) {
1301: pc = rtmp1 + row;
1302: if (*pc != 0.0) {
1303: pv = b->a + bdiag[row];
1304: mul1 = *pc * (*pv);
1305: *pc = mul1;
1306: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1307: pv = b->a + bdiag[row+1]+1;
1308: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1309: for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1310: PetscLogFlops(1+2*nz);
1311: }
1312: row = *bjtmp++;
1313: }
1315: /* finished row so stick it into b->a */
1316: rs = 0.0;
1317: /* L part */
1318: pv = b->a + bi[i];
1319: pj = b->j + bi[i];
1320: nz = bi[i+1] - bi[i];
1321: for (j=0; j<nz; j++) {
1322: pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1323: }
1325: /* U part */
1326: pv = b->a + bdiag[i+1]+1;
1327: pj = b->j + bdiag[i+1]+1;
1328: nz = bdiag[i] - bdiag[i+1]-1;
1329: for (j=0; j<nz; j++) {
1330: pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1331: }
1333: /* Check zero pivot */
1334: sctx.rs = rs;
1335: sctx.pv = rtmp1[i];
1336: MatPivotCheck(B,A,info,&sctx,i);
1337: if (sctx.newshift) break;
1339: /* Mark diagonal and invert diagonal for simplier triangular solves */
1340: pv = b->a + bdiag[i];
1341: *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1342: break;
1344: case 2:
1345: /*----------*/
1346: /* zero rtmp1 and rtmp2 */
1347: /* L part */
1348: nz = bi[i+1] - bi[i];
1349: bjtmp = bj + bi[i];
1350: for (j=0; j<nz; j++) {
1351: col = bjtmp[j];
1352: rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1353: }
1355: /* U part */
1356: nz = bdiag[i]-bdiag[i+1];
1357: bjtmp = bj + bdiag[i+1]+1;
1358: for (j=0; j<nz; j++) {
1359: col = bjtmp[j];
1360: rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1361: }
1363: /* load in initial (unfactored row) */
1364: nz = ai[r[i]+1] - ai[r[i]];
1365: ajtmp = aj + ai[r[i]];
1366: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1367: for (j=0; j<nz; j++) {
1368: col = ics[ajtmp[j]];
1369: rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1370: }
1371: /* ZeropivotApply(): shift the diagonal of the matrix */
1372: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1374: /* elimination */
1375: bjtmp = bj + bi[i];
1376: row = *bjtmp++; /* pivot row */
1377: nzL = bi[i+1] - bi[i];
1378: for (k=0; k < nzL; k++) {
1379: pc1 = rtmp1 + row;
1380: pc2 = rtmp2 + row;
1381: if (*pc1 != 0.0 || *pc2 != 0.0) {
1382: pv = b->a + bdiag[row];
1383: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1384: *pc1 = mul1; *pc2 = mul2;
1386: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1387: pv = b->a + bdiag[row+1]+1;
1388: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1389: for (j=0; j<nz; j++) {
1390: col = pj[j];
1391: rtmp1[col] -= mul1 * pv[j];
1392: rtmp2[col] -= mul2 * pv[j];
1393: }
1394: PetscLogFlops(2+4*nz);
1395: }
1396: row = *bjtmp++;
1397: }
1399: /* finished row i; check zero pivot, then stick row i into b->a */
1400: rs = 0.0;
1401: /* L part */
1402: pc1 = b->a + bi[i];
1403: pj = b->j + bi[i];
1404: nz = bi[i+1] - bi[i];
1405: for (j=0; j<nz; j++) {
1406: col = pj[j];
1407: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1408: }
1409: /* U part */
1410: pc1 = b->a + bdiag[i+1]+1;
1411: pj = b->j + bdiag[i+1]+1;
1412: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1413: for (j=0; j<nz; j++) {
1414: col = pj[j];
1415: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1416: }
1418: sctx.rs = rs;
1419: sctx.pv = rtmp1[i];
1420: MatPivotCheck(B,A,info,&sctx,i);
1421: if (sctx.newshift) break;
1422: pc1 = b->a + bdiag[i]; /* Mark diagonal */
1423: *pc1 = 1.0/sctx.pv;
1425: /* Now take care of diagonal 2x2 block. */
1426: pc2 = rtmp2 + i;
1427: if (*pc2 != 0.0) {
1428: mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1429: *pc2 = mul1; /* insert L entry */
1430: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1431: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1432: for (j=0; j<nz; j++) {
1433: col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1434: }
1435: PetscLogFlops(1+2*nz);
1436: }
1438: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1439: rs = 0.0;
1440: /* L part */
1441: pc2 = b->a + bi[i+1];
1442: pj = b->j + bi[i+1];
1443: nz = bi[i+2] - bi[i+1];
1444: for (j=0; j<nz; j++) {
1445: col = pj[j];
1446: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1447: }
1448: /* U part */
1449: pc2 = b->a + bdiag[i+2]+1;
1450: pj = b->j + bdiag[i+2]+1;
1451: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1452: for (j=0; j<nz; j++) {
1453: col = pj[j];
1454: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1455: }
1457: sctx.rs = rs;
1458: sctx.pv = rtmp2[i+1];
1459: MatPivotCheck(B,A,info,&sctx,i+1);
1460: if (sctx.newshift) break;
1461: pc2 = b->a + bdiag[i+1];
1462: *pc2 = 1.0/sctx.pv;
1463: break;
1465: case 3:
1466: /*----------*/
1467: /* zero rtmp */
1468: /* L part */
1469: nz = bi[i+1] - bi[i];
1470: bjtmp = bj + bi[i];
1471: for (j=0; j<nz; j++) {
1472: col = bjtmp[j];
1473: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1474: }
1476: /* U part */
1477: nz = bdiag[i]-bdiag[i+1];
1478: bjtmp = bj + bdiag[i+1]+1;
1479: for (j=0; j<nz; j++) {
1480: col = bjtmp[j];
1481: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1482: }
1484: /* load in initial (unfactored row) */
1485: nz = ai[r[i]+1] - ai[r[i]];
1486: ajtmp = aj + ai[r[i]];
1487: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1488: for (j=0; j<nz; j++) {
1489: col = ics[ajtmp[j]];
1490: rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1491: }
1492: /* ZeropivotApply(): shift the diagonal of the matrix */
1493: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1495: /* elimination */
1496: bjtmp = bj + bi[i];
1497: row = *bjtmp++; /* pivot row */
1498: nzL = bi[i+1] - bi[i];
1499: for (k=0; k < nzL; k++) {
1500: pc1 = rtmp1 + row;
1501: pc2 = rtmp2 + row;
1502: pc3 = rtmp3 + row;
1503: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1504: pv = b->a + bdiag[row];
1505: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1506: *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1508: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1509: pv = b->a + bdiag[row+1]+1;
1510: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1511: for (j=0; j<nz; j++) {
1512: col = pj[j];
1513: rtmp1[col] -= mul1 * pv[j];
1514: rtmp2[col] -= mul2 * pv[j];
1515: rtmp3[col] -= mul3 * pv[j];
1516: }
1517: PetscLogFlops(3+6*nz);
1518: }
1519: row = *bjtmp++;
1520: }
1522: /* finished row i; check zero pivot, then stick row i into b->a */
1523: rs = 0.0;
1524: /* L part */
1525: pc1 = b->a + bi[i];
1526: pj = b->j + bi[i];
1527: nz = bi[i+1] - bi[i];
1528: for (j=0; j<nz; j++) {
1529: col = pj[j];
1530: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1531: }
1532: /* U part */
1533: pc1 = b->a + bdiag[i+1]+1;
1534: pj = b->j + bdiag[i+1]+1;
1535: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1536: for (j=0; j<nz; j++) {
1537: col = pj[j];
1538: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1539: }
1541: sctx.rs = rs;
1542: sctx.pv = rtmp1[i];
1543: MatPivotCheck(B,A,info,&sctx,i);
1544: if (sctx.newshift) break;
1545: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1546: *pc1 = 1.0/sctx.pv;
1548: /* Now take care of 1st column of diagonal 3x3 block. */
1549: pc2 = rtmp2 + i;
1550: pc3 = rtmp3 + i;
1551: if (*pc2 != 0.0 || *pc3 != 0.0) {
1552: mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1553: mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1554: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1555: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1556: for (j=0; j<nz; j++) {
1557: col = pj[j];
1558: rtmp2[col] -= mul2 * rtmp1[col];
1559: rtmp3[col] -= mul3 * rtmp1[col];
1560: }
1561: PetscLogFlops(2+4*nz);
1562: }
1564: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1565: rs = 0.0;
1566: /* L part */
1567: pc2 = b->a + bi[i+1];
1568: pj = b->j + bi[i+1];
1569: nz = bi[i+2] - bi[i+1];
1570: for (j=0; j<nz; j++) {
1571: col = pj[j];
1572: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1573: }
1574: /* U part */
1575: pc2 = b->a + bdiag[i+2]+1;
1576: pj = b->j + bdiag[i+2]+1;
1577: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1578: for (j=0; j<nz; j++) {
1579: col = pj[j];
1580: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1581: }
1583: sctx.rs = rs;
1584: sctx.pv = rtmp2[i+1];
1585: MatPivotCheck(B,A,info,&sctx,i+1);
1586: if (sctx.newshift) break;
1587: pc2 = b->a + bdiag[i+1];
1588: *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1590: /* Now take care of 2nd column of diagonal 3x3 block. */
1591: pc3 = rtmp3 + i+1;
1592: if (*pc3 != 0.0) {
1593: mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1594: pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */
1595: nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1596: for (j=0; j<nz; j++) {
1597: col = pj[j];
1598: rtmp3[col] -= mul3 * rtmp2[col];
1599: }
1600: PetscLogFlops(1+2*nz);
1601: }
1603: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1604: rs = 0.0;
1605: /* L part */
1606: pc3 = b->a + bi[i+2];
1607: pj = b->j + bi[i+2];
1608: nz = bi[i+3] - bi[i+2];
1609: for (j=0; j<nz; j++) {
1610: col = pj[j];
1611: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1612: }
1613: /* U part */
1614: pc3 = b->a + bdiag[i+3]+1;
1615: pj = b->j + bdiag[i+3]+1;
1616: nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1617: for (j=0; j<nz; j++) {
1618: col = pj[j];
1619: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1620: }
1622: sctx.rs = rs;
1623: sctx.pv = rtmp3[i+2];
1624: MatPivotCheck(B,A,info,&sctx,i+2);
1625: if (sctx.newshift) break;
1626: pc3 = b->a + bdiag[i+2];
1627: *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1628: break;
1629: case 4:
1630: /*----------*/
1631: /* zero rtmp */
1632: /* L part */
1633: nz = bi[i+1] - bi[i];
1634: bjtmp = bj + bi[i];
1635: for (j=0; j<nz; j++) {
1636: col = bjtmp[j];
1637: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1638: }
1640: /* U part */
1641: nz = bdiag[i]-bdiag[i+1];
1642: bjtmp = bj + bdiag[i+1]+1;
1643: for (j=0; j<nz; j++) {
1644: col = bjtmp[j];
1645: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1646: }
1648: /* load in initial (unfactored row) */
1649: nz = ai[r[i]+1] - ai[r[i]];
1650: ajtmp = aj + ai[r[i]];
1651: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1652: for (j=0; j<nz; j++) {
1653: col = ics[ajtmp[j]];
1654: rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1655: }
1656: /* ZeropivotApply(): shift the diagonal of the matrix */
1657: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1659: /* elimination */
1660: bjtmp = bj + bi[i];
1661: row = *bjtmp++; /* pivot row */
1662: nzL = bi[i+1] - bi[i];
1663: for (k=0; k < nzL; k++) {
1664: pc1 = rtmp1 + row;
1665: pc2 = rtmp2 + row;
1666: pc3 = rtmp3 + row;
1667: pc4 = rtmp4 + row;
1668: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1669: pv = b->a + bdiag[row];
1670: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1671: *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1673: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1674: pv = b->a + bdiag[row+1]+1;
1675: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1676: for (j=0; j<nz; j++) {
1677: col = pj[j];
1678: rtmp1[col] -= mul1 * pv[j];
1679: rtmp2[col] -= mul2 * pv[j];
1680: rtmp3[col] -= mul3 * pv[j];
1681: rtmp4[col] -= mul4 * pv[j];
1682: }
1683: PetscLogFlops(4+8*nz);
1684: }
1685: row = *bjtmp++;
1686: }
1688: /* finished row i; check zero pivot, then stick row i into b->a */
1689: rs = 0.0;
1690: /* L part */
1691: pc1 = b->a + bi[i];
1692: pj = b->j + bi[i];
1693: nz = bi[i+1] - bi[i];
1694: for (j=0; j<nz; j++) {
1695: col = pj[j];
1696: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1697: }
1698: /* U part */
1699: pc1 = b->a + bdiag[i+1]+1;
1700: pj = b->j + bdiag[i+1]+1;
1701: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1702: for (j=0; j<nz; j++) {
1703: col = pj[j];
1704: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1705: }
1707: sctx.rs = rs;
1708: sctx.pv = rtmp1[i];
1709: MatPivotCheck(B,A,info,&sctx,i);
1710: if (sctx.newshift) break;
1711: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1712: *pc1 = 1.0/sctx.pv;
1714: /* Now take care of 1st column of diagonal 4x4 block. */
1715: pc2 = rtmp2 + i;
1716: pc3 = rtmp3 + i;
1717: pc4 = rtmp4 + i;
1718: if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1719: mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1720: mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1721: mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1722: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1723: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1724: for (j=0; j<nz; j++) {
1725: col = pj[j];
1726: rtmp2[col] -= mul2 * rtmp1[col];
1727: rtmp3[col] -= mul3 * rtmp1[col];
1728: rtmp4[col] -= mul4 * rtmp1[col];
1729: }
1730: PetscLogFlops(3+6*nz);
1731: }
1733: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1734: rs = 0.0;
1735: /* L part */
1736: pc2 = b->a + bi[i+1];
1737: pj = b->j + bi[i+1];
1738: nz = bi[i+2] - bi[i+1];
1739: for (j=0; j<nz; j++) {
1740: col = pj[j];
1741: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1742: }
1743: /* U part */
1744: pc2 = b->a + bdiag[i+2]+1;
1745: pj = b->j + bdiag[i+2]+1;
1746: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1747: for (j=0; j<nz; j++) {
1748: col = pj[j];
1749: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1750: }
1752: sctx.rs = rs;
1753: sctx.pv = rtmp2[i+1];
1754: MatPivotCheck(B,A,info,&sctx,i+1);
1755: if (sctx.newshift) break;
1756: pc2 = b->a + bdiag[i+1];
1757: *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1759: /* Now take care of 2nd column of diagonal 4x4 block. */
1760: pc3 = rtmp3 + i+1;
1761: pc4 = rtmp4 + i+1;
1762: if (*pc3 != 0.0 || *pc4 != 0.0) {
1763: mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1764: mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1765: pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */
1766: nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1767: for (j=0; j<nz; j++) {
1768: col = pj[j];
1769: rtmp3[col] -= mul3 * rtmp2[col];
1770: rtmp4[col] -= mul4 * rtmp2[col];
1771: }
1772: PetscLogFlops(4*nz);
1773: }
1775: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1776: rs = 0.0;
1777: /* L part */
1778: pc3 = b->a + bi[i+2];
1779: pj = b->j + bi[i+2];
1780: nz = bi[i+3] - bi[i+2];
1781: for (j=0; j<nz; j++) {
1782: col = pj[j];
1783: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1784: }
1785: /* U part */
1786: pc3 = b->a + bdiag[i+3]+1;
1787: pj = b->j + bdiag[i+3]+1;
1788: nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1789: for (j=0; j<nz; j++) {
1790: col = pj[j];
1791: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1792: }
1794: sctx.rs = rs;
1795: sctx.pv = rtmp3[i+2];
1796: MatPivotCheck(B,A,info,&sctx,i+2);
1797: if (sctx.newshift) break;
1798: pc3 = b->a + bdiag[i+2];
1799: *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1801: /* Now take care of 3rd column of diagonal 4x4 block. */
1802: pc4 = rtmp4 + i+2;
1803: if (*pc4 != 0.0) {
1804: mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1805: pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */
1806: nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1807: for (j=0; j<nz; j++) {
1808: col = pj[j];
1809: rtmp4[col] -= mul4 * rtmp3[col];
1810: }
1811: PetscLogFlops(1+2*nz);
1812: }
1814: /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1815: rs = 0.0;
1816: /* L part */
1817: pc4 = b->a + bi[i+3];
1818: pj = b->j + bi[i+3];
1819: nz = bi[i+4] - bi[i+3];
1820: for (j=0; j<nz; j++) {
1821: col = pj[j];
1822: pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1823: }
1824: /* U part */
1825: pc4 = b->a + bdiag[i+4]+1;
1826: pj = b->j + bdiag[i+4]+1;
1827: nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1828: for (j=0; j<nz; j++) {
1829: col = pj[j];
1830: pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1831: }
1833: sctx.rs = rs;
1834: sctx.pv = rtmp4[i+3];
1835: MatPivotCheck(B,A,info,&sctx,i+3);
1836: if (sctx.newshift) break;
1837: pc4 = b->a + bdiag[i+3];
1838: *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1839: break;
1841: default:
1842: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1843: }
1844: if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1845: i += nodesz; /* Update the row */
1846: }
1848: /* MatPivotRefine() */
1849: if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1850: /*
1851: * if no shift in this attempt & shifting & started shifting & can refine,
1852: * then try lower shift
1853: */
1854: sctx.shift_hi = sctx.shift_fraction;
1855: sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1856: sctx.shift_amount = sctx.shift_fraction * sctx.shift_top;
1857: sctx.newshift = PETSC_TRUE;
1858: sctx.nshift++;
1859: }
1860: } while (sctx.newshift);
1862: PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);
1863: PetscFree(tmp_vec2);
1864: ISRestoreIndices(isicol,&ic);
1865: ISRestoreIndices(isrow,&r);
1867: if (b->inode.size) {
1868: C->ops->solve = MatSolve_SeqAIJ_Inode;
1869: } else {
1870: C->ops->solve = MatSolve_SeqAIJ;
1871: }
1872: C->ops->solveadd = MatSolveAdd_SeqAIJ;
1873: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ;
1874: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1875: C->ops->matsolve = MatMatSolve_SeqAIJ;
1876: C->assembled = PETSC_TRUE;
1877: C->preallocated = PETSC_TRUE;
1879: PetscLogFlops(C->cmap->n);
1881: /* MatShiftView(A,info,&sctx) */
1882: if (sctx.nshift) {
1883: if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1884: PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
1885: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1886: PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
1887: } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1888: PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);
1889: }
1890: }
1891: return(0);
1892: }
1896: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1897: {
1898: Mat C = B;
1899: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1900: IS iscol = b->col,isrow = b->row,isicol = b->icol;
1901: PetscErrorCode ierr;
1902: const PetscInt *r,*ic,*c,*ics;
1903: PetscInt n = A->rmap->n,*bi = b->i;
1904: PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1905: PetscInt i,j,idx,*bd = b->diag,node_max,nodesz;
1906: PetscInt *ai = a->i,*aj = a->j;
1907: PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1908: PetscScalar mul1,mul2,mul3,tmp;
1909: MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1910: const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1911: PetscReal rs=0.0;
1912: FactorShiftCtx sctx;
1915: sctx.shift_top = 0;
1916: sctx.nshift_max = 0;
1917: sctx.shift_lo = 0;
1918: sctx.shift_hi = 0;
1919: sctx.shift_fraction = 0;
1921: /* if both shift schemes are chosen by user, only use info->shiftpd */
1922: if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1923: sctx.shift_top = 0;
1924: for (i=0; i<n; i++) {
1925: /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1926: rs = 0.0;
1927: ajtmp = aj + ai[i];
1928: rtmp1 = aa + ai[i];
1929: nz = ai[i+1] - ai[i];
1930: for (j=0; j<nz; j++) {
1931: if (*ajtmp != i) {
1932: rs += PetscAbsScalar(*rtmp1++);
1933: } else {
1934: rs -= PetscRealPart(*rtmp1++);
1935: }
1936: ajtmp++;
1937: }
1938: if (rs>sctx.shift_top) sctx.shift_top = rs;
1939: }
1940: if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1941: sctx.shift_top *= 1.1;
1942: sctx.nshift_max = 5;
1943: sctx.shift_lo = 0.;
1944: sctx.shift_hi = 1.;
1945: }
1946: sctx.shift_amount = 0;
1947: sctx.nshift = 0;
1949: ISGetIndices(isrow,&r);
1950: ISGetIndices(iscol,&c);
1951: ISGetIndices(isicol,&ic);
1952: PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);
1953: ics = ic;
1955: node_max = a->inode.node_count;
1956: ns = a->inode.size;
1957: if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1959: /* If max inode size > 3, split it into two inodes.*/
1960: /* also map the inode sizes according to the ordering */
1961: PetscMalloc1(n+1,&tmp_vec1);
1962: for (i=0,j=0; i<node_max; ++i,++j) {
1963: if (ns[i]>3) {
1964: tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */
1965: ++j;
1966: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1967: } else {
1968: tmp_vec1[j] = ns[i];
1969: }
1970: }
1971: /* Use the correct node_max */
1972: node_max = j;
1974: /* Now reorder the inode info based on mat re-ordering info */
1975: /* First create a row -> inode_size_array_index map */
1976: PetscMalloc1(n+1,&nsmap);
1977: PetscMalloc1(node_max+1,&tmp_vec2);
1978: for (i=0,row=0; i<node_max; i++) {
1979: nodesz = tmp_vec1[i];
1980: for (j=0; j<nodesz; j++,row++) {
1981: nsmap[row] = i;
1982: }
1983: }
1984: /* Using nsmap, create a reordered ns structure */
1985: for (i=0,j=0; i< node_max; i++) {
1986: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1987: tmp_vec2[i] = nodesz;
1988: j += nodesz;
1989: }
1990: PetscFree(nsmap);
1991: PetscFree(tmp_vec1);
1992: /* Now use the correct ns */
1993: ns = tmp_vec2;
1995: do {
1996: sctx.newshift = PETSC_FALSE;
1997: /* Now loop over each block-row, and do the factorization */
1998: for (i=0,row=0; i<node_max; i++) {
1999: nodesz = ns[i];
2000: nz = bi[row+1] - bi[row];
2001: bjtmp = bj + bi[row];
2003: switch (nodesz) {
2004: case 1:
2005: for (j=0; j<nz; j++) {
2006: idx = bjtmp[j];
2007: rtmp11[idx] = 0.0;
2008: }
2010: /* load in initial (unfactored row) */
2011: idx = r[row];
2012: nz_tmp = ai[idx+1] - ai[idx];
2013: ajtmp = aj + ai[idx];
2014: v1 = aa + ai[idx];
2016: for (j=0; j<nz_tmp; j++) {
2017: idx = ics[ajtmp[j]];
2018: rtmp11[idx] = v1[j];
2019: }
2020: rtmp11[ics[r[row]]] += sctx.shift_amount;
2022: prow = *bjtmp++;
2023: while (prow < row) {
2024: pc1 = rtmp11 + prow;
2025: if (*pc1 != 0.0) {
2026: pv = ba + bd[prow];
2027: pj = nbj + bd[prow];
2028: mul1 = *pc1 * *pv++;
2029: *pc1 = mul1;
2030: nz_tmp = bi[prow+1] - bd[prow] - 1;
2031: PetscLogFlops(1+2*nz_tmp);
2032: for (j=0; j<nz_tmp; j++) {
2033: tmp = pv[j];
2034: idx = pj[j];
2035: rtmp11[idx] -= mul1 * tmp;
2036: }
2037: }
2038: prow = *bjtmp++;
2039: }
2040: pj = bj + bi[row];
2041: pc1 = ba + bi[row];
2043: sctx.pv = rtmp11[row];
2044: rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2045: rs = 0.0;
2046: for (j=0; j<nz; j++) {
2047: idx = pj[j];
2048: pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2049: if (idx != row) rs += PetscAbsScalar(pc1[j]);
2050: }
2051: sctx.rs = rs;
2052: MatPivotCheck(B,A,info,&sctx,row);
2053: if (sctx.newshift) goto endofwhile;
2054: break;
2056: case 2:
2057: for (j=0; j<nz; j++) {
2058: idx = bjtmp[j];
2059: rtmp11[idx] = 0.0;
2060: rtmp22[idx] = 0.0;
2061: }
2063: /* load in initial (unfactored row) */
2064: idx = r[row];
2065: nz_tmp = ai[idx+1] - ai[idx];
2066: ajtmp = aj + ai[idx];
2067: v1 = aa + ai[idx];
2068: v2 = aa + ai[idx+1];
2069: for (j=0; j<nz_tmp; j++) {
2070: idx = ics[ajtmp[j]];
2071: rtmp11[idx] = v1[j];
2072: rtmp22[idx] = v2[j];
2073: }
2074: rtmp11[ics[r[row]]] += sctx.shift_amount;
2075: rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2077: prow = *bjtmp++;
2078: while (prow < row) {
2079: pc1 = rtmp11 + prow;
2080: pc2 = rtmp22 + prow;
2081: if (*pc1 != 0.0 || *pc2 != 0.0) {
2082: pv = ba + bd[prow];
2083: pj = nbj + bd[prow];
2084: mul1 = *pc1 * *pv;
2085: mul2 = *pc2 * *pv;
2086: ++pv;
2087: *pc1 = mul1;
2088: *pc2 = mul2;
2090: nz_tmp = bi[prow+1] - bd[prow] - 1;
2091: for (j=0; j<nz_tmp; j++) {
2092: tmp = pv[j];
2093: idx = pj[j];
2094: rtmp11[idx] -= mul1 * tmp;
2095: rtmp22[idx] -= mul2 * tmp;
2096: }
2097: PetscLogFlops(2+4*nz_tmp);
2098: }
2099: prow = *bjtmp++;
2100: }
2102: /* Now take care of diagonal 2x2 block. Note: prow = row here */
2103: pc1 = rtmp11 + prow;
2104: pc2 = rtmp22 + prow;
2106: sctx.pv = *pc1;
2107: pj = bj + bi[prow];
2108: rs = 0.0;
2109: for (j=0; j<nz; j++) {
2110: idx = pj[j];
2111: if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2112: }
2113: sctx.rs = rs;
2114: MatPivotCheck(B,A,info,&sctx,row);
2115: if (sctx.newshift) goto endofwhile;
2117: if (*pc2 != 0.0) {
2118: pj = nbj + bd[prow];
2119: mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2120: *pc2 = mul2;
2121: nz_tmp = bi[prow+1] - bd[prow] - 1;
2122: for (j=0; j<nz_tmp; j++) {
2123: idx = pj[j];
2124: tmp = rtmp11[idx];
2125: rtmp22[idx] -= mul2 * tmp;
2126: }
2127: PetscLogFlops(1+2*nz_tmp);
2128: }
2130: pj = bj + bi[row];
2131: pc1 = ba + bi[row];
2132: pc2 = ba + bi[row+1];
2134: sctx.pv = rtmp22[row+1];
2135: rs = 0.0;
2136: rtmp11[row] = 1.0/rtmp11[row];
2137: rtmp22[row+1] = 1.0/rtmp22[row+1];
2138: /* copy row entries from dense representation to sparse */
2139: for (j=0; j<nz; j++) {
2140: idx = pj[j];
2141: pc1[j] = rtmp11[idx];
2142: pc2[j] = rtmp22[idx];
2143: if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2144: }
2145: sctx.rs = rs;
2146: MatPivotCheck(B,A,info,&sctx,row+1);
2147: if (sctx.newshift) goto endofwhile;
2148: break;
2150: case 3:
2151: for (j=0; j<nz; j++) {
2152: idx = bjtmp[j];
2153: rtmp11[idx] = 0.0;
2154: rtmp22[idx] = 0.0;
2155: rtmp33[idx] = 0.0;
2156: }
2157: /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2158: idx = r[row];
2159: nz_tmp = ai[idx+1] - ai[idx];
2160: ajtmp = aj + ai[idx];
2161: v1 = aa + ai[idx];
2162: v2 = aa + ai[idx+1];
2163: v3 = aa + ai[idx+2];
2164: for (j=0; j<nz_tmp; j++) {
2165: idx = ics[ajtmp[j]];
2166: rtmp11[idx] = v1[j];
2167: rtmp22[idx] = v2[j];
2168: rtmp33[idx] = v3[j];
2169: }
2170: rtmp11[ics[r[row]]] += sctx.shift_amount;
2171: rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2172: rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2174: /* loop over all pivot row blocks above this row block */
2175: prow = *bjtmp++;
2176: while (prow < row) {
2177: pc1 = rtmp11 + prow;
2178: pc2 = rtmp22 + prow;
2179: pc3 = rtmp33 + prow;
2180: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2181: pv = ba + bd[prow];
2182: pj = nbj + bd[prow];
2183: mul1 = *pc1 * *pv;
2184: mul2 = *pc2 * *pv;
2185: mul3 = *pc3 * *pv;
2186: ++pv;
2187: *pc1 = mul1;
2188: *pc2 = mul2;
2189: *pc3 = mul3;
2191: nz_tmp = bi[prow+1] - bd[prow] - 1;
2192: /* update this row based on pivot row */
2193: for (j=0; j<nz_tmp; j++) {
2194: tmp = pv[j];
2195: idx = pj[j];
2196: rtmp11[idx] -= mul1 * tmp;
2197: rtmp22[idx] -= mul2 * tmp;
2198: rtmp33[idx] -= mul3 * tmp;
2199: }
2200: PetscLogFlops(3+6*nz_tmp);
2201: }
2202: prow = *bjtmp++;
2203: }
2205: /* Now take care of diagonal 3x3 block in this set of rows */
2206: /* note: prow = row here */
2207: pc1 = rtmp11 + prow;
2208: pc2 = rtmp22 + prow;
2209: pc3 = rtmp33 + prow;
2211: sctx.pv = *pc1;
2212: pj = bj + bi[prow];
2213: rs = 0.0;
2214: for (j=0; j<nz; j++) {
2215: idx = pj[j];
2216: if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2217: }
2218: sctx.rs = rs;
2219: MatPivotCheck(B,A,info,&sctx,row);
2220: if (sctx.newshift) goto endofwhile;
2222: if (*pc2 != 0.0 || *pc3 != 0.0) {
2223: mul2 = (*pc2)/(*pc1);
2224: mul3 = (*pc3)/(*pc1);
2225: *pc2 = mul2;
2226: *pc3 = mul3;
2227: nz_tmp = bi[prow+1] - bd[prow] - 1;
2228: pj = nbj + bd[prow];
2229: for (j=0; j<nz_tmp; j++) {
2230: idx = pj[j];
2231: tmp = rtmp11[idx];
2232: rtmp22[idx] -= mul2 * tmp;
2233: rtmp33[idx] -= mul3 * tmp;
2234: }
2235: PetscLogFlops(2+4*nz_tmp);
2236: }
2237: ++prow;
2239: pc2 = rtmp22 + prow;
2240: pc3 = rtmp33 + prow;
2241: sctx.pv = *pc2;
2242: pj = bj + bi[prow];
2243: rs = 0.0;
2244: for (j=0; j<nz; j++) {
2245: idx = pj[j];
2246: if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2247: }
2248: sctx.rs = rs;
2249: MatPivotCheck(B,A,info,&sctx,row+1);
2250: if (sctx.newshift) goto endofwhile;
2252: if (*pc3 != 0.0) {
2253: mul3 = (*pc3)/(*pc2);
2254: *pc3 = mul3;
2255: pj = nbj + bd[prow];
2256: nz_tmp = bi[prow+1] - bd[prow] - 1;
2257: for (j=0; j<nz_tmp; j++) {
2258: idx = pj[j];
2259: tmp = rtmp22[idx];
2260: rtmp33[idx] -= mul3 * tmp;
2261: }
2262: PetscLogFlops(1+2*nz_tmp);
2263: }
2265: pj = bj + bi[row];
2266: pc1 = ba + bi[row];
2267: pc2 = ba + bi[row+1];
2268: pc3 = ba + bi[row+2];
2270: sctx.pv = rtmp33[row+2];
2271: rs = 0.0;
2272: rtmp11[row] = 1.0/rtmp11[row];
2273: rtmp22[row+1] = 1.0/rtmp22[row+1];
2274: rtmp33[row+2] = 1.0/rtmp33[row+2];
2275: /* copy row entries from dense representation to sparse */
2276: for (j=0; j<nz; j++) {
2277: idx = pj[j];
2278: pc1[j] = rtmp11[idx];
2279: pc2[j] = rtmp22[idx];
2280: pc3[j] = rtmp33[idx];
2281: if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2282: }
2284: sctx.rs = rs;
2285: MatPivotCheck(B,A,info,&sctx,row+2);
2286: if (sctx.newshift) goto endofwhile;
2287: break;
2289: default:
2290: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2291: }
2292: row += nodesz; /* Update the row */
2293: }
2294: endofwhile:;
2295: } while (sctx.newshift);
2296: PetscFree3(rtmp11,rtmp22,rtmp33);
2297: PetscFree(tmp_vec2);
2298: ISRestoreIndices(isicol,&ic);
2299: ISRestoreIndices(isrow,&r);
2300: ISRestoreIndices(iscol,&c);
2302: (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2303: /* do not set solve add, since MatSolve_Inode + Add is faster */
2304: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace;
2305: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2306: C->assembled = PETSC_TRUE;
2307: C->preallocated = PETSC_TRUE;
2308: if (sctx.nshift) {
2309: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2310: PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
2311: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2312: PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
2313: }
2314: }
2315: PetscLogFlops(C->cmap->n);
2316: MatSeqAIJCheckInode(C);
2317: return(0);
2318: }
2321: /* ----------------------------------------------------------- */
2324: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2325: {
2326: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2327: IS iscol = a->col,isrow = a->row;
2328: PetscErrorCode ierr;
2329: const PetscInt *r,*c,*rout,*cout;
2330: PetscInt i,j,n = A->rmap->n;
2331: PetscInt node_max,row,nsz,aii,i0,i1,nz;
2332: const PetscInt *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2333: PetscScalar *x,*tmp,*tmps,tmp0,tmp1;
2334: PetscScalar sum1,sum2,sum3,sum4,sum5;
2335: const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2336: const PetscScalar *b;
2339: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2340: node_max = a->inode.node_count;
2341: ns = a->inode.size; /* Node Size array */
2343: VecGetArrayRead(bb,&b);
2344: VecGetArray(xx,&x);
2345: tmp = a->solve_work;
2347: ISGetIndices(isrow,&rout); r = rout;
2348: ISGetIndices(iscol,&cout); c = cout;
2350: /* forward solve the lower triangular */
2351: tmps = tmp;
2352: aa = a_a;
2353: aj = a_j;
2354: ad = a->diag;
2356: for (i = 0,row = 0; i< node_max; ++i) {
2357: nsz = ns[i];
2358: aii = ai[row];
2359: v1 = aa + aii;
2360: vi = aj + aii;
2361: nz = ai[row+1]- ai[row];
2363: if (i < node_max-1) {
2364: /* Prefetch the indices for the next block */
2365: PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2366: /* Prefetch the data for the next block */
2367: PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2368: }
2370: switch (nsz) { /* Each loop in 'case' is unrolled */
2371: case 1:
2372: sum1 = b[r[row]];
2373: for (j=0; j<nz-1; j+=2) {
2374: i0 = vi[j];
2375: i1 = vi[j+1];
2376: tmp0 = tmps[i0];
2377: tmp1 = tmps[i1];
2378: sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2379: }
2380: if (j == nz-1) {
2381: tmp0 = tmps[vi[j]];
2382: sum1 -= v1[j]*tmp0;
2383: }
2384: tmp[row++]=sum1;
2385: break;
2386: case 2:
2387: sum1 = b[r[row]];
2388: sum2 = b[r[row+1]];
2389: v2 = aa + ai[row+1];
2391: for (j=0; j<nz-1; j+=2) {
2392: i0 = vi[j];
2393: i1 = vi[j+1];
2394: tmp0 = tmps[i0];
2395: tmp1 = tmps[i1];
2396: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2397: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2398: }
2399: if (j == nz-1) {
2400: tmp0 = tmps[vi[j]];
2401: sum1 -= v1[j] *tmp0;
2402: sum2 -= v2[j] *tmp0;
2403: }
2404: sum2 -= v2[nz] * sum1;
2405: tmp[row++]=sum1;
2406: tmp[row++]=sum2;
2407: break;
2408: case 3:
2409: sum1 = b[r[row]];
2410: sum2 = b[r[row+1]];
2411: sum3 = b[r[row+2]];
2412: v2 = aa + ai[row+1];
2413: v3 = aa + ai[row+2];
2415: for (j=0; j<nz-1; j+=2) {
2416: i0 = vi[j];
2417: i1 = vi[j+1];
2418: tmp0 = tmps[i0];
2419: tmp1 = tmps[i1];
2420: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2421: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2422: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2423: }
2424: if (j == nz-1) {
2425: tmp0 = tmps[vi[j]];
2426: sum1 -= v1[j] *tmp0;
2427: sum2 -= v2[j] *tmp0;
2428: sum3 -= v3[j] *tmp0;
2429: }
2430: sum2 -= v2[nz] * sum1;
2431: sum3 -= v3[nz] * sum1;
2432: sum3 -= v3[nz+1] * sum2;
2433: tmp[row++]=sum1;
2434: tmp[row++]=sum2;
2435: tmp[row++]=sum3;
2436: break;
2438: case 4:
2439: sum1 = b[r[row]];
2440: sum2 = b[r[row+1]];
2441: sum3 = b[r[row+2]];
2442: sum4 = b[r[row+3]];
2443: v2 = aa + ai[row+1];
2444: v3 = aa + ai[row+2];
2445: v4 = aa + ai[row+3];
2447: for (j=0; j<nz-1; j+=2) {
2448: i0 = vi[j];
2449: i1 = vi[j+1];
2450: tmp0 = tmps[i0];
2451: tmp1 = tmps[i1];
2452: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2453: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2454: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2455: sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2456: }
2457: if (j == nz-1) {
2458: tmp0 = tmps[vi[j]];
2459: sum1 -= v1[j] *tmp0;
2460: sum2 -= v2[j] *tmp0;
2461: sum3 -= v3[j] *tmp0;
2462: sum4 -= v4[j] *tmp0;
2463: }
2464: sum2 -= v2[nz] * sum1;
2465: sum3 -= v3[nz] * sum1;
2466: sum4 -= v4[nz] * sum1;
2467: sum3 -= v3[nz+1] * sum2;
2468: sum4 -= v4[nz+1] * sum2;
2469: sum4 -= v4[nz+2] * sum3;
2471: tmp[row++]=sum1;
2472: tmp[row++]=sum2;
2473: tmp[row++]=sum3;
2474: tmp[row++]=sum4;
2475: break;
2476: case 5:
2477: sum1 = b[r[row]];
2478: sum2 = b[r[row+1]];
2479: sum3 = b[r[row+2]];
2480: sum4 = b[r[row+3]];
2481: sum5 = b[r[row+4]];
2482: v2 = aa + ai[row+1];
2483: v3 = aa + ai[row+2];
2484: v4 = aa + ai[row+3];
2485: v5 = aa + ai[row+4];
2487: for (j=0; j<nz-1; j+=2) {
2488: i0 = vi[j];
2489: i1 = vi[j+1];
2490: tmp0 = tmps[i0];
2491: tmp1 = tmps[i1];
2492: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2493: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2494: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2495: sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2496: sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2497: }
2498: if (j == nz-1) {
2499: tmp0 = tmps[vi[j]];
2500: sum1 -= v1[j] *tmp0;
2501: sum2 -= v2[j] *tmp0;
2502: sum3 -= v3[j] *tmp0;
2503: sum4 -= v4[j] *tmp0;
2504: sum5 -= v5[j] *tmp0;
2505: }
2507: sum2 -= v2[nz] * sum1;
2508: sum3 -= v3[nz] * sum1;
2509: sum4 -= v4[nz] * sum1;
2510: sum5 -= v5[nz] * sum1;
2511: sum3 -= v3[nz+1] * sum2;
2512: sum4 -= v4[nz+1] * sum2;
2513: sum5 -= v5[nz+1] * sum2;
2514: sum4 -= v4[nz+2] * sum3;
2515: sum5 -= v5[nz+2] * sum3;
2516: sum5 -= v5[nz+3] * sum4;
2518: tmp[row++]=sum1;
2519: tmp[row++]=sum2;
2520: tmp[row++]=sum3;
2521: tmp[row++]=sum4;
2522: tmp[row++]=sum5;
2523: break;
2524: default:
2525: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2526: }
2527: }
2528: /* backward solve the upper triangular */
2529: for (i=node_max -1,row = n-1; i>=0; i--) {
2530: nsz = ns[i];
2531: aii = ad[row+1] + 1;
2532: v1 = aa + aii;
2533: vi = aj + aii;
2534: nz = ad[row]- ad[row+1] - 1;
2536: if (i > 0) {
2537: /* Prefetch the indices for the next block */
2538: PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2539: /* Prefetch the data for the next block */
2540: PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2541: }
2543: switch (nsz) { /* Each loop in 'case' is unrolled */
2544: case 1:
2545: sum1 = tmp[row];
2547: for (j=0; j<nz-1; j+=2) {
2548: i0 = vi[j];
2549: i1 = vi[j+1];
2550: tmp0 = tmps[i0];
2551: tmp1 = tmps[i1];
2552: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2553: }
2554: if (j == nz-1) {
2555: tmp0 = tmps[vi[j]];
2556: sum1 -= v1[j]*tmp0;
2557: }
2558: x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2559: break;
2560: case 2:
2561: sum1 = tmp[row];
2562: sum2 = tmp[row-1];
2563: v2 = aa + ad[row] + 1;
2564: for (j=0; j<nz-1; j+=2) {
2565: i0 = vi[j];
2566: i1 = vi[j+1];
2567: tmp0 = tmps[i0];
2568: tmp1 = tmps[i1];
2569: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2570: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2571: }
2572: if (j == nz-1) {
2573: tmp0 = tmps[vi[j]];
2574: sum1 -= v1[j]* tmp0;
2575: sum2 -= v2[j+1]* tmp0;
2576: }
2578: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2579: sum2 -= v2[0] * tmp0;
2580: x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2581: break;
2582: case 3:
2583: sum1 = tmp[row];
2584: sum2 = tmp[row -1];
2585: sum3 = tmp[row -2];
2586: v2 = aa + ad[row] + 1;
2587: v3 = aa + ad[row -1] + 1;
2588: for (j=0; j<nz-1; j+=2) {
2589: i0 = vi[j];
2590: i1 = vi[j+1];
2591: tmp0 = tmps[i0];
2592: tmp1 = tmps[i1];
2593: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2594: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2595: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2596: }
2597: if (j== nz-1) {
2598: tmp0 = tmps[vi[j]];
2599: sum1 -= v1[j] * tmp0;
2600: sum2 -= v2[j+1] * tmp0;
2601: sum3 -= v3[j+2] * tmp0;
2602: }
2603: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2604: sum2 -= v2[0]* tmp0;
2605: sum3 -= v3[1] * tmp0;
2606: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2607: sum3 -= v3[0]* tmp0;
2608: x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2610: break;
2611: case 4:
2612: sum1 = tmp[row];
2613: sum2 = tmp[row -1];
2614: sum3 = tmp[row -2];
2615: sum4 = tmp[row -3];
2616: v2 = aa + ad[row]+1;
2617: v3 = aa + ad[row -1]+1;
2618: v4 = aa + ad[row -2]+1;
2620: for (j=0; j<nz-1; j+=2) {
2621: i0 = vi[j];
2622: i1 = vi[j+1];
2623: tmp0 = tmps[i0];
2624: tmp1 = tmps[i1];
2625: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2626: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2627: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2628: sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2629: }
2630: if (j== nz-1) {
2631: tmp0 = tmps[vi[j]];
2632: sum1 -= v1[j] * tmp0;
2633: sum2 -= v2[j+1] * tmp0;
2634: sum3 -= v3[j+2] * tmp0;
2635: sum4 -= v4[j+3] * tmp0;
2636: }
2638: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2639: sum2 -= v2[0] * tmp0;
2640: sum3 -= v3[1] * tmp0;
2641: sum4 -= v4[2] * tmp0;
2642: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2643: sum3 -= v3[0] * tmp0;
2644: sum4 -= v4[1] * tmp0;
2645: tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2646: sum4 -= v4[0] * tmp0;
2647: x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2648: break;
2649: case 5:
2650: sum1 = tmp[row];
2651: sum2 = tmp[row -1];
2652: sum3 = tmp[row -2];
2653: sum4 = tmp[row -3];
2654: sum5 = tmp[row -4];
2655: v2 = aa + ad[row]+1;
2656: v3 = aa + ad[row -1]+1;
2657: v4 = aa + ad[row -2]+1;
2658: v5 = aa + ad[row -3]+1;
2659: for (j=0; j<nz-1; j+=2) {
2660: i0 = vi[j];
2661: i1 = vi[j+1];
2662: tmp0 = tmps[i0];
2663: tmp1 = tmps[i1];
2664: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2665: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2666: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2667: sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2668: sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2669: }
2670: if (j==nz-1) {
2671: tmp0 = tmps[vi[j]];
2672: sum1 -= v1[j] * tmp0;
2673: sum2 -= v2[j+1] * tmp0;
2674: sum3 -= v3[j+2] * tmp0;
2675: sum4 -= v4[j+3] * tmp0;
2676: sum5 -= v5[j+4] * tmp0;
2677: }
2679: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2680: sum2 -= v2[0] * tmp0;
2681: sum3 -= v3[1] * tmp0;
2682: sum4 -= v4[2] * tmp0;
2683: sum5 -= v5[3] * tmp0;
2684: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2685: sum3 -= v3[0] * tmp0;
2686: sum4 -= v4[1] * tmp0;
2687: sum5 -= v5[2] * tmp0;
2688: tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2689: sum4 -= v4[0] * tmp0;
2690: sum5 -= v5[1] * tmp0;
2691: tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2692: sum5 -= v5[0] * tmp0;
2693: x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2694: break;
2695: default:
2696: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2697: }
2698: }
2699: ISRestoreIndices(isrow,&rout);
2700: ISRestoreIndices(iscol,&cout);
2701: VecRestoreArrayRead(bb,&b);
2702: VecRestoreArray(xx,&x);
2703: PetscLogFlops(2.0*a->nz - A->cmap->n);
2704: return(0);
2705: }
2708: /*
2709: Makes a longer coloring[] array and calls the usual code with that
2710: */
2713: PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2714: {
2715: Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data;
2716: PetscErrorCode ierr;
2717: PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2718: PetscInt *colorused,i;
2719: ISColoringValue *newcolor;
2722: PetscMalloc1(n+1,&newcolor);
2723: /* loop over inodes, marking a color for each column*/
2724: row = 0;
2725: for (i=0; i<m; i++) {
2726: for (j=0; j<ns[i]; j++) {
2727: newcolor[row++] = coloring[i] + j*ncolors;
2728: }
2729: }
2731: /* eliminate unneeded colors */
2732: PetscCalloc1(5*ncolors,&colorused);
2733: for (i=0; i<n; i++) {
2734: colorused[newcolor[i]] = 1;
2735: }
2737: for (i=1; i<5*ncolors; i++) {
2738: colorused[i] += colorused[i-1];
2739: }
2740: ncolors = colorused[5*ncolors-1];
2741: for (i=0; i<n; i++) {
2742: newcolor[i] = colorused[newcolor[i]]-1;
2743: }
2744: PetscFree(colorused);
2745: ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);
2746: PetscFree(coloring);
2747: return(0);
2748: }
2750: #include <petsc/private/kernels/blockinvert.h>
2754: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2755: {
2756: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2757: PetscScalar sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2758: MatScalar *ibdiag,*bdiag,work[25],*t;
2759: PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2760: const MatScalar *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2761: const PetscScalar *xb, *b;
2762: PetscReal zeropivot = 1.0e-15, shift = 0.0;
2763: PetscErrorCode ierr;
2764: PetscInt n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2765: PetscInt sz,k,ipvt[5];
2766: PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE;
2767: const PetscInt *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;
2770: allowzeropivot = PetscNot(A->erroriffailure);
2771: if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2772: if (fshift == -1.0) fshift = 0.0; /* negative fshift indicates do not error on zero diagonal; this code never errors on zero diagonal */
2773: if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2775: if (!a->inode.ibdiagvalid) {
2776: if (!a->inode.ibdiag) {
2777: /* calculate space needed for diagonal blocks */
2778: for (i=0; i<m; i++) {
2779: cnt += sizes[i]*sizes[i];
2780: }
2781: a->inode.bdiagsize = cnt;
2783: PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);
2784: }
2786: /* copy over the diagonal blocks and invert them */
2787: ibdiag = a->inode.ibdiag;
2788: bdiag = a->inode.bdiag;
2789: cnt = 0;
2790: for (i=0, row = 0; i<m; i++) {
2791: for (j=0; j<sizes[i]; j++) {
2792: for (k=0; k<sizes[i]; k++) {
2793: bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2794: }
2795: }
2796: PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));
2798: switch (sizes[i]) {
2799: case 1:
2800: /* Create matrix data structure */
2801: if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2802: if (allowzeropivot) {
2803: zeropivotdetected = PETSC_TRUE;
2804: PetscInfo1(A,"Zero pivot, row %D\n",row);
2805: } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2806: }
2807: ibdiag[cnt] = 1.0/ibdiag[cnt];
2808: break;
2809: case 2:
2810: PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2811: break;
2812: case 3:
2813: PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2814: break;
2815: case 4:
2816: PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2817: break;
2818: case 5:
2819: PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);
2820: break;
2821: default:
2822: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2823: }
2824: if (zeropivotdetected) {
2825: A->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2826: }
2828: cnt += sizes[i]*sizes[i];
2829: row += sizes[i];
2830: }
2831: a->inode.ibdiagvalid = PETSC_TRUE;
2832: }
2833: ibdiag = a->inode.ibdiag;
2834: bdiag = a->inode.bdiag;
2835: t = a->inode.ssor_work;
2837: VecGetArray(xx,&x);
2838: VecGetArrayRead(bb,&b);
2839: /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2840: if (flag & SOR_ZERO_INITIAL_GUESS) {
2841: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2843: for (i=0, row=0; i<m; i++) {
2844: sz = diag[row] - ii[row];
2845: v1 = a->a + ii[row];
2846: idx = a->j + ii[row];
2848: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2849: switch (sizes[i]) {
2850: case 1:
2852: sum1 = b[row];
2853: for (n = 0; n<sz-1; n+=2) {
2854: i1 = idx[0];
2855: i2 = idx[1];
2856: idx += 2;
2857: tmp0 = x[i1];
2858: tmp1 = x[i2];
2859: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2860: }
2862: if (n == sz-1) {
2863: tmp0 = x[*idx];
2864: sum1 -= *v1 * tmp0;
2865: }
2866: t[row] = sum1;
2867: x[row++] = sum1*(*ibdiag++);
2868: break;
2869: case 2:
2870: v2 = a->a + ii[row+1];
2871: sum1 = b[row];
2872: sum2 = b[row+1];
2873: for (n = 0; n<sz-1; n+=2) {
2874: i1 = idx[0];
2875: i2 = idx[1];
2876: idx += 2;
2877: tmp0 = x[i1];
2878: tmp1 = x[i2];
2879: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2880: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2881: }
2883: if (n == sz-1) {
2884: tmp0 = x[*idx];
2885: sum1 -= v1[0] * tmp0;
2886: sum2 -= v2[0] * tmp0;
2887: }
2888: t[row] = sum1;
2889: t[row+1] = sum2;
2890: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2891: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2892: ibdiag += 4;
2893: break;
2894: case 3:
2895: v2 = a->a + ii[row+1];
2896: v3 = a->a + ii[row+2];
2897: sum1 = b[row];
2898: sum2 = b[row+1];
2899: sum3 = b[row+2];
2900: for (n = 0; n<sz-1; n+=2) {
2901: i1 = idx[0];
2902: i2 = idx[1];
2903: idx += 2;
2904: tmp0 = x[i1];
2905: tmp1 = x[i2];
2906: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2907: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2908: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2909: }
2911: if (n == sz-1) {
2912: tmp0 = x[*idx];
2913: sum1 -= v1[0] * tmp0;
2914: sum2 -= v2[0] * tmp0;
2915: sum3 -= v3[0] * tmp0;
2916: }
2917: t[row] = sum1;
2918: t[row+1] = sum2;
2919: t[row+2] = sum3;
2920: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2921: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2922: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2923: ibdiag += 9;
2924: break;
2925: case 4:
2926: v2 = a->a + ii[row+1];
2927: v3 = a->a + ii[row+2];
2928: v4 = a->a + ii[row+3];
2929: sum1 = b[row];
2930: sum2 = b[row+1];
2931: sum3 = b[row+2];
2932: sum4 = b[row+3];
2933: for (n = 0; n<sz-1; n+=2) {
2934: i1 = idx[0];
2935: i2 = idx[1];
2936: idx += 2;
2937: tmp0 = x[i1];
2938: tmp1 = x[i2];
2939: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2940: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2941: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2942: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2943: }
2945: if (n == sz-1) {
2946: tmp0 = x[*idx];
2947: sum1 -= v1[0] * tmp0;
2948: sum2 -= v2[0] * tmp0;
2949: sum3 -= v3[0] * tmp0;
2950: sum4 -= v4[0] * tmp0;
2951: }
2952: t[row] = sum1;
2953: t[row+1] = sum2;
2954: t[row+2] = sum3;
2955: t[row+3] = sum4;
2956: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2957: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2958: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2959: x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2960: ibdiag += 16;
2961: break;
2962: case 5:
2963: v2 = a->a + ii[row+1];
2964: v3 = a->a + ii[row+2];
2965: v4 = a->a + ii[row+3];
2966: v5 = a->a + ii[row+4];
2967: sum1 = b[row];
2968: sum2 = b[row+1];
2969: sum3 = b[row+2];
2970: sum4 = b[row+3];
2971: sum5 = b[row+4];
2972: for (n = 0; n<sz-1; n+=2) {
2973: i1 = idx[0];
2974: i2 = idx[1];
2975: idx += 2;
2976: tmp0 = x[i1];
2977: tmp1 = x[i2];
2978: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2979: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2980: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2981: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2982: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2983: }
2985: if (n == sz-1) {
2986: tmp0 = x[*idx];
2987: sum1 -= v1[0] * tmp0;
2988: sum2 -= v2[0] * tmp0;
2989: sum3 -= v3[0] * tmp0;
2990: sum4 -= v4[0] * tmp0;
2991: sum5 -= v5[0] * tmp0;
2992: }
2993: t[row] = sum1;
2994: t[row+1] = sum2;
2995: t[row+2] = sum3;
2996: t[row+3] = sum4;
2997: t[row+4] = sum5;
2998: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2999: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3000: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3001: x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3002: x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3003: ibdiag += 25;
3004: break;
3005: default:
3006: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3007: }
3008: }
3010: xb = t;
3011: PetscLogFlops(a->nz);
3012: } else xb = b;
3013: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3015: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3016: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3017: ibdiag -= sizes[i]*sizes[i];
3018: sz = ii[row+1] - diag[row] - 1;
3019: v1 = a->a + diag[row] + 1;
3020: idx = a->j + diag[row] + 1;
3022: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3023: switch (sizes[i]) {
3024: case 1:
3026: sum1 = xb[row];
3027: for (n = 0; n<sz-1; n+=2) {
3028: i1 = idx[0];
3029: i2 = idx[1];
3030: idx += 2;
3031: tmp0 = x[i1];
3032: tmp1 = x[i2];
3033: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3034: }
3036: if (n == sz-1) {
3037: tmp0 = x[*idx];
3038: sum1 -= *v1*tmp0;
3039: }
3040: x[row--] = sum1*(*ibdiag);
3041: break;
3043: case 2:
3045: sum1 = xb[row];
3046: sum2 = xb[row-1];
3047: /* note that sum1 is associated with the second of the two rows */
3048: v2 = a->a + diag[row-1] + 2;
3049: for (n = 0; n<sz-1; n+=2) {
3050: i1 = idx[0];
3051: i2 = idx[1];
3052: idx += 2;
3053: tmp0 = x[i1];
3054: tmp1 = x[i2];
3055: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3056: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3057: }
3059: if (n == sz-1) {
3060: tmp0 = x[*idx];
3061: sum1 -= *v1*tmp0;
3062: sum2 -= *v2*tmp0;
3063: }
3064: x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3065: x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3066: break;
3067: case 3:
3069: sum1 = xb[row];
3070: sum2 = xb[row-1];
3071: sum3 = xb[row-2];
3072: v2 = a->a + diag[row-1] + 2;
3073: v3 = a->a + diag[row-2] + 3;
3074: for (n = 0; n<sz-1; n+=2) {
3075: i1 = idx[0];
3076: i2 = idx[1];
3077: idx += 2;
3078: tmp0 = x[i1];
3079: tmp1 = x[i2];
3080: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3081: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3082: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3083: }
3085: if (n == sz-1) {
3086: tmp0 = x[*idx];
3087: sum1 -= *v1*tmp0;
3088: sum2 -= *v2*tmp0;
3089: sum3 -= *v3*tmp0;
3090: }
3091: x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3092: x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3093: x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3094: break;
3095: case 4:
3097: sum1 = xb[row];
3098: sum2 = xb[row-1];
3099: sum3 = xb[row-2];
3100: sum4 = xb[row-3];
3101: v2 = a->a + diag[row-1] + 2;
3102: v3 = a->a + diag[row-2] + 3;
3103: v4 = a->a + diag[row-3] + 4;
3104: for (n = 0; n<sz-1; n+=2) {
3105: i1 = idx[0];
3106: i2 = idx[1];
3107: idx += 2;
3108: tmp0 = x[i1];
3109: tmp1 = x[i2];
3110: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3111: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3112: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3113: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3114: }
3116: if (n == sz-1) {
3117: tmp0 = x[*idx];
3118: sum1 -= *v1*tmp0;
3119: sum2 -= *v2*tmp0;
3120: sum3 -= *v3*tmp0;
3121: sum4 -= *v4*tmp0;
3122: }
3123: x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3124: x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3125: x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3126: x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3127: break;
3128: case 5:
3130: sum1 = xb[row];
3131: sum2 = xb[row-1];
3132: sum3 = xb[row-2];
3133: sum4 = xb[row-3];
3134: sum5 = xb[row-4];
3135: v2 = a->a + diag[row-1] + 2;
3136: v3 = a->a + diag[row-2] + 3;
3137: v4 = a->a + diag[row-3] + 4;
3138: v5 = a->a + diag[row-4] + 5;
3139: for (n = 0; n<sz-1; n+=2) {
3140: i1 = idx[0];
3141: i2 = idx[1];
3142: idx += 2;
3143: tmp0 = x[i1];
3144: tmp1 = x[i2];
3145: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3146: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3147: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3148: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3149: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3150: }
3152: if (n == sz-1) {
3153: tmp0 = x[*idx];
3154: sum1 -= *v1*tmp0;
3155: sum2 -= *v2*tmp0;
3156: sum3 -= *v3*tmp0;
3157: sum4 -= *v4*tmp0;
3158: sum5 -= *v5*tmp0;
3159: }
3160: x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3161: x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3162: x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3163: x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3164: x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3165: break;
3166: default:
3167: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3168: }
3169: }
3171: PetscLogFlops(a->nz);
3172: }
3173: its--;
3174: }
3175: while (its--) {
3177: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3178: for (i=0, row=0, ibdiag = a->inode.ibdiag;
3179: i<m;
3180: row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {
3182: sz = diag[row] - ii[row];
3183: v1 = a->a + ii[row];
3184: idx = a->j + ii[row];
3185: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3186: switch (sizes[i]) {
3187: case 1:
3188: sum1 = b[row];
3189: for (n = 0; n<sz-1; n+=2) {
3190: i1 = idx[0];
3191: i2 = idx[1];
3192: idx += 2;
3193: tmp0 = x[i1];
3194: tmp1 = x[i2];
3195: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3196: }
3197: if (n == sz-1) {
3198: tmp0 = x[*idx++];
3199: sum1 -= *v1 * tmp0;
3200: v1++;
3201: }
3202: t[row] = sum1;
3203: sz = ii[row+1] - diag[row] - 1;
3204: idx = a->j + diag[row] + 1;
3205: v1 += 1;
3206: for (n = 0; n<sz-1; n+=2) {
3207: i1 = idx[0];
3208: i2 = idx[1];
3209: idx += 2;
3210: tmp0 = x[i1];
3211: tmp1 = x[i2];
3212: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3213: }
3214: if (n == sz-1) {
3215: tmp0 = x[*idx++];
3216: sum1 -= *v1 * tmp0;
3217: }
3218: /* in MatSOR_SeqAIJ this line would be
3219: *
3220: * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3221: *
3222: * but omega == 1, so this becomes
3223: *
3224: * x[row] = sum1*(*ibdiag++);
3225: *
3226: */
3227: x[row] = sum1*(*ibdiag);
3228: break;
3229: case 2:
3230: v2 = a->a + ii[row+1];
3231: sum1 = b[row];
3232: sum2 = b[row+1];
3233: for (n = 0; n<sz-1; n+=2) {
3234: i1 = idx[0];
3235: i2 = idx[1];
3236: idx += 2;
3237: tmp0 = x[i1];
3238: tmp1 = x[i2];
3239: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3240: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3241: }
3242: if (n == sz-1) {
3243: tmp0 = x[*idx++];
3244: sum1 -= v1[0] * tmp0;
3245: sum2 -= v2[0] * tmp0;
3246: v1++; v2++;
3247: }
3248: t[row] = sum1;
3249: t[row+1] = sum2;
3250: sz = ii[row+1] - diag[row] - 2;
3251: idx = a->j + diag[row] + 2;
3252: v1 += 2;
3253: v2 += 2;
3254: for (n = 0; n<sz-1; n+=2) {
3255: i1 = idx[0];
3256: i2 = idx[1];
3257: idx += 2;
3258: tmp0 = x[i1];
3259: tmp1 = x[i2];
3260: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3261: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3262: }
3263: if (n == sz-1) {
3264: tmp0 = x[*idx];
3265: sum1 -= v1[0] * tmp0;
3266: sum2 -= v2[0] * tmp0;
3267: }
3268: x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3269: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3270: break;
3271: case 3:
3272: v2 = a->a + ii[row+1];
3273: v3 = a->a + ii[row+2];
3274: sum1 = b[row];
3275: sum2 = b[row+1];
3276: sum3 = b[row+2];
3277: for (n = 0; n<sz-1; n+=2) {
3278: i1 = idx[0];
3279: i2 = idx[1];
3280: idx += 2;
3281: tmp0 = x[i1];
3282: tmp1 = x[i2];
3283: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3284: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3285: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3286: }
3287: if (n == sz-1) {
3288: tmp0 = x[*idx++];
3289: sum1 -= v1[0] * tmp0;
3290: sum2 -= v2[0] * tmp0;
3291: sum3 -= v3[0] * tmp0;
3292: v1++; v2++; v3++;
3293: }
3294: t[row] = sum1;
3295: t[row+1] = sum2;
3296: t[row+2] = sum3;
3297: sz = ii[row+1] - diag[row] - 3;
3298: idx = a->j + diag[row] + 3;
3299: v1 += 3;
3300: v2 += 3;
3301: v3 += 3;
3302: for (n = 0; n<sz-1; n+=2) {
3303: i1 = idx[0];
3304: i2 = idx[1];
3305: idx += 2;
3306: tmp0 = x[i1];
3307: tmp1 = x[i2];
3308: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3309: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3310: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3311: }
3312: if (n == sz-1) {
3313: tmp0 = x[*idx];
3314: sum1 -= v1[0] * tmp0;
3315: sum2 -= v2[0] * tmp0;
3316: sum3 -= v3[0] * tmp0;
3317: }
3318: x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3319: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3320: x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3321: break;
3322: case 4:
3323: v2 = a->a + ii[row+1];
3324: v3 = a->a + ii[row+2];
3325: v4 = a->a + ii[row+3];
3326: sum1 = b[row];
3327: sum2 = b[row+1];
3328: sum3 = b[row+2];
3329: sum4 = b[row+3];
3330: for (n = 0; n<sz-1; n+=2) {
3331: i1 = idx[0];
3332: i2 = idx[1];
3333: idx += 2;
3334: tmp0 = x[i1];
3335: tmp1 = x[i2];
3336: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3337: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3338: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3339: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3340: }
3341: if (n == sz-1) {
3342: tmp0 = x[*idx++];
3343: sum1 -= v1[0] * tmp0;
3344: sum2 -= v2[0] * tmp0;
3345: sum3 -= v3[0] * tmp0;
3346: sum4 -= v4[0] * tmp0;
3347: v1++; v2++; v3++; v4++;
3348: }
3349: t[row] = sum1;
3350: t[row+1] = sum2;
3351: t[row+2] = sum3;
3352: t[row+3] = sum4;
3353: sz = ii[row+1] - diag[row] - 4;
3354: idx = a->j + diag[row] + 4;
3355: v1 += 4;
3356: v2 += 4;
3357: v3 += 4;
3358: v4 += 4;
3359: for (n = 0; n<sz-1; n+=2) {
3360: i1 = idx[0];
3361: i2 = idx[1];
3362: idx += 2;
3363: tmp0 = x[i1];
3364: tmp1 = x[i2];
3365: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3366: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3367: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3368: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3369: }
3370: if (n == sz-1) {
3371: tmp0 = x[*idx];
3372: sum1 -= v1[0] * tmp0;
3373: sum2 -= v2[0] * tmp0;
3374: sum3 -= v3[0] * tmp0;
3375: sum4 -= v4[0] * tmp0;
3376: }
3377: x[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3378: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3379: x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3380: x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3381: break;
3382: case 5:
3383: v2 = a->a + ii[row+1];
3384: v3 = a->a + ii[row+2];
3385: v4 = a->a + ii[row+3];
3386: v5 = a->a + ii[row+4];
3387: sum1 = b[row];
3388: sum2 = b[row+1];
3389: sum3 = b[row+2];
3390: sum4 = b[row+3];
3391: sum5 = b[row+4];
3392: for (n = 0; n<sz-1; n+=2) {
3393: i1 = idx[0];
3394: i2 = idx[1];
3395: idx += 2;
3396: tmp0 = x[i1];
3397: tmp1 = x[i2];
3398: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3399: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3400: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3401: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3402: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3403: }
3404: if (n == sz-1) {
3405: tmp0 = x[*idx++];
3406: sum1 -= v1[0] * tmp0;
3407: sum2 -= v2[0] * tmp0;
3408: sum3 -= v3[0] * tmp0;
3409: sum4 -= v4[0] * tmp0;
3410: sum5 -= v5[0] * tmp0;
3411: v1++; v2++; v3++; v4++; v5++;
3412: }
3413: t[row] = sum1;
3414: t[row+1] = sum2;
3415: t[row+2] = sum3;
3416: t[row+3] = sum4;
3417: t[row+4] = sum5;
3418: sz = ii[row+1] - diag[row] - 5;
3419: idx = a->j + diag[row] + 5;
3420: v1 += 5;
3421: v2 += 5;
3422: v3 += 5;
3423: v4 += 5;
3424: v5 += 5;
3425: for (n = 0; n<sz-1; n+=2) {
3426: i1 = idx[0];
3427: i2 = idx[1];
3428: idx += 2;
3429: tmp0 = x[i1];
3430: tmp1 = x[i2];
3431: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3432: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3433: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3434: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3435: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3436: }
3437: if (n == sz-1) {
3438: tmp0 = x[*idx];
3439: sum1 -= v1[0] * tmp0;
3440: sum2 -= v2[0] * tmp0;
3441: sum3 -= v3[0] * tmp0;
3442: sum4 -= v4[0] * tmp0;
3443: sum5 -= v5[0] * tmp0;
3444: }
3445: x[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3446: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3447: x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3448: x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3449: x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3450: break;
3451: default:
3452: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3453: }
3454: }
3455: xb = t;
3456: PetscLogFlops(2.0*a->nz); /* undercounts diag inverse */
3457: } else xb = b;
3459: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3461: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3462: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3463: ibdiag -= sizes[i]*sizes[i];
3465: /* set RHS */
3466: if (xb == b) {
3467: /* whole (old way) */
3468: sz = ii[row+1] - ii[row];
3469: idx = a->j + ii[row];
3470: switch (sizes[i]) {
3471: case 5:
3472: v5 = a->a + ii[row-4];
3473: case 4: /* fall through */
3474: v4 = a->a + ii[row-3];
3475: case 3:
3476: v3 = a->a + ii[row-2];
3477: case 2:
3478: v2 = a->a + ii[row-1];
3479: case 1:
3480: v1 = a->a + ii[row];
3481: break;
3482: default:
3483: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3484: }
3485: } else {
3486: /* upper, no diag */
3487: sz = ii[row+1] - diag[row] - 1;
3488: idx = a->j + diag[row] + 1;
3489: switch (sizes[i]) {
3490: case 5:
3491: v5 = a->a + diag[row-4] + 5;
3492: case 4: /* fall through */
3493: v4 = a->a + diag[row-3] + 4;
3494: case 3:
3495: v3 = a->a + diag[row-2] + 3;
3496: case 2:
3497: v2 = a->a + diag[row-1] + 2;
3498: case 1:
3499: v1 = a->a + diag[row] + 1;
3500: }
3501: }
3502: /* set sum */
3503: switch (sizes[i]) {
3504: case 5:
3505: sum5 = xb[row-4];
3506: case 4: /* fall through */
3507: sum4 = xb[row-3];
3508: case 3:
3509: sum3 = xb[row-2];
3510: case 2:
3511: sum2 = xb[row-1];
3512: case 1:
3513: /* note that sum1 is associated with the last row */
3514: sum1 = xb[row];
3515: }
3516: /* do sums */
3517: for (n = 0; n<sz-1; n+=2) {
3518: i1 = idx[0];
3519: i2 = idx[1];
3520: idx += 2;
3521: tmp0 = x[i1];
3522: tmp1 = x[i2];
3523: switch (sizes[i]) {
3524: case 5:
3525: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3526: case 4: /* fall through */
3527: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3528: case 3:
3529: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3530: case 2:
3531: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3532: case 1:
3533: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3534: }
3535: }
3536: /* ragged edge */
3537: if (n == sz-1) {
3538: tmp0 = x[*idx];
3539: switch (sizes[i]) {
3540: case 5:
3541: sum5 -= *v5*tmp0;
3542: case 4: /* fall through */
3543: sum4 -= *v4*tmp0;
3544: case 3:
3545: sum3 -= *v3*tmp0;
3546: case 2:
3547: sum2 -= *v2*tmp0;
3548: case 1:
3549: sum1 -= *v1*tmp0;
3550: }
3551: }
3552: /* update */
3553: if (xb == b) {
3554: /* whole (old way) w/ diag */
3555: switch (sizes[i]) {
3556: case 5:
3557: x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3558: x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3559: x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3560: x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3561: x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3562: break;
3563: case 4:
3564: x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3565: x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3566: x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3567: x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3568: break;
3569: case 3:
3570: x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3571: x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3572: x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3573: break;
3574: case 2:
3575: x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3576: x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3577: break;
3578: case 1:
3579: x[row--] += sum1*(*ibdiag);
3580: break;
3581: }
3582: } else {
3583: /* no diag so set = */
3584: switch (sizes[i]) {
3585: case 5:
3586: x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3587: x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3588: x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3589: x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3590: x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3591: break;
3592: case 4:
3593: x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3594: x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3595: x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3596: x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3597: break;
3598: case 3:
3599: x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3600: x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3601: x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3602: break;
3603: case 2:
3604: x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3605: x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3606: break;
3607: case 1:
3608: x[row--] = sum1*(*ibdiag);
3609: break;
3610: }
3611: }
3612: }
3613: if (xb == b) {
3614: PetscLogFlops(2.0*a->nz);
3615: } else {
3616: PetscLogFlops(a->nz); /* assumes 1/2 in upper, undercounts diag inverse */
3617: }
3618: }
3619: }
3620: if (flag & SOR_EISENSTAT) {
3621: /*
3622: Apply (U + D)^-1 where D is now the block diagonal
3623: */
3624: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3625: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3626: ibdiag -= sizes[i]*sizes[i];
3627: sz = ii[row+1] - diag[row] - 1;
3628: v1 = a->a + diag[row] + 1;
3629: idx = a->j + diag[row] + 1;
3630: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3631: switch (sizes[i]) {
3632: case 1:
3634: sum1 = b[row];
3635: for (n = 0; n<sz-1; n+=2) {
3636: i1 = idx[0];
3637: i2 = idx[1];
3638: idx += 2;
3639: tmp0 = x[i1];
3640: tmp1 = x[i2];
3641: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3642: }
3644: if (n == sz-1) {
3645: tmp0 = x[*idx];
3646: sum1 -= *v1*tmp0;
3647: }
3648: x[row] = sum1*(*ibdiag);row--;
3649: break;
3651: case 2:
3653: sum1 = b[row];
3654: sum2 = b[row-1];
3655: /* note that sum1 is associated with the second of the two rows */
3656: v2 = a->a + diag[row-1] + 2;
3657: for (n = 0; n<sz-1; n+=2) {
3658: i1 = idx[0];
3659: i2 = idx[1];
3660: idx += 2;
3661: tmp0 = x[i1];
3662: tmp1 = x[i2];
3663: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3664: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3665: }
3667: if (n == sz-1) {
3668: tmp0 = x[*idx];
3669: sum1 -= *v1*tmp0;
3670: sum2 -= *v2*tmp0;
3671: }
3672: x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
3673: x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3674: row -= 2;
3675: break;
3676: case 3:
3678: sum1 = b[row];
3679: sum2 = b[row-1];
3680: sum3 = b[row-2];
3681: v2 = a->a + diag[row-1] + 2;
3682: v3 = a->a + diag[row-2] + 3;
3683: for (n = 0; n<sz-1; n+=2) {
3684: i1 = idx[0];
3685: i2 = idx[1];
3686: idx += 2;
3687: tmp0 = x[i1];
3688: tmp1 = x[i2];
3689: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3690: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3691: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3692: }
3694: if (n == sz-1) {
3695: tmp0 = x[*idx];
3696: sum1 -= *v1*tmp0;
3697: sum2 -= *v2*tmp0;
3698: sum3 -= *v3*tmp0;
3699: }
3700: x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3701: x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3702: x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3703: row -= 3;
3704: break;
3705: case 4:
3707: sum1 = b[row];
3708: sum2 = b[row-1];
3709: sum3 = b[row-2];
3710: sum4 = b[row-3];
3711: v2 = a->a + diag[row-1] + 2;
3712: v3 = a->a + diag[row-2] + 3;
3713: v4 = a->a + diag[row-3] + 4;
3714: for (n = 0; n<sz-1; n+=2) {
3715: i1 = idx[0];
3716: i2 = idx[1];
3717: idx += 2;
3718: tmp0 = x[i1];
3719: tmp1 = x[i2];
3720: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3721: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3722: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3723: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3724: }
3726: if (n == sz-1) {
3727: tmp0 = x[*idx];
3728: sum1 -= *v1*tmp0;
3729: sum2 -= *v2*tmp0;
3730: sum3 -= *v3*tmp0;
3731: sum4 -= *v4*tmp0;
3732: }
3733: x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3734: x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3735: x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3736: x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3737: row -= 4;
3738: break;
3739: case 5:
3741: sum1 = b[row];
3742: sum2 = b[row-1];
3743: sum3 = b[row-2];
3744: sum4 = b[row-3];
3745: sum5 = b[row-4];
3746: v2 = a->a + diag[row-1] + 2;
3747: v3 = a->a + diag[row-2] + 3;
3748: v4 = a->a + diag[row-3] + 4;
3749: v5 = a->a + diag[row-4] + 5;
3750: for (n = 0; n<sz-1; n+=2) {
3751: i1 = idx[0];
3752: i2 = idx[1];
3753: idx += 2;
3754: tmp0 = x[i1];
3755: tmp1 = x[i2];
3756: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3757: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3758: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3759: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3760: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3761: }
3763: if (n == sz-1) {
3764: tmp0 = x[*idx];
3765: sum1 -= *v1*tmp0;
3766: sum2 -= *v2*tmp0;
3767: sum3 -= *v3*tmp0;
3768: sum4 -= *v4*tmp0;
3769: sum5 -= *v5*tmp0;
3770: }
3771: x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3772: x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3773: x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3774: x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3775: x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3776: row -= 5;
3777: break;
3778: default:
3779: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3780: }
3781: }
3782: PetscLogFlops(a->nz);
3784: /*
3785: t = b - D x where D is the block diagonal
3786: */
3787: cnt = 0;
3788: for (i=0, row=0; i<m; i++) {
3789: switch (sizes[i]) {
3790: case 1:
3791: t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3792: break;
3793: case 2:
3794: x1 = x[row]; x2 = x[row+1];
3795: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3796: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3797: t[row] = b[row] - tmp1;
3798: t[row+1] = b[row+1] - tmp2; row += 2;
3799: cnt += 4;
3800: break;
3801: case 3:
3802: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2];
3803: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3804: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3805: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3806: t[row] = b[row] - tmp1;
3807: t[row+1] = b[row+1] - tmp2;
3808: t[row+2] = b[row+2] - tmp3; row += 3;
3809: cnt += 9;
3810: break;
3811: case 4:
3812: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3813: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3814: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3815: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3816: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3817: t[row] = b[row] - tmp1;
3818: t[row+1] = b[row+1] - tmp2;
3819: t[row+2] = b[row+2] - tmp3;
3820: t[row+3] = b[row+3] - tmp4; row += 4;
3821: cnt += 16;
3822: break;
3823: case 5:
3824: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3825: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3826: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3827: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3828: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3829: tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3830: t[row] = b[row] - tmp1;
3831: t[row+1] = b[row+1] - tmp2;
3832: t[row+2] = b[row+2] - tmp3;
3833: t[row+3] = b[row+3] - tmp4;
3834: t[row+4] = b[row+4] - tmp5;row += 5;
3835: cnt += 25;
3836: break;
3837: default:
3838: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3839: }
3840: }
3841: PetscLogFlops(m);
3845: /*
3846: Apply (L + D)^-1 where D is the block diagonal
3847: */
3848: for (i=0, row=0; i<m; i++) {
3849: sz = diag[row] - ii[row];
3850: v1 = a->a + ii[row];
3851: idx = a->j + ii[row];
3852: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3853: switch (sizes[i]) {
3854: case 1:
3856: sum1 = t[row];
3857: for (n = 0; n<sz-1; n+=2) {
3858: i1 = idx[0];
3859: i2 = idx[1];
3860: idx += 2;
3861: tmp0 = t[i1];
3862: tmp1 = t[i2];
3863: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3864: }
3866: if (n == sz-1) {
3867: tmp0 = t[*idx];
3868: sum1 -= *v1 * tmp0;
3869: }
3870: x[row] += t[row] = sum1*(*ibdiag++); row++;
3871: break;
3872: case 2:
3873: v2 = a->a + ii[row+1];
3874: sum1 = t[row];
3875: sum2 = t[row+1];
3876: for (n = 0; n<sz-1; n+=2) {
3877: i1 = idx[0];
3878: i2 = idx[1];
3879: idx += 2;
3880: tmp0 = t[i1];
3881: tmp1 = t[i2];
3882: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3883: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3884: }
3886: if (n == sz-1) {
3887: tmp0 = t[*idx];
3888: sum1 -= v1[0] * tmp0;
3889: sum2 -= v2[0] * tmp0;
3890: }
3891: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3892: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3893: ibdiag += 4; row += 2;
3894: break;
3895: case 3:
3896: v2 = a->a + ii[row+1];
3897: v3 = a->a + ii[row+2];
3898: sum1 = t[row];
3899: sum2 = t[row+1];
3900: sum3 = t[row+2];
3901: for (n = 0; n<sz-1; n+=2) {
3902: i1 = idx[0];
3903: i2 = idx[1];
3904: idx += 2;
3905: tmp0 = t[i1];
3906: tmp1 = t[i2];
3907: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3908: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3909: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3910: }
3912: if (n == sz-1) {
3913: tmp0 = t[*idx];
3914: sum1 -= v1[0] * tmp0;
3915: sum2 -= v2[0] * tmp0;
3916: sum3 -= v3[0] * tmp0;
3917: }
3918: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3919: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3920: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3921: ibdiag += 9; row += 3;
3922: break;
3923: case 4:
3924: v2 = a->a + ii[row+1];
3925: v3 = a->a + ii[row+2];
3926: v4 = a->a + ii[row+3];
3927: sum1 = t[row];
3928: sum2 = t[row+1];
3929: sum3 = t[row+2];
3930: sum4 = t[row+3];
3931: for (n = 0; n<sz-1; n+=2) {
3932: i1 = idx[0];
3933: i2 = idx[1];
3934: idx += 2;
3935: tmp0 = t[i1];
3936: tmp1 = t[i2];
3937: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3938: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3939: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3940: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3941: }
3943: if (n == sz-1) {
3944: tmp0 = t[*idx];
3945: sum1 -= v1[0] * tmp0;
3946: sum2 -= v2[0] * tmp0;
3947: sum3 -= v3[0] * tmp0;
3948: sum4 -= v4[0] * tmp0;
3949: }
3950: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3951: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3952: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3953: x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3954: ibdiag += 16; row += 4;
3955: break;
3956: case 5:
3957: v2 = a->a + ii[row+1];
3958: v3 = a->a + ii[row+2];
3959: v4 = a->a + ii[row+3];
3960: v5 = a->a + ii[row+4];
3961: sum1 = t[row];
3962: sum2 = t[row+1];
3963: sum3 = t[row+2];
3964: sum4 = t[row+3];
3965: sum5 = t[row+4];
3966: for (n = 0; n<sz-1; n+=2) {
3967: i1 = idx[0];
3968: i2 = idx[1];
3969: idx += 2;
3970: tmp0 = t[i1];
3971: tmp1 = t[i2];
3972: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3973: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3974: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3975: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3976: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3977: }
3979: if (n == sz-1) {
3980: tmp0 = t[*idx];
3981: sum1 -= v1[0] * tmp0;
3982: sum2 -= v2[0] * tmp0;
3983: sum3 -= v3[0] * tmp0;
3984: sum4 -= v4[0] * tmp0;
3985: sum5 -= v5[0] * tmp0;
3986: }
3987: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3988: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3989: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3990: x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3991: x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3992: ibdiag += 25; row += 5;
3993: break;
3994: default:
3995: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3996: }
3997: }
3998: PetscLogFlops(a->nz);
3999: }
4000: VecRestoreArray(xx,&x);
4001: VecRestoreArrayRead(bb,&b);
4002: return(0);
4003: }
4007: PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
4008: {
4009: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4010: PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
4011: const MatScalar *bdiag = a->inode.bdiag;
4012: const PetscScalar *b;
4013: PetscErrorCode ierr;
4014: PetscInt m = a->inode.node_count,cnt = 0,i,row;
4015: const PetscInt *sizes = a->inode.size;
4018: VecGetArray(xx,&x);
4019: VecGetArrayRead(bb,&b);
4020: cnt = 0;
4021: for (i=0, row=0; i<m; i++) {
4022: switch (sizes[i]) {
4023: case 1:
4024: x[row] = b[row]*bdiag[cnt++];row++;
4025: break;
4026: case 2:
4027: x1 = b[row]; x2 = b[row+1];
4028: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
4029: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
4030: x[row++] = tmp1;
4031: x[row++] = tmp2;
4032: cnt += 4;
4033: break;
4034: case 3:
4035: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2];
4036: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
4037: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
4038: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
4039: x[row++] = tmp1;
4040: x[row++] = tmp2;
4041: x[row++] = tmp3;
4042: cnt += 9;
4043: break;
4044: case 4:
4045: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
4046: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
4047: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
4048: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
4049: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
4050: x[row++] = tmp1;
4051: x[row++] = tmp2;
4052: x[row++] = tmp3;
4053: x[row++] = tmp4;
4054: cnt += 16;
4055: break;
4056: case 5:
4057: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
4058: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
4059: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
4060: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
4061: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4062: tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4063: x[row++] = tmp1;
4064: x[row++] = tmp2;
4065: x[row++] = tmp3;
4066: x[row++] = tmp4;
4067: x[row++] = tmp5;
4068: cnt += 25;
4069: break;
4070: default:
4071: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
4072: }
4073: }
4074: PetscLogFlops(2*cnt);
4075: VecRestoreArray(xx,&x);
4076: VecRestoreArrayRead(bb,&b);
4077: return(0);
4078: }
4080: /*
4081: samestructure indicates that the matrix has not changed its nonzero structure so we
4082: do not need to recompute the inodes
4083: */
4086: PetscErrorCode MatSeqAIJCheckInode(Mat A)
4087: {
4088: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4090: PetscInt i,j,m,nzx,nzy,*ns,node_count,blk_size;
4091: PetscBool flag;
4092: const PetscInt *idx,*idy,*ii;
4095: if (!a->inode.use) return(0);
4096: if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) return(0);
4098: m = A->rmap->n;
4099: if (a->inode.size) ns = a->inode.size;
4100: else {
4101: PetscMalloc1(m+1,&ns);
4102: }
4104: i = 0;
4105: node_count = 0;
4106: idx = a->j;
4107: ii = a->i;
4108: while (i < m) { /* For each row */
4109: nzx = ii[i+1] - ii[i]; /* Number of nonzeros */
4110: /* Limits the number of elements in a node to 'a->inode.limit' */
4111: for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4112: nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */
4113: if (nzy != nzx) break;
4114: idy += nzx; /* Same nonzero pattern */
4115: PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);
4116: if (!flag) break;
4117: }
4118: ns[node_count++] = blk_size;
4119: idx += blk_size*nzx;
4120: i = j;
4121: }
4122: /* If not enough inodes found,, do not use inode version of the routines */
4123: if (!m || node_count > .8*m) {
4124: PetscFree(ns);
4126: a->inode.node_count = 0;
4127: a->inode.size = NULL;
4128: a->inode.use = PETSC_FALSE;
4129: A->ops->mult = MatMult_SeqAIJ;
4130: A->ops->sor = MatSOR_SeqAIJ;
4131: A->ops->multadd = MatMultAdd_SeqAIJ;
4132: A->ops->getrowij = MatGetRowIJ_SeqAIJ;
4133: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ;
4134: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ;
4135: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ;
4136: A->ops->coloringpatch = 0;
4137: A->ops->multdiagonalblock = 0;
4139: PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4140: } else {
4141: if (!A->factortype) {
4142: A->ops->mult = MatMult_SeqAIJ_Inode;
4143: A->ops->sor = MatSOR_SeqAIJ_Inode;
4144: A->ops->multadd = MatMultAdd_SeqAIJ_Inode;
4145: A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4146: if (A->rmap->n == A->cmap->n) {
4147: A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4148: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4149: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4150: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4151: A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4152: }
4153: } else {
4154: A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4155: }
4156: a->inode.node_count = node_count;
4157: a->inode.size = ns;
4158: PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4159: }
4160: a->inode.checked = PETSC_TRUE;
4161: a->inode.mat_nonzerostate = A->nonzerostate;
4162: return(0);
4163: }
4167: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4168: {
4169: Mat B =*C;
4170: Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4172: PetscInt m=A->rmap->n;
4175: c->inode.use = a->inode.use;
4176: c->inode.limit = a->inode.limit;
4177: c->inode.max_limit = a->inode.max_limit;
4178: if (a->inode.size) {
4179: PetscMalloc1(m+1,&c->inode.size);
4180: c->inode.node_count = a->inode.node_count;
4181: PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));
4182: /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4183: if (!B->factortype) {
4184: B->ops->mult = MatMult_SeqAIJ_Inode;
4185: B->ops->sor = MatSOR_SeqAIJ_Inode;
4186: B->ops->multadd = MatMultAdd_SeqAIJ_Inode;
4187: B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4188: B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4189: B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4190: B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4191: B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4192: B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4193: } else {
4194: B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4195: }
4196: } else {
4197: c->inode.size = 0;
4198: c->inode.node_count = 0;
4199: }
4200: c->inode.ibdiagvalid = PETSC_FALSE;
4201: c->inode.ibdiag = 0;
4202: c->inode.bdiag = 0;
4203: return(0);
4204: }
4208: PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4209: {
4210: PetscInt k;
4211: const PetscInt *vi;
4214: vi = aj + ai[row];
4215: for (k=0; k<nzl; k++) cols[k] = vi[k];
4216: vi = aj + adiag[row];
4217: cols[nzl] = vi[0];
4218: vi = aj + adiag[row+1]+1;
4219: for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4220: return(0);
4221: }
4222: /*
4223: MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4224: Modified from MatSeqAIJCheckInode().
4226: Input Parameters:
4227: . Mat A - ILU or LU matrix factor
4229: */
4232: PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4233: {
4234: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4236: PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4237: PetscInt *cols1,*cols2,*ns;
4238: const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4239: PetscBool flag;
4242: if (!a->inode.use) return(0);
4243: if (a->inode.checked) return(0);
4245: m = A->rmap->n;
4246: if (a->inode.size) ns = a->inode.size;
4247: else {
4248: PetscMalloc1(m+1,&ns);
4249: }
4251: i = 0;
4252: node_count = 0;
4253: PetscMalloc2(m,&cols1,m,&cols2);
4254: while (i < m) { /* For each row */
4255: nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */
4256: nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4257: nzx = nzl1 + nzu1 + 1;
4258: MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4260: /* Limits the number of elements in a node to 'a->inode.limit' */
4261: for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4262: nzl2 = ai[j+1] - ai[j];
4263: nzu2 = adiag[j] - adiag[j+1] - 1;
4264: nzy = nzl2 + nzu2 + 1;
4265: if (nzy != nzx) break;
4266: MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
4267: PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);
4268: if (!flag) break;
4269: }
4270: ns[node_count++] = blk_size;
4271: i = j;
4272: }
4273: PetscFree2(cols1,cols2);
4274: /* If not enough inodes found,, do not use inode version of the routines */
4275: if (!m || node_count > .8*m) {
4276: PetscFree(ns);
4278: a->inode.node_count = 0;
4279: a->inode.size = NULL;
4280: a->inode.use = PETSC_FALSE;
4282: PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4283: } else {
4284: A->ops->mult = 0;
4285: A->ops->sor = 0;
4286: A->ops->multadd = 0;
4287: A->ops->getrowij = 0;
4288: A->ops->restorerowij = 0;
4289: A->ops->getcolumnij = 0;
4290: A->ops->restorecolumnij = 0;
4291: A->ops->coloringpatch = 0;
4292: A->ops->multdiagonalblock = 0;
4293: a->inode.node_count = node_count;
4294: a->inode.size = ns;
4296: PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4297: }
4298: a->inode.checked = PETSC_TRUE;
4299: return(0);
4300: }
4304: PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4305: {
4306: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4309: a->inode.ibdiagvalid = PETSC_FALSE;
4310: return(0);
4311: }
4313: /*
4314: This is really ugly. if inodes are used this replaces the
4315: permutations with ones that correspond to rows/cols of the matrix
4316: rather then inode blocks
4317: */
4320: PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4321: {
4325: PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));
4326: return(0);
4327: }
4331: PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4332: {
4333: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4335: PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4336: const PetscInt *ridx,*cidx;
4337: PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx;
4338: PetscInt nslim_col,*ns_col;
4339: IS ris = *rperm,cis = *cperm;
4342: if (!a->inode.size) return(0); /* no inodes so return */
4343: if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */
4345: Mat_CreateColInode(A,&nslim_col,&ns_col);
4346: PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);
4347: PetscMalloc2(m,&permr,n,&permc);
4349: ISGetIndices(ris,&ridx);
4350: ISGetIndices(cis,&cidx);
4352: /* Form the inode structure for the rows of permuted matric using inv perm*/
4353: for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4355: /* Construct the permutations for rows*/
4356: for (i=0,row = 0; i<nslim_row; ++i) {
4357: indx = ridx[i];
4358: start_val = tns[indx];
4359: end_val = tns[indx + 1];
4360: for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4361: }
4363: /* Form the inode structure for the columns of permuted matrix using inv perm*/
4364: for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4366: /* Construct permutations for columns */
4367: for (i=0,col=0; i<nslim_col; ++i) {
4368: indx = cidx[i];
4369: start_val = tns[indx];
4370: end_val = tns[indx + 1];
4371: for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4372: }
4374: ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);
4375: ISSetPermutation(*rperm);
4376: ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);
4377: ISSetPermutation(*cperm);
4379: ISRestoreIndices(ris,&ridx);
4380: ISRestoreIndices(cis,&cidx);
4382: PetscFree(ns_col);
4383: PetscFree2(permr,permc);
4384: ISDestroy(&cis);
4385: ISDestroy(&ris);
4386: PetscFree(tns);
4387: return(0);
4388: }
4392: /*@C
4393: MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4395: Not Collective
4397: Input Parameter:
4398: . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4400: Output Parameter:
4401: + node_count - no of inodes present in the matrix.
4402: . sizes - an array of size node_count,with sizes of each inode.
4403: - limit - the max size used to generate the inodes.
4405: Level: advanced
4407: Notes: This routine returns some internal storage information
4408: of the matrix, it is intended to be used by advanced users.
4409: It should be called after the matrix is assembled.
4410: The contents of the sizes[] array should not be changed.
4411: NULL may be passed for information not requested.
4413: .keywords: matrix, seqaij, get, inode
4415: .seealso: MatGetInfo()
4416: @*/
4417: PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4418: {
4419: PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4422: if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4423: PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);
4424: if (f) {
4425: (*f)(A,node_count,sizes,limit);
4426: }
4427: return(0);
4428: }
4432: PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4433: {
4434: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4437: if (node_count) *node_count = a->inode.node_count;
4438: if (sizes) *sizes = a->inode.size;
4439: if (limit) *limit = a->inode.limit;
4440: return(0);
4441: }