Actual source code: inode.c
petsc-3.14.6 2021-03-30
2: /*
3: This file provides high performance routines for the Inode format (compressed sparse row)
4: by taking advantage of rows with identical nonzero structure (I-nodes).
5: */
6: #include <../src/mat/impls/aij/seq/aij.h>
8: static PetscErrorCode MatCreateColInode_Private(Mat A,PetscInt *size,PetscInt **ns)
9: {
10: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
12: PetscInt i,count,m,n,min_mn,*ns_row,*ns_col;
15: n = A->cmap->n;
16: m = A->rmap->n;
17: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
18: ns_row = a->inode.size;
20: min_mn = (m < n) ? m : n;
21: if (!ns) {
22: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
23: for (; count+1 < n; count++,i++) ;
24: if (count < n) {
25: i++;
26: }
27: *size = i;
28: return(0);
29: }
30: PetscMalloc1(n+1,&ns_col);
32: /* Use the same row structure wherever feasible. */
33: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
34: ns_col[i] = ns_row[i];
35: }
37: /* if m < n; pad up the remainder with inode_limit */
38: for (; count+1 < n; count++,i++) {
39: ns_col[i] = 1;
40: }
41: /* The last node is the odd ball. padd it up with the remaining rows; */
42: if (count < n) {
43: ns_col[i] = n - count;
44: i++;
45: } else if (count > n) {
46: /* Adjust for the over estimation */
47: ns_col[i-1] += n - count;
48: }
49: *size = i;
50: *ns = ns_col;
51: return(0);
52: }
55: /*
56: This builds symmetric version of nonzero structure,
57: */
58: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
59: {
60: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
62: PetscInt *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
63: PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
64: const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;
67: nslim_row = a->inode.node_count;
68: m = A->rmap->n;
69: n = A->cmap->n;
70: if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
71: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
73: /* Use the row_inode as column_inode */
74: nslim_col = nslim_row;
75: ns_col = ns_row;
77: /* allocate space for reformated inode structure */
78: PetscMalloc2(nslim_col+1,&tns,n+1,&tvc);
79: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
81: for (i1=0,col=0; i1<nslim_col; ++i1) {
82: nsz = ns_col[i1];
83: for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
84: }
85: /* allocate space for row pointers */
86: PetscCalloc1(nslim_row+1,&ia);
87: *iia = ia;
88: PetscMalloc1(nslim_row+1,&work);
90: /* determine the number of columns in each row */
91: ia[0] = oshift;
92: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
94: j = aj + ai[row] + ishift;
95: jmax = aj + ai[row+1] + ishift;
96: if (j==jmax) continue; /* empty row */
97: col = *j++ + ishift;
98: i2 = tvc[col];
99: while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
100: ia[i1+1]++;
101: ia[i2+1]++;
102: i2++; /* Start col of next node */
103: while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
104: i2 = tvc[col];
105: }
106: if (i2 == i1) ia[i2+1]++; /* now the diagonal element */
107: }
109: /* shift ia[i] to point to next row */
110: for (i1=1; i1<nslim_row+1; i1++) {
111: row = ia[i1-1];
112: ia[i1] += row;
113: work[i1-1] = row - oshift;
114: }
116: /* allocate space for column pointers */
117: nz = ia[nslim_row] + (!ishift);
118: PetscMalloc1(nz,&ja);
119: *jja = ja;
121: /* loop over lower triangular part putting into ja */
122: for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
123: j = aj + ai[row] + ishift;
124: jmax = aj + ai[row+1] + ishift;
125: if (j==jmax) continue; /* empty row */
126: col = *j++ + ishift;
127: i2 = tvc[col];
128: while (i2<i1 && j<jmax) {
129: ja[work[i2]++] = i1 + oshift;
130: ja[work[i1]++] = i2 + oshift;
131: ++i2;
132: while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
133: i2 = tvc[col];
134: }
135: if (i2 == i1) ja[work[i1]++] = i2 + oshift;
137: }
138: PetscFree(work);
139: PetscFree2(tns,tvc);
140: return(0);
141: }
143: /*
144: This builds nonsymmetric version of nonzero structure,
145: */
146: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
147: {
148: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
150: PetscInt *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
151: PetscInt *tns,*tvc,nsz,i1,i2;
152: const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;
155: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
156: nslim_row = a->inode.node_count;
157: n = A->cmap->n;
159: /* Create The column_inode for this matrix */
160: MatCreateColInode_Private(A,&nslim_col,&ns_col);
162: /* allocate space for reformated column_inode structure */
163: PetscMalloc2(nslim_col +1,&tns,n + 1,&tvc);
164: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
166: for (i1=0,col=0; i1<nslim_col; ++i1) {
167: nsz = ns_col[i1];
168: for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
169: }
170: /* allocate space for row pointers */
171: PetscCalloc1(nslim_row+1,&ia);
172: *iia = ia;
173: PetscMalloc1(nslim_row+1,&work);
175: /* determine the number of columns in each row */
176: ia[0] = oshift;
177: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
178: j = aj + ai[row] + ishift;
179: nz = ai[row+1] - ai[row];
180: if (!nz) continue; /* empty row */
181: col = *j++ + ishift;
182: i2 = tvc[col];
183: while (nz-- > 0) { /* off-diagonal elemets */
184: ia[i1+1]++;
185: i2++; /* Start col of next node */
186: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
187: if (nz > 0) i2 = tvc[col];
188: }
189: }
191: /* shift ia[i] to point to next row */
192: for (i1=1; i1<nslim_row+1; i1++) {
193: row = ia[i1-1];
194: ia[i1] += row;
195: work[i1-1] = row - oshift;
196: }
198: /* allocate space for column pointers */
199: nz = ia[nslim_row] + (!ishift);
200: PetscMalloc1(nz,&ja);
201: *jja = ja;
203: /* loop over matrix putting into ja */
204: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
205: j = aj + ai[row] + ishift;
206: nz = ai[row+1] - ai[row];
207: if (!nz) continue; /* empty row */
208: col = *j++ + ishift;
209: i2 = tvc[col];
210: while (nz-- > 0) {
211: ja[work[i1]++] = i2 + oshift;
212: ++i2;
213: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
214: if (nz > 0) i2 = tvc[col];
215: }
216: }
217: PetscFree(ns_col);
218: PetscFree(work);
219: PetscFree2(tns,tvc);
220: return(0);
221: }
223: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
224: {
225: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
229: *n = a->inode.node_count;
230: if (!ia) return(0);
231: if (!blockcompressed) {
232: MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);
233: } else if (symmetric) {
234: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
235: } else {
236: MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
237: }
238: return(0);
239: }
241: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
242: {
246: if (!ia) return(0);
248: if (!blockcompressed) {
249: MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);
250: } else {
251: PetscFree(*ia);
252: PetscFree(*ja);
253: }
254: return(0);
255: }
257: /* ----------------------------------------------------------- */
259: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
260: {
261: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
263: PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
264: PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
267: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
268: nslim_row = a->inode.node_count;
269: n = A->cmap->n;
271: /* Create The column_inode for this matrix */
272: MatCreateColInode_Private(A,&nslim_col,&ns_col);
274: /* allocate space for reformated column_inode structure */
275: PetscMalloc2(nslim_col + 1,&tns,n + 1,&tvc);
276: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
278: for (i1=0,col=0; i1<nslim_col; ++i1) {
279: nsz = ns_col[i1];
280: for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
281: }
282: /* allocate space for column pointers */
283: PetscCalloc1(nslim_col+1,&ia);
284: *iia = ia;
285: PetscMalloc1(nslim_col+1,&work);
287: /* determine the number of columns in each row */
288: ia[0] = oshift;
289: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
290: j = aj + ai[row] + ishift;
291: col = *j++ + ishift;
292: i2 = tvc[col];
293: nz = ai[row+1] - ai[row];
294: while (nz-- > 0) { /* off-diagonal elemets */
295: /* ia[i1+1]++; */
296: ia[i2+1]++;
297: i2++;
298: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
299: if (nz > 0) i2 = tvc[col];
300: }
301: }
303: /* shift ia[i] to point to next col */
304: for (i1=1; i1<nslim_col+1; i1++) {
305: col = ia[i1-1];
306: ia[i1] += col;
307: work[i1-1] = col - oshift;
308: }
310: /* allocate space for column pointers */
311: nz = ia[nslim_col] + (!ishift);
312: PetscMalloc1(nz,&ja);
313: *jja = ja;
315: /* loop over matrix putting into ja */
316: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
317: j = aj + ai[row] + ishift;
318: col = *j++ + ishift;
319: i2 = tvc[col];
320: nz = ai[row+1] - ai[row];
321: while (nz-- > 0) {
322: /* ja[work[i1]++] = i2 + oshift; */
323: ja[work[i2]++] = i1 + oshift;
324: i2++;
325: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
326: if (nz > 0) i2 = tvc[col];
327: }
328: }
329: PetscFree(ns_col);
330: PetscFree(work);
331: PetscFree2(tns,tvc);
332: return(0);
333: }
335: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
336: {
340: MatCreateColInode_Private(A,n,NULL);
341: if (!ia) return(0);
343: if (!blockcompressed) {
344: MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);
345: } else if (symmetric) {
346: /* Since the indices are symmetric it does'nt matter */
347: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
348: } else {
349: MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
350: }
351: return(0);
352: }
354: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
355: {
359: if (!ia) return(0);
360: if (!blockcompressed) {
361: MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);
362: } else {
363: PetscFree(*ia);
364: PetscFree(*ja);
365: }
366: return(0);
367: }
369: /* ----------------------------------------------------------- */
371: PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
372: {
373: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
374: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
375: PetscScalar *y;
376: const PetscScalar *x;
377: const MatScalar *v1,*v2,*v3,*v4,*v5;
378: PetscErrorCode ierr;
379: PetscInt i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
380: const PetscInt *idx,*ns,*ii;
382: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
383: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
384: #endif
387: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
388: node_max = a->inode.node_count;
389: ns = a->inode.size; /* Node Size array */
390: VecGetArrayRead(xx,&x);
391: VecGetArray(yy,&y);
392: idx = a->j;
393: v1 = a->a;
394: ii = a->i;
396: for (i = 0,row = 0; i< node_max; ++i) {
397: nsz = ns[i];
398: n = ii[1] - ii[0];
399: nonzerorow += (n>0)*nsz;
400: ii += nsz;
401: PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */
402: PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */
403: sz = n; /* No of non zeros in this row */
404: /* Switch on the size of Node */
405: switch (nsz) { /* Each loop in 'case' is unrolled */
406: case 1:
407: sum1 = 0.;
409: for (n = 0; n< sz-1; n+=2) {
410: i1 = idx[0]; /* The instructions are ordered to */
411: i2 = idx[1]; /* make the compiler's job easy */
412: idx += 2;
413: tmp0 = x[i1];
414: tmp1 = x[i2];
415: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
416: }
418: if (n == sz-1) { /* Take care of the last nonzero */
419: tmp0 = x[*idx++];
420: sum1 += *v1++ *tmp0;
421: }
422: y[row++]=sum1;
423: break;
424: case 2:
425: sum1 = 0.;
426: sum2 = 0.;
427: v2 = v1 + n;
429: for (n = 0; n< sz-1; n+=2) {
430: i1 = idx[0];
431: i2 = idx[1];
432: idx += 2;
433: tmp0 = x[i1];
434: tmp1 = x[i2];
435: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
436: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
437: }
438: if (n == sz-1) {
439: tmp0 = x[*idx++];
440: sum1 += *v1++ * tmp0;
441: sum2 += *v2++ * tmp0;
442: }
443: y[row++]=sum1;
444: y[row++]=sum2;
445: v1 =v2; /* Since the next block to be processed starts there*/
446: idx +=sz;
447: break;
448: case 3:
449: sum1 = 0.;
450: sum2 = 0.;
451: sum3 = 0.;
452: v2 = v1 + n;
453: v3 = v2 + n;
455: for (n = 0; n< sz-1; n+=2) {
456: i1 = idx[0];
457: i2 = idx[1];
458: idx += 2;
459: tmp0 = x[i1];
460: tmp1 = x[i2];
461: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
462: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
463: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
464: }
465: if (n == sz-1) {
466: tmp0 = x[*idx++];
467: sum1 += *v1++ * tmp0;
468: sum2 += *v2++ * tmp0;
469: sum3 += *v3++ * tmp0;
470: }
471: y[row++]=sum1;
472: y[row++]=sum2;
473: y[row++]=sum3;
474: v1 =v3; /* Since the next block to be processed starts there*/
475: idx +=2*sz;
476: break;
477: case 4:
478: sum1 = 0.;
479: sum2 = 0.;
480: sum3 = 0.;
481: sum4 = 0.;
482: v2 = v1 + n;
483: v3 = v2 + n;
484: v4 = v3 + n;
486: for (n = 0; n< sz-1; n+=2) {
487: i1 = idx[0];
488: i2 = idx[1];
489: idx += 2;
490: tmp0 = x[i1];
491: tmp1 = x[i2];
492: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
493: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
494: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
495: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
496: }
497: if (n == sz-1) {
498: tmp0 = x[*idx++];
499: sum1 += *v1++ * tmp0;
500: sum2 += *v2++ * tmp0;
501: sum3 += *v3++ * tmp0;
502: sum4 += *v4++ * tmp0;
503: }
504: y[row++]=sum1;
505: y[row++]=sum2;
506: y[row++]=sum3;
507: y[row++]=sum4;
508: v1 =v4; /* Since the next block to be processed starts there*/
509: idx +=3*sz;
510: break;
511: case 5:
512: sum1 = 0.;
513: sum2 = 0.;
514: sum3 = 0.;
515: sum4 = 0.;
516: sum5 = 0.;
517: v2 = v1 + n;
518: v3 = v2 + n;
519: v4 = v3 + n;
520: v5 = v4 + n;
522: for (n = 0; n<sz-1; n+=2) {
523: i1 = idx[0];
524: i2 = idx[1];
525: idx += 2;
526: tmp0 = x[i1];
527: tmp1 = x[i2];
528: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
529: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
530: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
531: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
532: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
533: }
534: if (n == sz-1) {
535: tmp0 = x[*idx++];
536: sum1 += *v1++ * tmp0;
537: sum2 += *v2++ * tmp0;
538: sum3 += *v3++ * tmp0;
539: sum4 += *v4++ * tmp0;
540: sum5 += *v5++ * tmp0;
541: }
542: y[row++]=sum1;
543: y[row++]=sum2;
544: y[row++]=sum3;
545: y[row++]=sum4;
546: y[row++]=sum5;
547: v1 =v5; /* Since the next block to be processed starts there */
548: idx +=4*sz;
549: break;
550: default:
551: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
552: }
553: }
554: VecRestoreArrayRead(xx,&x);
555: VecRestoreArray(yy,&y);
556: PetscLogFlops(2.0*a->nz - nonzerorow);
557: return(0);
558: }
559: /* ----------------------------------------------------------- */
560: /* Almost same code as the MatMult_SeqAIJ_Inode() */
561: PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
562: {
563: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
564: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
565: const MatScalar *v1,*v2,*v3,*v4,*v5;
566: const PetscScalar *x;
567: PetscScalar *y,*z,*zt;
568: PetscErrorCode ierr;
569: PetscInt i1,i2,n,i,row,node_max,nsz,sz;
570: const PetscInt *idx,*ns,*ii;
573: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
574: node_max = a->inode.node_count;
575: ns = a->inode.size; /* Node Size array */
577: VecGetArrayRead(xx,&x);
578: VecGetArrayPair(zz,yy,&z,&y);
579: zt = z;
581: idx = a->j;
582: v1 = a->a;
583: ii = a->i;
585: for (i = 0,row = 0; i< node_max; ++i) {
586: nsz = ns[i];
587: n = ii[1] - ii[0];
588: ii += nsz;
589: sz = n; /* No of non zeros in this row */
590: /* Switch on the size of Node */
591: switch (nsz) { /* Each loop in 'case' is unrolled */
592: case 1:
593: sum1 = *zt++;
595: for (n = 0; n< sz-1; n+=2) {
596: i1 = idx[0]; /* The instructions are ordered to */
597: i2 = idx[1]; /* make the compiler's job easy */
598: idx += 2;
599: tmp0 = x[i1];
600: tmp1 = x[i2];
601: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
602: }
604: if (n == sz-1) { /* Take care of the last nonzero */
605: tmp0 = x[*idx++];
606: sum1 += *v1++ * tmp0;
607: }
608: y[row++]=sum1;
609: break;
610: case 2:
611: sum1 = *zt++;
612: sum2 = *zt++;
613: v2 = v1 + n;
615: for (n = 0; n< sz-1; n+=2) {
616: i1 = idx[0];
617: i2 = idx[1];
618: idx += 2;
619: tmp0 = x[i1];
620: tmp1 = x[i2];
621: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
622: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
623: }
624: if (n == sz-1) {
625: tmp0 = x[*idx++];
626: sum1 += *v1++ * tmp0;
627: sum2 += *v2++ * tmp0;
628: }
629: y[row++]=sum1;
630: y[row++]=sum2;
631: v1 =v2; /* Since the next block to be processed starts there*/
632: idx +=sz;
633: break;
634: case 3:
635: sum1 = *zt++;
636: sum2 = *zt++;
637: sum3 = *zt++;
638: v2 = v1 + n;
639: v3 = v2 + n;
641: for (n = 0; n< sz-1; n+=2) {
642: i1 = idx[0];
643: i2 = idx[1];
644: idx += 2;
645: tmp0 = x[i1];
646: tmp1 = x[i2];
647: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
648: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
649: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
650: }
651: if (n == sz-1) {
652: tmp0 = x[*idx++];
653: sum1 += *v1++ * tmp0;
654: sum2 += *v2++ * tmp0;
655: sum3 += *v3++ * tmp0;
656: }
657: y[row++]=sum1;
658: y[row++]=sum2;
659: y[row++]=sum3;
660: v1 =v3; /* Since the next block to be processed starts there*/
661: idx +=2*sz;
662: break;
663: case 4:
664: sum1 = *zt++;
665: sum2 = *zt++;
666: sum3 = *zt++;
667: sum4 = *zt++;
668: v2 = v1 + n;
669: v3 = v2 + n;
670: v4 = v3 + n;
672: for (n = 0; n< sz-1; n+=2) {
673: i1 = idx[0];
674: i2 = idx[1];
675: idx += 2;
676: tmp0 = x[i1];
677: tmp1 = x[i2];
678: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
679: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
680: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
681: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
682: }
683: if (n == sz-1) {
684: tmp0 = x[*idx++];
685: sum1 += *v1++ * tmp0;
686: sum2 += *v2++ * tmp0;
687: sum3 += *v3++ * tmp0;
688: sum4 += *v4++ * tmp0;
689: }
690: y[row++]=sum1;
691: y[row++]=sum2;
692: y[row++]=sum3;
693: y[row++]=sum4;
694: v1 =v4; /* Since the next block to be processed starts there*/
695: idx +=3*sz;
696: break;
697: case 5:
698: sum1 = *zt++;
699: sum2 = *zt++;
700: sum3 = *zt++;
701: sum4 = *zt++;
702: sum5 = *zt++;
703: v2 = v1 + n;
704: v3 = v2 + n;
705: v4 = v3 + n;
706: v5 = v4 + n;
708: for (n = 0; n<sz-1; n+=2) {
709: i1 = idx[0];
710: i2 = idx[1];
711: idx += 2;
712: tmp0 = x[i1];
713: tmp1 = x[i2];
714: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
715: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
716: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
717: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
718: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
719: }
720: if (n == sz-1) {
721: tmp0 = x[*idx++];
722: sum1 += *v1++ * tmp0;
723: sum2 += *v2++ * tmp0;
724: sum3 += *v3++ * tmp0;
725: sum4 += *v4++ * tmp0;
726: sum5 += *v5++ * tmp0;
727: }
728: y[row++]=sum1;
729: y[row++]=sum2;
730: y[row++]=sum3;
731: y[row++]=sum4;
732: y[row++]=sum5;
733: v1 =v5; /* Since the next block to be processed starts there */
734: idx +=4*sz;
735: break;
736: default:
737: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
738: }
739: }
740: VecRestoreArrayRead(xx,&x);
741: VecRestoreArrayPair(zz,yy,&z,&y);
742: PetscLogFlops(2.0*a->nz);
743: return(0);
744: }
746: /* ----------------------------------------------------------- */
747: PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
748: {
749: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
750: IS iscol = a->col,isrow = a->row;
751: PetscErrorCode ierr;
752: const PetscInt *r,*c,*rout,*cout;
753: PetscInt i,j,n = A->rmap->n,nz;
754: PetscInt node_max,*ns,row,nsz,aii,i0,i1;
755: const PetscInt *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
756: PetscScalar *x,*tmp,*tmps,tmp0,tmp1;
757: PetscScalar sum1,sum2,sum3,sum4,sum5;
758: const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
759: const PetscScalar *b;
762: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
763: node_max = a->inode.node_count;
764: ns = a->inode.size; /* Node Size array */
766: VecGetArrayRead(bb,&b);
767: VecGetArrayWrite(xx,&x);
768: tmp = a->solve_work;
770: ISGetIndices(isrow,&rout); r = rout;
771: ISGetIndices(iscol,&cout); c = cout + (n-1);
773: /* forward solve the lower triangular */
774: tmps = tmp;
775: aa = a_a;
776: aj = a_j;
777: ad = a->diag;
779: for (i = 0,row = 0; i< node_max; ++i) {
780: nsz = ns[i];
781: aii = ai[row];
782: v1 = aa + aii;
783: vi = aj + aii;
784: nz = ad[row]- aii;
785: if (i < node_max-1) {
786: /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
787: * but our indexing to determine it's size could. */
788: PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
789: /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
790: PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
791: /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
792: }
794: switch (nsz) { /* Each loop in 'case' is unrolled */
795: case 1:
796: sum1 = b[*r++];
797: for (j=0; j<nz-1; j+=2) {
798: i0 = vi[0];
799: i1 = vi[1];
800: vi +=2;
801: tmp0 = tmps[i0];
802: tmp1 = tmps[i1];
803: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
804: }
805: if (j == nz-1) {
806: tmp0 = tmps[*vi++];
807: sum1 -= *v1++ *tmp0;
808: }
809: tmp[row++]=sum1;
810: break;
811: case 2:
812: sum1 = b[*r++];
813: sum2 = b[*r++];
814: v2 = aa + ai[row+1];
816: for (j=0; j<nz-1; j+=2) {
817: i0 = vi[0];
818: i1 = vi[1];
819: vi +=2;
820: tmp0 = tmps[i0];
821: tmp1 = tmps[i1];
822: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
823: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
824: }
825: if (j == nz-1) {
826: tmp0 = tmps[*vi++];
827: sum1 -= *v1++ *tmp0;
828: sum2 -= *v2++ *tmp0;
829: }
830: sum2 -= *v2++ *sum1;
831: tmp[row++]=sum1;
832: tmp[row++]=sum2;
833: break;
834: case 3:
835: sum1 = b[*r++];
836: sum2 = b[*r++];
837: sum3 = b[*r++];
838: v2 = aa + ai[row+1];
839: v3 = aa + ai[row+2];
841: for (j=0; j<nz-1; j+=2) {
842: i0 = vi[0];
843: i1 = vi[1];
844: vi +=2;
845: tmp0 = tmps[i0];
846: tmp1 = tmps[i1];
847: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
848: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
849: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
850: }
851: if (j == nz-1) {
852: tmp0 = tmps[*vi++];
853: sum1 -= *v1++ *tmp0;
854: sum2 -= *v2++ *tmp0;
855: sum3 -= *v3++ *tmp0;
856: }
857: sum2 -= *v2++ * sum1;
858: sum3 -= *v3++ * sum1;
859: sum3 -= *v3++ * sum2;
861: tmp[row++]=sum1;
862: tmp[row++]=sum2;
863: tmp[row++]=sum3;
864: break;
866: case 4:
867: sum1 = b[*r++];
868: sum2 = b[*r++];
869: sum3 = b[*r++];
870: sum4 = b[*r++];
871: v2 = aa + ai[row+1];
872: v3 = aa + ai[row+2];
873: v4 = aa + ai[row+3];
875: for (j=0; j<nz-1; j+=2) {
876: i0 = vi[0];
877: i1 = vi[1];
878: vi +=2;
879: tmp0 = tmps[i0];
880: tmp1 = tmps[i1];
881: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
882: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
883: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
884: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
885: }
886: if (j == nz-1) {
887: tmp0 = tmps[*vi++];
888: sum1 -= *v1++ *tmp0;
889: sum2 -= *v2++ *tmp0;
890: sum3 -= *v3++ *tmp0;
891: sum4 -= *v4++ *tmp0;
892: }
893: sum2 -= *v2++ * sum1;
894: sum3 -= *v3++ * sum1;
895: sum4 -= *v4++ * sum1;
896: sum3 -= *v3++ * sum2;
897: sum4 -= *v4++ * sum2;
898: sum4 -= *v4++ * sum3;
900: tmp[row++]=sum1;
901: tmp[row++]=sum2;
902: tmp[row++]=sum3;
903: tmp[row++]=sum4;
904: break;
905: case 5:
906: sum1 = b[*r++];
907: sum2 = b[*r++];
908: sum3 = b[*r++];
909: sum4 = b[*r++];
910: sum5 = b[*r++];
911: v2 = aa + ai[row+1];
912: v3 = aa + ai[row+2];
913: v4 = aa + ai[row+3];
914: v5 = aa + ai[row+4];
916: for (j=0; j<nz-1; j+=2) {
917: i0 = vi[0];
918: i1 = vi[1];
919: vi +=2;
920: tmp0 = tmps[i0];
921: tmp1 = tmps[i1];
922: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
923: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
924: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
925: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
926: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
927: }
928: if (j == nz-1) {
929: tmp0 = tmps[*vi++];
930: sum1 -= *v1++ *tmp0;
931: sum2 -= *v2++ *tmp0;
932: sum3 -= *v3++ *tmp0;
933: sum4 -= *v4++ *tmp0;
934: sum5 -= *v5++ *tmp0;
935: }
937: sum2 -= *v2++ * sum1;
938: sum3 -= *v3++ * sum1;
939: sum4 -= *v4++ * sum1;
940: sum5 -= *v5++ * sum1;
941: sum3 -= *v3++ * sum2;
942: sum4 -= *v4++ * sum2;
943: sum5 -= *v5++ * sum2;
944: sum4 -= *v4++ * sum3;
945: sum5 -= *v5++ * sum3;
946: sum5 -= *v5++ * sum4;
948: tmp[row++]=sum1;
949: tmp[row++]=sum2;
950: tmp[row++]=sum3;
951: tmp[row++]=sum4;
952: tmp[row++]=sum5;
953: break;
954: default:
955: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
956: }
957: }
958: /* backward solve the upper triangular */
959: for (i=node_max -1,row = n-1; i>=0; i--) {
960: nsz = ns[i];
961: aii = ai[row+1] -1;
962: v1 = aa + aii;
963: vi = aj + aii;
964: nz = aii- ad[row];
965: switch (nsz) { /* Each loop in 'case' is unrolled */
966: case 1:
967: sum1 = tmp[row];
969: for (j=nz; j>1; j-=2) {
970: vi -=2;
971: i0 = vi[2];
972: i1 = vi[1];
973: tmp0 = tmps[i0];
974: tmp1 = tmps[i1];
975: v1 -= 2;
976: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
977: }
978: if (j==1) {
979: tmp0 = tmps[*vi--];
980: sum1 -= *v1-- * tmp0;
981: }
982: x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
983: break;
984: case 2:
985: sum1 = tmp[row];
986: sum2 = tmp[row -1];
987: v2 = aa + ai[row]-1;
988: for (j=nz; j>1; j-=2) {
989: vi -=2;
990: i0 = vi[2];
991: i1 = vi[1];
992: tmp0 = tmps[i0];
993: tmp1 = tmps[i1];
994: v1 -= 2;
995: v2 -= 2;
996: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
997: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
998: }
999: if (j==1) {
1000: tmp0 = tmps[*vi--];
1001: sum1 -= *v1-- * tmp0;
1002: sum2 -= *v2-- * tmp0;
1003: }
1005: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1006: sum2 -= *v2-- * tmp0;
1007: x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1008: break;
1009: case 3:
1010: sum1 = tmp[row];
1011: sum2 = tmp[row -1];
1012: sum3 = tmp[row -2];
1013: v2 = aa + ai[row]-1;
1014: v3 = aa + ai[row -1]-1;
1015: for (j=nz; j>1; j-=2) {
1016: vi -=2;
1017: i0 = vi[2];
1018: i1 = vi[1];
1019: tmp0 = tmps[i0];
1020: tmp1 = tmps[i1];
1021: v1 -= 2;
1022: v2 -= 2;
1023: v3 -= 2;
1024: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1025: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1026: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1027: }
1028: if (j==1) {
1029: tmp0 = tmps[*vi--];
1030: sum1 -= *v1-- * tmp0;
1031: sum2 -= *v2-- * tmp0;
1032: sum3 -= *v3-- * tmp0;
1033: }
1034: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1035: sum2 -= *v2-- * tmp0;
1036: sum3 -= *v3-- * tmp0;
1037: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1038: sum3 -= *v3-- * tmp0;
1039: x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1041: break;
1042: case 4:
1043: sum1 = tmp[row];
1044: sum2 = tmp[row -1];
1045: sum3 = tmp[row -2];
1046: sum4 = tmp[row -3];
1047: v2 = aa + ai[row]-1;
1048: v3 = aa + ai[row -1]-1;
1049: v4 = aa + ai[row -2]-1;
1051: for (j=nz; j>1; j-=2) {
1052: vi -=2;
1053: i0 = vi[2];
1054: i1 = vi[1];
1055: tmp0 = tmps[i0];
1056: tmp1 = tmps[i1];
1057: v1 -= 2;
1058: v2 -= 2;
1059: v3 -= 2;
1060: v4 -= 2;
1061: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1062: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1063: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1064: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1065: }
1066: if (j==1) {
1067: tmp0 = tmps[*vi--];
1068: sum1 -= *v1-- * tmp0;
1069: sum2 -= *v2-- * tmp0;
1070: sum3 -= *v3-- * tmp0;
1071: sum4 -= *v4-- * tmp0;
1072: }
1074: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1075: sum2 -= *v2-- * tmp0;
1076: sum3 -= *v3-- * tmp0;
1077: sum4 -= *v4-- * tmp0;
1078: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1079: sum3 -= *v3-- * tmp0;
1080: sum4 -= *v4-- * tmp0;
1081: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1082: sum4 -= *v4-- * tmp0;
1083: x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1084: break;
1085: case 5:
1086: sum1 = tmp[row];
1087: sum2 = tmp[row -1];
1088: sum3 = tmp[row -2];
1089: sum4 = tmp[row -3];
1090: sum5 = tmp[row -4];
1091: v2 = aa + ai[row]-1;
1092: v3 = aa + ai[row -1]-1;
1093: v4 = aa + ai[row -2]-1;
1094: v5 = aa + ai[row -3]-1;
1095: for (j=nz; j>1; j-=2) {
1096: vi -= 2;
1097: i0 = vi[2];
1098: i1 = vi[1];
1099: tmp0 = tmps[i0];
1100: tmp1 = tmps[i1];
1101: v1 -= 2;
1102: v2 -= 2;
1103: v3 -= 2;
1104: v4 -= 2;
1105: v5 -= 2;
1106: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1107: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1108: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1109: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1110: sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1111: }
1112: if (j==1) {
1113: tmp0 = tmps[*vi--];
1114: sum1 -= *v1-- * tmp0;
1115: sum2 -= *v2-- * tmp0;
1116: sum3 -= *v3-- * tmp0;
1117: sum4 -= *v4-- * tmp0;
1118: sum5 -= *v5-- * tmp0;
1119: }
1121: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1122: sum2 -= *v2-- * tmp0;
1123: sum3 -= *v3-- * tmp0;
1124: sum4 -= *v4-- * tmp0;
1125: sum5 -= *v5-- * tmp0;
1126: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1127: sum3 -= *v3-- * tmp0;
1128: sum4 -= *v4-- * tmp0;
1129: sum5 -= *v5-- * tmp0;
1130: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1131: sum4 -= *v4-- * tmp0;
1132: sum5 -= *v5-- * tmp0;
1133: tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1134: sum5 -= *v5-- * tmp0;
1135: x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1136: break;
1137: default:
1138: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1139: }
1140: }
1141: ISRestoreIndices(isrow,&rout);
1142: ISRestoreIndices(iscol,&cout);
1143: VecRestoreArrayRead(bb,&b);
1144: VecRestoreArrayWrite(xx,&x);
1145: PetscLogFlops(2.0*a->nz - A->cmap->n);
1146: return(0);
1147: }
1149: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1150: {
1151: Mat C =B;
1152: Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1153: IS isrow = b->row,isicol = b->icol;
1154: PetscErrorCode ierr;
1155: const PetscInt *r,*ic,*ics;
1156: const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1157: PetscInt i,j,k,nz,nzL,row,*pj;
1158: const PetscInt *ajtmp,*bjtmp;
1159: MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1160: const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1161: FactorShiftCtx sctx;
1162: const PetscInt *ddiag;
1163: PetscReal rs;
1164: MatScalar d;
1165: PetscInt inod,nodesz,node_max,col;
1166: const PetscInt *ns;
1167: PetscInt *tmp_vec1,*tmp_vec2,*nsmap;
1170: /* MatPivotSetUp(): initialize shift context sctx */
1171: PetscMemzero(&sctx,sizeof(FactorShiftCtx));
1173: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1174: ddiag = a->diag;
1175: sctx.shift_top = info->zeropivot;
1176: for (i=0; i<n; i++) {
1177: /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1178: d = (aa)[ddiag[i]];
1179: rs = -PetscAbsScalar(d) - PetscRealPart(d);
1180: v = aa+ai[i];
1181: nz = ai[i+1] - ai[i];
1182: for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1183: if (rs>sctx.shift_top) sctx.shift_top = rs;
1184: }
1185: sctx.shift_top *= 1.1;
1186: sctx.nshift_max = 5;
1187: sctx.shift_lo = 0.;
1188: sctx.shift_hi = 1.;
1189: }
1191: ISGetIndices(isrow,&r);
1192: ISGetIndices(isicol,&ic);
1194: PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);
1195: ics = ic;
1197: node_max = a->inode.node_count;
1198: ns = a->inode.size;
1199: if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1201: /* If max inode size > 4, split it into two inodes.*/
1202: /* also map the inode sizes according to the ordering */
1203: PetscMalloc1(n+1,&tmp_vec1);
1204: for (i=0,j=0; i<node_max; ++i,++j) {
1205: if (ns[i] > 4) {
1206: tmp_vec1[j] = 4;
1207: ++j;
1208: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1209: } else {
1210: tmp_vec1[j] = ns[i];
1211: }
1212: }
1213: /* Use the correct node_max */
1214: node_max = j;
1216: /* Now reorder the inode info based on mat re-ordering info */
1217: /* First create a row -> inode_size_array_index map */
1218: PetscMalloc1(n+1,&nsmap);
1219: PetscMalloc1(node_max+1,&tmp_vec2);
1220: for (i=0,row=0; i<node_max; i++) {
1221: nodesz = tmp_vec1[i];
1222: for (j=0; j<nodesz; j++,row++) {
1223: nsmap[row] = i;
1224: }
1225: }
1226: /* Using nsmap, create a reordered ns structure */
1227: for (i=0,j=0; i< node_max; i++) {
1228: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1229: tmp_vec2[i] = nodesz;
1230: j += nodesz;
1231: }
1232: PetscFree(nsmap);
1233: PetscFree(tmp_vec1);
1235: /* Now use the correct ns */
1236: ns = tmp_vec2;
1238: do {
1239: sctx.newshift = PETSC_FALSE;
1240: /* Now loop over each block-row, and do the factorization */
1241: for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1242: nodesz = ns[inod];
1244: switch (nodesz) {
1245: case 1:
1246: /*----------*/
1247: /* zero rtmp1 */
1248: /* L part */
1249: nz = bi[i+1] - bi[i];
1250: bjtmp = bj + bi[i];
1251: for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1253: /* U part */
1254: nz = bdiag[i]-bdiag[i+1];
1255: bjtmp = bj + bdiag[i+1]+1;
1256: for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1258: /* load in initial (unfactored row) */
1259: nz = ai[r[i]+1] - ai[r[i]];
1260: ajtmp = aj + ai[r[i]];
1261: v = aa + ai[r[i]];
1262: for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1264: /* ZeropivotApply() */
1265: rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1267: /* elimination */
1268: bjtmp = bj + bi[i];
1269: row = *bjtmp++;
1270: nzL = bi[i+1] - bi[i];
1271: for (k=0; k < nzL; k++) {
1272: pc = rtmp1 + row;
1273: if (*pc != 0.0) {
1274: pv = b->a + bdiag[row];
1275: mul1 = *pc * (*pv);
1276: *pc = mul1;
1277: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1278: pv = b->a + bdiag[row+1]+1;
1279: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1280: for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1281: PetscLogFlops(1+2.0*nz);
1282: }
1283: row = *bjtmp++;
1284: }
1286: /* finished row so stick it into b->a */
1287: rs = 0.0;
1288: /* L part */
1289: pv = b->a + bi[i];
1290: pj = b->j + bi[i];
1291: nz = bi[i+1] - bi[i];
1292: for (j=0; j<nz; j++) {
1293: pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1294: }
1296: /* U part */
1297: pv = b->a + bdiag[i+1]+1;
1298: pj = b->j + bdiag[i+1]+1;
1299: nz = bdiag[i] - bdiag[i+1]-1;
1300: for (j=0; j<nz; j++) {
1301: pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1302: }
1304: /* Check zero pivot */
1305: sctx.rs = rs;
1306: sctx.pv = rtmp1[i];
1307: MatPivotCheck(B,A,info,&sctx,i);
1308: if (sctx.newshift) break;
1310: /* Mark diagonal and invert diagonal for simplier triangular solves */
1311: pv = b->a + bdiag[i];
1312: *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1313: break;
1315: case 2:
1316: /*----------*/
1317: /* zero rtmp1 and rtmp2 */
1318: /* L part */
1319: nz = bi[i+1] - bi[i];
1320: bjtmp = bj + bi[i];
1321: for (j=0; j<nz; j++) {
1322: col = bjtmp[j];
1323: rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1324: }
1326: /* U part */
1327: nz = bdiag[i]-bdiag[i+1];
1328: bjtmp = bj + bdiag[i+1]+1;
1329: for (j=0; j<nz; j++) {
1330: col = bjtmp[j];
1331: rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1332: }
1334: /* load in initial (unfactored row) */
1335: nz = ai[r[i]+1] - ai[r[i]];
1336: ajtmp = aj + ai[r[i]];
1337: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1338: for (j=0; j<nz; j++) {
1339: col = ics[ajtmp[j]];
1340: rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1341: }
1342: /* ZeropivotApply(): shift the diagonal of the matrix */
1343: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1345: /* elimination */
1346: bjtmp = bj + bi[i];
1347: row = *bjtmp++; /* pivot row */
1348: nzL = bi[i+1] - bi[i];
1349: for (k=0; k < nzL; k++) {
1350: pc1 = rtmp1 + row;
1351: pc2 = rtmp2 + row;
1352: if (*pc1 != 0.0 || *pc2 != 0.0) {
1353: pv = b->a + bdiag[row];
1354: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1355: *pc1 = mul1; *pc2 = mul2;
1357: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1358: pv = b->a + bdiag[row+1]+1;
1359: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1360: for (j=0; j<nz; j++) {
1361: col = pj[j];
1362: rtmp1[col] -= mul1 * pv[j];
1363: rtmp2[col] -= mul2 * pv[j];
1364: }
1365: PetscLogFlops(2+4.0*nz);
1366: }
1367: row = *bjtmp++;
1368: }
1370: /* finished row i; check zero pivot, then stick row i into b->a */
1371: rs = 0.0;
1372: /* L part */
1373: pc1 = b->a + bi[i];
1374: pj = b->j + bi[i];
1375: nz = bi[i+1] - bi[i];
1376: for (j=0; j<nz; j++) {
1377: col = pj[j];
1378: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1379: }
1380: /* U part */
1381: pc1 = b->a + bdiag[i+1]+1;
1382: pj = b->j + bdiag[i+1]+1;
1383: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1384: for (j=0; j<nz; j++) {
1385: col = pj[j];
1386: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1387: }
1389: sctx.rs = rs;
1390: sctx.pv = rtmp1[i];
1391: MatPivotCheck(B,A,info,&sctx,i);
1392: if (sctx.newshift) break;
1393: pc1 = b->a + bdiag[i]; /* Mark diagonal */
1394: *pc1 = 1.0/sctx.pv;
1396: /* Now take care of diagonal 2x2 block. */
1397: pc2 = rtmp2 + i;
1398: if (*pc2 != 0.0) {
1399: mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1400: *pc2 = mul1; /* insert L entry */
1401: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1402: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1403: for (j=0; j<nz; j++) {
1404: col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1405: }
1406: PetscLogFlops(1+2.0*nz);
1407: }
1409: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1410: rs = 0.0;
1411: /* L part */
1412: pc2 = b->a + bi[i+1];
1413: pj = b->j + bi[i+1];
1414: nz = bi[i+2] - bi[i+1];
1415: for (j=0; j<nz; j++) {
1416: col = pj[j];
1417: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1418: }
1419: /* U part */
1420: pc2 = b->a + bdiag[i+2]+1;
1421: pj = b->j + bdiag[i+2]+1;
1422: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1423: for (j=0; j<nz; j++) {
1424: col = pj[j];
1425: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1426: }
1428: sctx.rs = rs;
1429: sctx.pv = rtmp2[i+1];
1430: MatPivotCheck(B,A,info,&sctx,i+1);
1431: if (sctx.newshift) break;
1432: pc2 = b->a + bdiag[i+1];
1433: *pc2 = 1.0/sctx.pv;
1434: break;
1436: case 3:
1437: /*----------*/
1438: /* zero rtmp */
1439: /* L part */
1440: nz = bi[i+1] - bi[i];
1441: bjtmp = bj + bi[i];
1442: for (j=0; j<nz; j++) {
1443: col = bjtmp[j];
1444: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1445: }
1447: /* U part */
1448: nz = bdiag[i]-bdiag[i+1];
1449: bjtmp = bj + bdiag[i+1]+1;
1450: for (j=0; j<nz; j++) {
1451: col = bjtmp[j];
1452: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1453: }
1455: /* load in initial (unfactored row) */
1456: nz = ai[r[i]+1] - ai[r[i]];
1457: ajtmp = aj + ai[r[i]];
1458: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1459: for (j=0; j<nz; j++) {
1460: col = ics[ajtmp[j]];
1461: rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1462: }
1463: /* ZeropivotApply(): shift the diagonal of the matrix */
1464: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1466: /* elimination */
1467: bjtmp = bj + bi[i];
1468: row = *bjtmp++; /* pivot row */
1469: nzL = bi[i+1] - bi[i];
1470: for (k=0; k < nzL; k++) {
1471: pc1 = rtmp1 + row;
1472: pc2 = rtmp2 + row;
1473: pc3 = rtmp3 + row;
1474: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1475: pv = b->a + bdiag[row];
1476: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1477: *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1479: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1480: pv = b->a + bdiag[row+1]+1;
1481: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1482: for (j=0; j<nz; j++) {
1483: col = pj[j];
1484: rtmp1[col] -= mul1 * pv[j];
1485: rtmp2[col] -= mul2 * pv[j];
1486: rtmp3[col] -= mul3 * pv[j];
1487: }
1488: PetscLogFlops(3+6.0*nz);
1489: }
1490: row = *bjtmp++;
1491: }
1493: /* finished row i; check zero pivot, then stick row i into b->a */
1494: rs = 0.0;
1495: /* L part */
1496: pc1 = b->a + bi[i];
1497: pj = b->j + bi[i];
1498: nz = bi[i+1] - bi[i];
1499: for (j=0; j<nz; j++) {
1500: col = pj[j];
1501: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1502: }
1503: /* U part */
1504: pc1 = b->a + bdiag[i+1]+1;
1505: pj = b->j + bdiag[i+1]+1;
1506: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1507: for (j=0; j<nz; j++) {
1508: col = pj[j];
1509: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1510: }
1512: sctx.rs = rs;
1513: sctx.pv = rtmp1[i];
1514: MatPivotCheck(B,A,info,&sctx,i);
1515: if (sctx.newshift) break;
1516: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1517: *pc1 = 1.0/sctx.pv;
1519: /* Now take care of 1st column of diagonal 3x3 block. */
1520: pc2 = rtmp2 + i;
1521: pc3 = rtmp3 + i;
1522: if (*pc2 != 0.0 || *pc3 != 0.0) {
1523: mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1524: mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1525: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1526: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1527: for (j=0; j<nz; j++) {
1528: col = pj[j];
1529: rtmp2[col] -= mul2 * rtmp1[col];
1530: rtmp3[col] -= mul3 * rtmp1[col];
1531: }
1532: PetscLogFlops(2+4.0*nz);
1533: }
1535: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1536: rs = 0.0;
1537: /* L part */
1538: pc2 = b->a + bi[i+1];
1539: pj = b->j + bi[i+1];
1540: nz = bi[i+2] - bi[i+1];
1541: for (j=0; j<nz; j++) {
1542: col = pj[j];
1543: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1544: }
1545: /* U part */
1546: pc2 = b->a + bdiag[i+2]+1;
1547: pj = b->j + bdiag[i+2]+1;
1548: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1549: for (j=0; j<nz; j++) {
1550: col = pj[j];
1551: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1552: }
1554: sctx.rs = rs;
1555: sctx.pv = rtmp2[i+1];
1556: MatPivotCheck(B,A,info,&sctx,i+1);
1557: if (sctx.newshift) break;
1558: pc2 = b->a + bdiag[i+1];
1559: *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1561: /* Now take care of 2nd column of diagonal 3x3 block. */
1562: pc3 = rtmp3 + i+1;
1563: if (*pc3 != 0.0) {
1564: mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1565: pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */
1566: nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1567: for (j=0; j<nz; j++) {
1568: col = pj[j];
1569: rtmp3[col] -= mul3 * rtmp2[col];
1570: }
1571: PetscLogFlops(1+2.0*nz);
1572: }
1574: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1575: rs = 0.0;
1576: /* L part */
1577: pc3 = b->a + bi[i+2];
1578: pj = b->j + bi[i+2];
1579: nz = bi[i+3] - bi[i+2];
1580: for (j=0; j<nz; j++) {
1581: col = pj[j];
1582: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1583: }
1584: /* U part */
1585: pc3 = b->a + bdiag[i+3]+1;
1586: pj = b->j + bdiag[i+3]+1;
1587: nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1588: for (j=0; j<nz; j++) {
1589: col = pj[j];
1590: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1591: }
1593: sctx.rs = rs;
1594: sctx.pv = rtmp3[i+2];
1595: MatPivotCheck(B,A,info,&sctx,i+2);
1596: if (sctx.newshift) break;
1597: pc3 = b->a + bdiag[i+2];
1598: *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1599: break;
1600: case 4:
1601: /*----------*/
1602: /* zero rtmp */
1603: /* L part */
1604: nz = bi[i+1] - bi[i];
1605: bjtmp = bj + bi[i];
1606: for (j=0; j<nz; j++) {
1607: col = bjtmp[j];
1608: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1609: }
1611: /* U part */
1612: nz = bdiag[i]-bdiag[i+1];
1613: bjtmp = bj + bdiag[i+1]+1;
1614: for (j=0; j<nz; j++) {
1615: col = bjtmp[j];
1616: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1617: }
1619: /* load in initial (unfactored row) */
1620: nz = ai[r[i]+1] - ai[r[i]];
1621: ajtmp = aj + ai[r[i]];
1622: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1623: for (j=0; j<nz; j++) {
1624: col = ics[ajtmp[j]];
1625: rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1626: }
1627: /* ZeropivotApply(): shift the diagonal of the matrix */
1628: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1630: /* elimination */
1631: bjtmp = bj + bi[i];
1632: row = *bjtmp++; /* pivot row */
1633: nzL = bi[i+1] - bi[i];
1634: for (k=0; k < nzL; k++) {
1635: pc1 = rtmp1 + row;
1636: pc2 = rtmp2 + row;
1637: pc3 = rtmp3 + row;
1638: pc4 = rtmp4 + row;
1639: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1640: pv = b->a + bdiag[row];
1641: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1642: *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1644: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1645: pv = b->a + bdiag[row+1]+1;
1646: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1647: for (j=0; j<nz; j++) {
1648: col = pj[j];
1649: rtmp1[col] -= mul1 * pv[j];
1650: rtmp2[col] -= mul2 * pv[j];
1651: rtmp3[col] -= mul3 * pv[j];
1652: rtmp4[col] -= mul4 * pv[j];
1653: }
1654: PetscLogFlops(4+8.0*nz);
1655: }
1656: row = *bjtmp++;
1657: }
1659: /* finished row i; check zero pivot, then stick row i into b->a */
1660: rs = 0.0;
1661: /* L part */
1662: pc1 = b->a + bi[i];
1663: pj = b->j + bi[i];
1664: nz = bi[i+1] - bi[i];
1665: for (j=0; j<nz; j++) {
1666: col = pj[j];
1667: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1668: }
1669: /* U part */
1670: pc1 = b->a + bdiag[i+1]+1;
1671: pj = b->j + bdiag[i+1]+1;
1672: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1673: for (j=0; j<nz; j++) {
1674: col = pj[j];
1675: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1676: }
1678: sctx.rs = rs;
1679: sctx.pv = rtmp1[i];
1680: MatPivotCheck(B,A,info,&sctx,i);
1681: if (sctx.newshift) break;
1682: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1683: *pc1 = 1.0/sctx.pv;
1685: /* Now take care of 1st column of diagonal 4x4 block. */
1686: pc2 = rtmp2 + i;
1687: pc3 = rtmp3 + i;
1688: pc4 = rtmp4 + i;
1689: if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1690: mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1691: mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1692: mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1693: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1694: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1695: for (j=0; j<nz; j++) {
1696: col = pj[j];
1697: rtmp2[col] -= mul2 * rtmp1[col];
1698: rtmp3[col] -= mul3 * rtmp1[col];
1699: rtmp4[col] -= mul4 * rtmp1[col];
1700: }
1701: PetscLogFlops(3+6.0*nz);
1702: }
1704: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1705: rs = 0.0;
1706: /* L part */
1707: pc2 = b->a + bi[i+1];
1708: pj = b->j + bi[i+1];
1709: nz = bi[i+2] - bi[i+1];
1710: for (j=0; j<nz; j++) {
1711: col = pj[j];
1712: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1713: }
1714: /* U part */
1715: pc2 = b->a + bdiag[i+2]+1;
1716: pj = b->j + bdiag[i+2]+1;
1717: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1718: for (j=0; j<nz; j++) {
1719: col = pj[j];
1720: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1721: }
1723: sctx.rs = rs;
1724: sctx.pv = rtmp2[i+1];
1725: MatPivotCheck(B,A,info,&sctx,i+1);
1726: if (sctx.newshift) break;
1727: pc2 = b->a + bdiag[i+1];
1728: *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1730: /* Now take care of 2nd column of diagonal 4x4 block. */
1731: pc3 = rtmp3 + i+1;
1732: pc4 = rtmp4 + i+1;
1733: if (*pc3 != 0.0 || *pc4 != 0.0) {
1734: mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1735: mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1736: pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */
1737: nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1738: for (j=0; j<nz; j++) {
1739: col = pj[j];
1740: rtmp3[col] -= mul3 * rtmp2[col];
1741: rtmp4[col] -= mul4 * rtmp2[col];
1742: }
1743: PetscLogFlops(4.0*nz);
1744: }
1746: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1747: rs = 0.0;
1748: /* L part */
1749: pc3 = b->a + bi[i+2];
1750: pj = b->j + bi[i+2];
1751: nz = bi[i+3] - bi[i+2];
1752: for (j=0; j<nz; j++) {
1753: col = pj[j];
1754: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1755: }
1756: /* U part */
1757: pc3 = b->a + bdiag[i+3]+1;
1758: pj = b->j + bdiag[i+3]+1;
1759: nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1760: for (j=0; j<nz; j++) {
1761: col = pj[j];
1762: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1763: }
1765: sctx.rs = rs;
1766: sctx.pv = rtmp3[i+2];
1767: MatPivotCheck(B,A,info,&sctx,i+2);
1768: if (sctx.newshift) break;
1769: pc3 = b->a + bdiag[i+2];
1770: *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1772: /* Now take care of 3rd column of diagonal 4x4 block. */
1773: pc4 = rtmp4 + i+2;
1774: if (*pc4 != 0.0) {
1775: mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1776: pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */
1777: nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1778: for (j=0; j<nz; j++) {
1779: col = pj[j];
1780: rtmp4[col] -= mul4 * rtmp3[col];
1781: }
1782: PetscLogFlops(1+2.0*nz);
1783: }
1785: /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1786: rs = 0.0;
1787: /* L part */
1788: pc4 = b->a + bi[i+3];
1789: pj = b->j + bi[i+3];
1790: nz = bi[i+4] - bi[i+3];
1791: for (j=0; j<nz; j++) {
1792: col = pj[j];
1793: pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1794: }
1795: /* U part */
1796: pc4 = b->a + bdiag[i+4]+1;
1797: pj = b->j + bdiag[i+4]+1;
1798: nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1799: for (j=0; j<nz; j++) {
1800: col = pj[j];
1801: pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1802: }
1804: sctx.rs = rs;
1805: sctx.pv = rtmp4[i+3];
1806: MatPivotCheck(B,A,info,&sctx,i+3);
1807: if (sctx.newshift) break;
1808: pc4 = b->a + bdiag[i+3];
1809: *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1810: break;
1812: default:
1813: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1814: }
1815: if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1816: i += nodesz; /* Update the row */
1817: }
1819: /* MatPivotRefine() */
1820: if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1821: /*
1822: * if no shift in this attempt & shifting & started shifting & can refine,
1823: * then try lower shift
1824: */
1825: sctx.shift_hi = sctx.shift_fraction;
1826: sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1827: sctx.shift_amount = sctx.shift_fraction * sctx.shift_top;
1828: sctx.newshift = PETSC_TRUE;
1829: sctx.nshift++;
1830: }
1831: } while (sctx.newshift);
1833: PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);
1834: PetscFree(tmp_vec2);
1835: ISRestoreIndices(isicol,&ic);
1836: ISRestoreIndices(isrow,&r);
1838: if (b->inode.size) {
1839: C->ops->solve = MatSolve_SeqAIJ_Inode;
1840: } else {
1841: C->ops->solve = MatSolve_SeqAIJ;
1842: }
1843: C->ops->solveadd = MatSolveAdd_SeqAIJ;
1844: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ;
1845: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1846: C->ops->matsolve = MatMatSolve_SeqAIJ;
1847: C->assembled = PETSC_TRUE;
1848: C->preallocated = PETSC_TRUE;
1850: PetscLogFlops(C->cmap->n);
1852: /* MatShiftView(A,info,&sctx) */
1853: if (sctx.nshift) {
1854: if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1855: PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
1856: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1857: PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
1858: } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1859: PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);
1860: }
1861: }
1862: return(0);
1863: }
1865: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1866: {
1867: Mat C = B;
1868: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1869: IS iscol = b->col,isrow = b->row,isicol = b->icol;
1870: PetscErrorCode ierr;
1871: const PetscInt *r,*ic,*c,*ics;
1872: PetscInt n = A->rmap->n,*bi = b->i;
1873: PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1874: PetscInt i,j,idx,*bd = b->diag,node_max,nodesz;
1875: PetscInt *ai = a->i,*aj = a->j;
1876: PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1877: PetscScalar mul1,mul2,mul3,tmp;
1878: MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1879: const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1880: PetscReal rs=0.0;
1881: FactorShiftCtx sctx;
1884: sctx.shift_top = 0;
1885: sctx.nshift_max = 0;
1886: sctx.shift_lo = 0;
1887: sctx.shift_hi = 0;
1888: sctx.shift_fraction = 0;
1890: /* if both shift schemes are chosen by user, only use info->shiftpd */
1891: if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1892: sctx.shift_top = 0;
1893: for (i=0; i<n; i++) {
1894: /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1895: rs = 0.0;
1896: ajtmp = aj + ai[i];
1897: rtmp1 = aa + ai[i];
1898: nz = ai[i+1] - ai[i];
1899: for (j=0; j<nz; j++) {
1900: if (*ajtmp != i) {
1901: rs += PetscAbsScalar(*rtmp1++);
1902: } else {
1903: rs -= PetscRealPart(*rtmp1++);
1904: }
1905: ajtmp++;
1906: }
1907: if (rs>sctx.shift_top) sctx.shift_top = rs;
1908: }
1909: if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1910: sctx.shift_top *= 1.1;
1911: sctx.nshift_max = 5;
1912: sctx.shift_lo = 0.;
1913: sctx.shift_hi = 1.;
1914: }
1915: sctx.shift_amount = 0;
1916: sctx.nshift = 0;
1918: ISGetIndices(isrow,&r);
1919: ISGetIndices(iscol,&c);
1920: ISGetIndices(isicol,&ic);
1921: PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);
1922: ics = ic;
1924: node_max = a->inode.node_count;
1925: ns = a->inode.size;
1926: if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1928: /* If max inode size > 3, split it into two inodes.*/
1929: /* also map the inode sizes according to the ordering */
1930: PetscMalloc1(n+1,&tmp_vec1);
1931: for (i=0,j=0; i<node_max; ++i,++j) {
1932: if (ns[i]>3) {
1933: tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */
1934: ++j;
1935: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1936: } else {
1937: tmp_vec1[j] = ns[i];
1938: }
1939: }
1940: /* Use the correct node_max */
1941: node_max = j;
1943: /* Now reorder the inode info based on mat re-ordering info */
1944: /* First create a row -> inode_size_array_index map */
1945: PetscMalloc2(n+1,&nsmap,node_max+1,&tmp_vec2);
1946: for (i=0,row=0; i<node_max; i++) {
1947: nodesz = tmp_vec1[i];
1948: for (j=0; j<nodesz; j++,row++) {
1949: nsmap[row] = i;
1950: }
1951: }
1952: /* Using nsmap, create a reordered ns structure */
1953: for (i=0,j=0; i< node_max; i++) {
1954: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1955: tmp_vec2[i] = nodesz;
1956: j += nodesz;
1957: }
1958: PetscFree2(nsmap,tmp_vec1);
1959: /* Now use the correct ns */
1960: ns = tmp_vec2;
1962: do {
1963: sctx.newshift = PETSC_FALSE;
1964: /* Now loop over each block-row, and do the factorization */
1965: for (i=0,row=0; i<node_max; i++) {
1966: nodesz = ns[i];
1967: nz = bi[row+1] - bi[row];
1968: bjtmp = bj + bi[row];
1970: switch (nodesz) {
1971: case 1:
1972: for (j=0; j<nz; j++) {
1973: idx = bjtmp[j];
1974: rtmp11[idx] = 0.0;
1975: }
1977: /* load in initial (unfactored row) */
1978: idx = r[row];
1979: nz_tmp = ai[idx+1] - ai[idx];
1980: ajtmp = aj + ai[idx];
1981: v1 = aa + ai[idx];
1983: for (j=0; j<nz_tmp; j++) {
1984: idx = ics[ajtmp[j]];
1985: rtmp11[idx] = v1[j];
1986: }
1987: rtmp11[ics[r[row]]] += sctx.shift_amount;
1989: prow = *bjtmp++;
1990: while (prow < row) {
1991: pc1 = rtmp11 + prow;
1992: if (*pc1 != 0.0) {
1993: pv = ba + bd[prow];
1994: pj = nbj + bd[prow];
1995: mul1 = *pc1 * *pv++;
1996: *pc1 = mul1;
1997: nz_tmp = bi[prow+1] - bd[prow] - 1;
1998: PetscLogFlops(1+2.0*nz_tmp);
1999: for (j=0; j<nz_tmp; j++) {
2000: tmp = pv[j];
2001: idx = pj[j];
2002: rtmp11[idx] -= mul1 * tmp;
2003: }
2004: }
2005: prow = *bjtmp++;
2006: }
2007: pj = bj + bi[row];
2008: pc1 = ba + bi[row];
2010: sctx.pv = rtmp11[row];
2011: rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2012: rs = 0.0;
2013: for (j=0; j<nz; j++) {
2014: idx = pj[j];
2015: pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2016: if (idx != row) rs += PetscAbsScalar(pc1[j]);
2017: }
2018: sctx.rs = rs;
2019: MatPivotCheck(B,A,info,&sctx,row);
2020: if (sctx.newshift) goto endofwhile;
2021: break;
2023: case 2:
2024: for (j=0; j<nz; j++) {
2025: idx = bjtmp[j];
2026: rtmp11[idx] = 0.0;
2027: rtmp22[idx] = 0.0;
2028: }
2030: /* load in initial (unfactored row) */
2031: idx = r[row];
2032: nz_tmp = ai[idx+1] - ai[idx];
2033: ajtmp = aj + ai[idx];
2034: v1 = aa + ai[idx];
2035: v2 = aa + ai[idx+1];
2036: for (j=0; j<nz_tmp; j++) {
2037: idx = ics[ajtmp[j]];
2038: rtmp11[idx] = v1[j];
2039: rtmp22[idx] = v2[j];
2040: }
2041: rtmp11[ics[r[row]]] += sctx.shift_amount;
2042: rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2044: prow = *bjtmp++;
2045: while (prow < row) {
2046: pc1 = rtmp11 + prow;
2047: pc2 = rtmp22 + prow;
2048: if (*pc1 != 0.0 || *pc2 != 0.0) {
2049: pv = ba + bd[prow];
2050: pj = nbj + bd[prow];
2051: mul1 = *pc1 * *pv;
2052: mul2 = *pc2 * *pv;
2053: ++pv;
2054: *pc1 = mul1;
2055: *pc2 = mul2;
2057: nz_tmp = bi[prow+1] - bd[prow] - 1;
2058: for (j=0; j<nz_tmp; j++) {
2059: tmp = pv[j];
2060: idx = pj[j];
2061: rtmp11[idx] -= mul1 * tmp;
2062: rtmp22[idx] -= mul2 * tmp;
2063: }
2064: PetscLogFlops(2+4.0*nz_tmp);
2065: }
2066: prow = *bjtmp++;
2067: }
2069: /* Now take care of diagonal 2x2 block. Note: prow = row here */
2070: pc1 = rtmp11 + prow;
2071: pc2 = rtmp22 + prow;
2073: sctx.pv = *pc1;
2074: pj = bj + bi[prow];
2075: rs = 0.0;
2076: for (j=0; j<nz; j++) {
2077: idx = pj[j];
2078: if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2079: }
2080: sctx.rs = rs;
2081: MatPivotCheck(B,A,info,&sctx,row);
2082: if (sctx.newshift) goto endofwhile;
2084: if (*pc2 != 0.0) {
2085: pj = nbj + bd[prow];
2086: mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2087: *pc2 = mul2;
2088: nz_tmp = bi[prow+1] - bd[prow] - 1;
2089: for (j=0; j<nz_tmp; j++) {
2090: idx = pj[j];
2091: tmp = rtmp11[idx];
2092: rtmp22[idx] -= mul2 * tmp;
2093: }
2094: PetscLogFlops(1+2.0*nz_tmp);
2095: }
2097: pj = bj + bi[row];
2098: pc1 = ba + bi[row];
2099: pc2 = ba + bi[row+1];
2101: sctx.pv = rtmp22[row+1];
2102: rs = 0.0;
2103: rtmp11[row] = 1.0/rtmp11[row];
2104: rtmp22[row+1] = 1.0/rtmp22[row+1];
2105: /* copy row entries from dense representation to sparse */
2106: for (j=0; j<nz; j++) {
2107: idx = pj[j];
2108: pc1[j] = rtmp11[idx];
2109: pc2[j] = rtmp22[idx];
2110: if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2111: }
2112: sctx.rs = rs;
2113: MatPivotCheck(B,A,info,&sctx,row+1);
2114: if (sctx.newshift) goto endofwhile;
2115: break;
2117: case 3:
2118: for (j=0; j<nz; j++) {
2119: idx = bjtmp[j];
2120: rtmp11[idx] = 0.0;
2121: rtmp22[idx] = 0.0;
2122: rtmp33[idx] = 0.0;
2123: }
2124: /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2125: idx = r[row];
2126: nz_tmp = ai[idx+1] - ai[idx];
2127: ajtmp = aj + ai[idx];
2128: v1 = aa + ai[idx];
2129: v2 = aa + ai[idx+1];
2130: v3 = aa + ai[idx+2];
2131: for (j=0; j<nz_tmp; j++) {
2132: idx = ics[ajtmp[j]];
2133: rtmp11[idx] = v1[j];
2134: rtmp22[idx] = v2[j];
2135: rtmp33[idx] = v3[j];
2136: }
2137: rtmp11[ics[r[row]]] += sctx.shift_amount;
2138: rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2139: rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2141: /* loop over all pivot row blocks above this row block */
2142: prow = *bjtmp++;
2143: while (prow < row) {
2144: pc1 = rtmp11 + prow;
2145: pc2 = rtmp22 + prow;
2146: pc3 = rtmp33 + prow;
2147: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2148: pv = ba + bd[prow];
2149: pj = nbj + bd[prow];
2150: mul1 = *pc1 * *pv;
2151: mul2 = *pc2 * *pv;
2152: mul3 = *pc3 * *pv;
2153: ++pv;
2154: *pc1 = mul1;
2155: *pc2 = mul2;
2156: *pc3 = mul3;
2158: nz_tmp = bi[prow+1] - bd[prow] - 1;
2159: /* update this row based on pivot row */
2160: for (j=0; j<nz_tmp; j++) {
2161: tmp = pv[j];
2162: idx = pj[j];
2163: rtmp11[idx] -= mul1 * tmp;
2164: rtmp22[idx] -= mul2 * tmp;
2165: rtmp33[idx] -= mul3 * tmp;
2166: }
2167: PetscLogFlops(3+6.0*nz_tmp);
2168: }
2169: prow = *bjtmp++;
2170: }
2172: /* Now take care of diagonal 3x3 block in this set of rows */
2173: /* note: prow = row here */
2174: pc1 = rtmp11 + prow;
2175: pc2 = rtmp22 + prow;
2176: pc3 = rtmp33 + prow;
2178: sctx.pv = *pc1;
2179: pj = bj + bi[prow];
2180: rs = 0.0;
2181: for (j=0; j<nz; j++) {
2182: idx = pj[j];
2183: if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2184: }
2185: sctx.rs = rs;
2186: MatPivotCheck(B,A,info,&sctx,row);
2187: if (sctx.newshift) goto endofwhile;
2189: if (*pc2 != 0.0 || *pc3 != 0.0) {
2190: mul2 = (*pc2)/(*pc1);
2191: mul3 = (*pc3)/(*pc1);
2192: *pc2 = mul2;
2193: *pc3 = mul3;
2194: nz_tmp = bi[prow+1] - bd[prow] - 1;
2195: pj = nbj + bd[prow];
2196: for (j=0; j<nz_tmp; j++) {
2197: idx = pj[j];
2198: tmp = rtmp11[idx];
2199: rtmp22[idx] -= mul2 * tmp;
2200: rtmp33[idx] -= mul3 * tmp;
2201: }
2202: PetscLogFlops(2+4.0*nz_tmp);
2203: }
2204: ++prow;
2206: pc2 = rtmp22 + prow;
2207: pc3 = rtmp33 + prow;
2208: sctx.pv = *pc2;
2209: pj = bj + bi[prow];
2210: rs = 0.0;
2211: for (j=0; j<nz; j++) {
2212: idx = pj[j];
2213: if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2214: }
2215: sctx.rs = rs;
2216: MatPivotCheck(B,A,info,&sctx,row+1);
2217: if (sctx.newshift) goto endofwhile;
2219: if (*pc3 != 0.0) {
2220: mul3 = (*pc3)/(*pc2);
2221: *pc3 = mul3;
2222: pj = nbj + bd[prow];
2223: nz_tmp = bi[prow+1] - bd[prow] - 1;
2224: for (j=0; j<nz_tmp; j++) {
2225: idx = pj[j];
2226: tmp = rtmp22[idx];
2227: rtmp33[idx] -= mul3 * tmp;
2228: }
2229: PetscLogFlops(1+2.0*nz_tmp);
2230: }
2232: pj = bj + bi[row];
2233: pc1 = ba + bi[row];
2234: pc2 = ba + bi[row+1];
2235: pc3 = ba + bi[row+2];
2237: sctx.pv = rtmp33[row+2];
2238: rs = 0.0;
2239: rtmp11[row] = 1.0/rtmp11[row];
2240: rtmp22[row+1] = 1.0/rtmp22[row+1];
2241: rtmp33[row+2] = 1.0/rtmp33[row+2];
2242: /* copy row entries from dense representation to sparse */
2243: for (j=0; j<nz; j++) {
2244: idx = pj[j];
2245: pc1[j] = rtmp11[idx];
2246: pc2[j] = rtmp22[idx];
2247: pc3[j] = rtmp33[idx];
2248: if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2249: }
2251: sctx.rs = rs;
2252: MatPivotCheck(B,A,info,&sctx,row+2);
2253: if (sctx.newshift) goto endofwhile;
2254: break;
2256: default:
2257: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2258: }
2259: row += nodesz; /* Update the row */
2260: }
2261: endofwhile:;
2262: } while (sctx.newshift);
2263: PetscFree3(rtmp11,rtmp22,rtmp33);
2264: PetscFree(tmp_vec2);
2265: ISRestoreIndices(isicol,&ic);
2266: ISRestoreIndices(isrow,&r);
2267: ISRestoreIndices(iscol,&c);
2269: (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2270: /* do not set solve add, since MatSolve_Inode + Add is faster */
2271: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace;
2272: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2273: C->assembled = PETSC_TRUE;
2274: C->preallocated = PETSC_TRUE;
2275: if (sctx.nshift) {
2276: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2277: PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
2278: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2279: PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
2280: }
2281: }
2282: PetscLogFlops(C->cmap->n);
2283: MatSeqAIJCheckInode(C);
2284: return(0);
2285: }
2288: /* ----------------------------------------------------------- */
2289: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2290: {
2291: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2292: IS iscol = a->col,isrow = a->row;
2293: PetscErrorCode ierr;
2294: const PetscInt *r,*c,*rout,*cout;
2295: PetscInt i,j,n = A->rmap->n;
2296: PetscInt node_max,row,nsz,aii,i0,i1,nz;
2297: const PetscInt *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2298: PetscScalar *x,*tmp,*tmps,tmp0,tmp1;
2299: PetscScalar sum1,sum2,sum3,sum4,sum5;
2300: const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2301: const PetscScalar *b;
2304: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2305: node_max = a->inode.node_count;
2306: ns = a->inode.size; /* Node Size array */
2308: VecGetArrayRead(bb,&b);
2309: VecGetArrayWrite(xx,&x);
2310: tmp = a->solve_work;
2312: ISGetIndices(isrow,&rout); r = rout;
2313: ISGetIndices(iscol,&cout); c = cout;
2315: /* forward solve the lower triangular */
2316: tmps = tmp;
2317: aa = a_a;
2318: aj = a_j;
2319: ad = a->diag;
2321: for (i = 0,row = 0; i< node_max; ++i) {
2322: nsz = ns[i];
2323: aii = ai[row];
2324: v1 = aa + aii;
2325: vi = aj + aii;
2326: nz = ai[row+1]- ai[row];
2328: if (i < node_max-1) {
2329: /* Prefetch the indices for the next block */
2330: PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2331: /* Prefetch the data for the next block */
2332: PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2333: }
2335: switch (nsz) { /* Each loop in 'case' is unrolled */
2336: case 1:
2337: sum1 = b[r[row]];
2338: for (j=0; j<nz-1; j+=2) {
2339: i0 = vi[j];
2340: i1 = vi[j+1];
2341: tmp0 = tmps[i0];
2342: tmp1 = tmps[i1];
2343: sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2344: }
2345: if (j == nz-1) {
2346: tmp0 = tmps[vi[j]];
2347: sum1 -= v1[j]*tmp0;
2348: }
2349: tmp[row++]=sum1;
2350: break;
2351: case 2:
2352: sum1 = b[r[row]];
2353: sum2 = b[r[row+1]];
2354: v2 = aa + ai[row+1];
2356: for (j=0; j<nz-1; j+=2) {
2357: i0 = vi[j];
2358: i1 = vi[j+1];
2359: tmp0 = tmps[i0];
2360: tmp1 = tmps[i1];
2361: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2362: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2363: }
2364: if (j == nz-1) {
2365: tmp0 = tmps[vi[j]];
2366: sum1 -= v1[j] *tmp0;
2367: sum2 -= v2[j] *tmp0;
2368: }
2369: sum2 -= v2[nz] * sum1;
2370: tmp[row++]=sum1;
2371: tmp[row++]=sum2;
2372: break;
2373: case 3:
2374: sum1 = b[r[row]];
2375: sum2 = b[r[row+1]];
2376: sum3 = b[r[row+2]];
2377: v2 = aa + ai[row+1];
2378: v3 = aa + ai[row+2];
2380: for (j=0; j<nz-1; j+=2) {
2381: i0 = vi[j];
2382: i1 = vi[j+1];
2383: tmp0 = tmps[i0];
2384: tmp1 = tmps[i1];
2385: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2386: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2387: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2388: }
2389: if (j == nz-1) {
2390: tmp0 = tmps[vi[j]];
2391: sum1 -= v1[j] *tmp0;
2392: sum2 -= v2[j] *tmp0;
2393: sum3 -= v3[j] *tmp0;
2394: }
2395: sum2 -= v2[nz] * sum1;
2396: sum3 -= v3[nz] * sum1;
2397: sum3 -= v3[nz+1] * sum2;
2398: tmp[row++]=sum1;
2399: tmp[row++]=sum2;
2400: tmp[row++]=sum3;
2401: break;
2403: case 4:
2404: sum1 = b[r[row]];
2405: sum2 = b[r[row+1]];
2406: sum3 = b[r[row+2]];
2407: sum4 = b[r[row+3]];
2408: v2 = aa + ai[row+1];
2409: v3 = aa + ai[row+2];
2410: v4 = aa + ai[row+3];
2412: for (j=0; j<nz-1; j+=2) {
2413: i0 = vi[j];
2414: i1 = vi[j+1];
2415: tmp0 = tmps[i0];
2416: tmp1 = tmps[i1];
2417: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2418: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2419: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2420: sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2421: }
2422: if (j == nz-1) {
2423: tmp0 = tmps[vi[j]];
2424: sum1 -= v1[j] *tmp0;
2425: sum2 -= v2[j] *tmp0;
2426: sum3 -= v3[j] *tmp0;
2427: sum4 -= v4[j] *tmp0;
2428: }
2429: sum2 -= v2[nz] * sum1;
2430: sum3 -= v3[nz] * sum1;
2431: sum4 -= v4[nz] * sum1;
2432: sum3 -= v3[nz+1] * sum2;
2433: sum4 -= v4[nz+1] * sum2;
2434: sum4 -= v4[nz+2] * sum3;
2436: tmp[row++]=sum1;
2437: tmp[row++]=sum2;
2438: tmp[row++]=sum3;
2439: tmp[row++]=sum4;
2440: break;
2441: case 5:
2442: sum1 = b[r[row]];
2443: sum2 = b[r[row+1]];
2444: sum3 = b[r[row+2]];
2445: sum4 = b[r[row+3]];
2446: sum5 = b[r[row+4]];
2447: v2 = aa + ai[row+1];
2448: v3 = aa + ai[row+2];
2449: v4 = aa + ai[row+3];
2450: v5 = aa + ai[row+4];
2452: for (j=0; j<nz-1; j+=2) {
2453: i0 = vi[j];
2454: i1 = vi[j+1];
2455: tmp0 = tmps[i0];
2456: tmp1 = tmps[i1];
2457: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2458: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2459: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2460: sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2461: sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2462: }
2463: if (j == nz-1) {
2464: tmp0 = tmps[vi[j]];
2465: sum1 -= v1[j] *tmp0;
2466: sum2 -= v2[j] *tmp0;
2467: sum3 -= v3[j] *tmp0;
2468: sum4 -= v4[j] *tmp0;
2469: sum5 -= v5[j] *tmp0;
2470: }
2472: sum2 -= v2[nz] * sum1;
2473: sum3 -= v3[nz] * sum1;
2474: sum4 -= v4[nz] * sum1;
2475: sum5 -= v5[nz] * sum1;
2476: sum3 -= v3[nz+1] * sum2;
2477: sum4 -= v4[nz+1] * sum2;
2478: sum5 -= v5[nz+1] * sum2;
2479: sum4 -= v4[nz+2] * sum3;
2480: sum5 -= v5[nz+2] * sum3;
2481: sum5 -= v5[nz+3] * sum4;
2483: tmp[row++]=sum1;
2484: tmp[row++]=sum2;
2485: tmp[row++]=sum3;
2486: tmp[row++]=sum4;
2487: tmp[row++]=sum5;
2488: break;
2489: default:
2490: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2491: }
2492: }
2493: /* backward solve the upper triangular */
2494: for (i=node_max -1,row = n-1; i>=0; i--) {
2495: nsz = ns[i];
2496: aii = ad[row+1] + 1;
2497: v1 = aa + aii;
2498: vi = aj + aii;
2499: nz = ad[row]- ad[row+1] - 1;
2501: if (i > 0) {
2502: /* Prefetch the indices for the next block */
2503: PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2504: /* Prefetch the data for the next block */
2505: PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2506: }
2508: switch (nsz) { /* Each loop in 'case' is unrolled */
2509: case 1:
2510: sum1 = tmp[row];
2512: for (j=0; j<nz-1; j+=2) {
2513: i0 = vi[j];
2514: i1 = vi[j+1];
2515: tmp0 = tmps[i0];
2516: tmp1 = tmps[i1];
2517: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2518: }
2519: if (j == nz-1) {
2520: tmp0 = tmps[vi[j]];
2521: sum1 -= v1[j]*tmp0;
2522: }
2523: x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2524: break;
2525: case 2:
2526: sum1 = tmp[row];
2527: sum2 = tmp[row-1];
2528: v2 = aa + ad[row] + 1;
2529: for (j=0; j<nz-1; j+=2) {
2530: i0 = vi[j];
2531: i1 = vi[j+1];
2532: tmp0 = tmps[i0];
2533: tmp1 = tmps[i1];
2534: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2535: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2536: }
2537: if (j == nz-1) {
2538: tmp0 = tmps[vi[j]];
2539: sum1 -= v1[j]* tmp0;
2540: sum2 -= v2[j+1]* tmp0;
2541: }
2543: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2544: sum2 -= v2[0] * tmp0;
2545: x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2546: break;
2547: case 3:
2548: sum1 = tmp[row];
2549: sum2 = tmp[row -1];
2550: sum3 = tmp[row -2];
2551: v2 = aa + ad[row] + 1;
2552: v3 = aa + ad[row -1] + 1;
2553: for (j=0; j<nz-1; j+=2) {
2554: i0 = vi[j];
2555: i1 = vi[j+1];
2556: tmp0 = tmps[i0];
2557: tmp1 = tmps[i1];
2558: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2559: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2560: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2561: }
2562: if (j== nz-1) {
2563: tmp0 = tmps[vi[j]];
2564: sum1 -= v1[j] * tmp0;
2565: sum2 -= v2[j+1] * tmp0;
2566: sum3 -= v3[j+2] * tmp0;
2567: }
2568: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2569: sum2 -= v2[0]* tmp0;
2570: sum3 -= v3[1] * tmp0;
2571: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2572: sum3 -= v3[0]* tmp0;
2573: x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2575: break;
2576: case 4:
2577: sum1 = tmp[row];
2578: sum2 = tmp[row -1];
2579: sum3 = tmp[row -2];
2580: sum4 = tmp[row -3];
2581: v2 = aa + ad[row]+1;
2582: v3 = aa + ad[row -1]+1;
2583: v4 = aa + ad[row -2]+1;
2585: for (j=0; j<nz-1; j+=2) {
2586: i0 = vi[j];
2587: i1 = vi[j+1];
2588: tmp0 = tmps[i0];
2589: tmp1 = tmps[i1];
2590: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2591: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2592: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2593: sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2594: }
2595: if (j== nz-1) {
2596: tmp0 = tmps[vi[j]];
2597: sum1 -= v1[j] * tmp0;
2598: sum2 -= v2[j+1] * tmp0;
2599: sum3 -= v3[j+2] * tmp0;
2600: sum4 -= v4[j+3] * tmp0;
2601: }
2603: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2604: sum2 -= v2[0] * tmp0;
2605: sum3 -= v3[1] * tmp0;
2606: sum4 -= v4[2] * tmp0;
2607: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2608: sum3 -= v3[0] * tmp0;
2609: sum4 -= v4[1] * tmp0;
2610: tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2611: sum4 -= v4[0] * tmp0;
2612: x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2613: break;
2614: case 5:
2615: sum1 = tmp[row];
2616: sum2 = tmp[row -1];
2617: sum3 = tmp[row -2];
2618: sum4 = tmp[row -3];
2619: sum5 = tmp[row -4];
2620: v2 = aa + ad[row]+1;
2621: v3 = aa + ad[row -1]+1;
2622: v4 = aa + ad[row -2]+1;
2623: v5 = aa + ad[row -3]+1;
2624: for (j=0; j<nz-1; j+=2) {
2625: i0 = vi[j];
2626: i1 = vi[j+1];
2627: tmp0 = tmps[i0];
2628: tmp1 = tmps[i1];
2629: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2630: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2631: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2632: sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2633: sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2634: }
2635: if (j==nz-1) {
2636: tmp0 = tmps[vi[j]];
2637: sum1 -= v1[j] * tmp0;
2638: sum2 -= v2[j+1] * tmp0;
2639: sum3 -= v3[j+2] * tmp0;
2640: sum4 -= v4[j+3] * tmp0;
2641: sum5 -= v5[j+4] * tmp0;
2642: }
2644: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2645: sum2 -= v2[0] * tmp0;
2646: sum3 -= v3[1] * tmp0;
2647: sum4 -= v4[2] * tmp0;
2648: sum5 -= v5[3] * tmp0;
2649: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2650: sum3 -= v3[0] * tmp0;
2651: sum4 -= v4[1] * tmp0;
2652: sum5 -= v5[2] * tmp0;
2653: tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2654: sum4 -= v4[0] * tmp0;
2655: sum5 -= v5[1] * tmp0;
2656: tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2657: sum5 -= v5[0] * tmp0;
2658: x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2659: break;
2660: default:
2661: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2662: }
2663: }
2664: ISRestoreIndices(isrow,&rout);
2665: ISRestoreIndices(iscol,&cout);
2666: VecRestoreArrayRead(bb,&b);
2667: VecRestoreArrayWrite(xx,&x);
2668: PetscLogFlops(2.0*a->nz - A->cmap->n);
2669: return(0);
2670: }
2673: /*
2674: Makes a longer coloring[] array and calls the usual code with that
2675: */
2676: PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2677: {
2678: Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data;
2679: PetscErrorCode ierr;
2680: PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2681: PetscInt *colorused,i;
2682: ISColoringValue *newcolor;
2685: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2686: PetscMalloc1(n+1,&newcolor);
2687: /* loop over inodes, marking a color for each column*/
2688: row = 0;
2689: for (i=0; i<m; i++) {
2690: for (j=0; j<ns[i]; j++) {
2691: newcolor[row++] = coloring[i] + j*ncolors;
2692: }
2693: }
2695: /* eliminate unneeded colors */
2696: PetscCalloc1(5*ncolors,&colorused);
2697: for (i=0; i<n; i++) {
2698: colorused[newcolor[i]] = 1;
2699: }
2701: for (i=1; i<5*ncolors; i++) {
2702: colorused[i] += colorused[i-1];
2703: }
2704: ncolors = colorused[5*ncolors-1];
2705: for (i=0; i<n; i++) {
2706: newcolor[i] = colorused[newcolor[i]]-1;
2707: }
2708: PetscFree(colorused);
2709: ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);
2710: PetscFree(coloring);
2711: return(0);
2712: }
2714: #include <petsc/private/kernels/blockinvert.h>
2716: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2717: {
2718: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2719: PetscScalar sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2720: MatScalar *ibdiag,*bdiag,work[25],*t;
2721: PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2722: const MatScalar *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2723: const PetscScalar *xb, *b;
2724: PetscReal zeropivot = 100.*PETSC_MACHINE_EPSILON, shift = 0.0;
2725: PetscErrorCode ierr;
2726: PetscInt n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2727: PetscInt sz,k,ipvt[5];
2728: PetscBool allowzeropivot,zeropivotdetected;
2729: const PetscInt *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;
2732: allowzeropivot = PetscNot(A->erroriffailure);
2733: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2734: if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2735: if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2737: if (!a->inode.ibdiagvalid) {
2738: if (!a->inode.ibdiag) {
2739: /* calculate space needed for diagonal blocks */
2740: for (i=0; i<m; i++) {
2741: cnt += sizes[i]*sizes[i];
2742: }
2743: a->inode.bdiagsize = cnt;
2745: PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);
2746: }
2748: /* copy over the diagonal blocks and invert them */
2749: ibdiag = a->inode.ibdiag;
2750: bdiag = a->inode.bdiag;
2751: cnt = 0;
2752: for (i=0, row = 0; i<m; i++) {
2753: for (j=0; j<sizes[i]; j++) {
2754: for (k=0; k<sizes[i]; k++) {
2755: bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2756: }
2757: }
2758: PetscArraycpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]);
2760: switch (sizes[i]) {
2761: case 1:
2762: /* Create matrix data structure */
2763: if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2764: if (allowzeropivot) {
2765: A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2766: A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2767: A->factorerror_zeropivot_row = row;
2768: PetscInfo1(A,"Zero pivot, row %D\n",row);
2769: } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2770: }
2771: ibdiag[cnt] = 1.0/ibdiag[cnt];
2772: break;
2773: case 2:
2774: PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2775: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2776: break;
2777: case 3:
2778: PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2779: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2780: break;
2781: case 4:
2782: PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2783: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2784: break;
2785: case 5:
2786: PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);
2787: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2788: break;
2789: default:
2790: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2791: }
2792: cnt += sizes[i]*sizes[i];
2793: row += sizes[i];
2794: }
2795: a->inode.ibdiagvalid = PETSC_TRUE;
2796: }
2797: ibdiag = a->inode.ibdiag;
2798: bdiag = a->inode.bdiag;
2799: t = a->inode.ssor_work;
2801: VecGetArray(xx,&x);
2802: VecGetArrayRead(bb,&b);
2803: /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2804: if (flag & SOR_ZERO_INITIAL_GUESS) {
2805: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2807: for (i=0, row=0; i<m; i++) {
2808: sz = diag[row] - ii[row];
2809: v1 = a->a + ii[row];
2810: idx = a->j + ii[row];
2812: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2813: switch (sizes[i]) {
2814: case 1:
2816: sum1 = b[row];
2817: for (n = 0; n<sz-1; n+=2) {
2818: i1 = idx[0];
2819: i2 = idx[1];
2820: idx += 2;
2821: tmp0 = x[i1];
2822: tmp1 = x[i2];
2823: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2824: }
2826: if (n == sz-1) {
2827: tmp0 = x[*idx];
2828: sum1 -= *v1 * tmp0;
2829: }
2830: t[row] = sum1;
2831: x[row++] = sum1*(*ibdiag++);
2832: break;
2833: case 2:
2834: v2 = a->a + ii[row+1];
2835: sum1 = b[row];
2836: sum2 = b[row+1];
2837: for (n = 0; n<sz-1; n+=2) {
2838: i1 = idx[0];
2839: i2 = idx[1];
2840: idx += 2;
2841: tmp0 = x[i1];
2842: tmp1 = x[i2];
2843: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2844: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2845: }
2847: if (n == sz-1) {
2848: tmp0 = x[*idx];
2849: sum1 -= v1[0] * tmp0;
2850: sum2 -= v2[0] * tmp0;
2851: }
2852: t[row] = sum1;
2853: t[row+1] = sum2;
2854: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2855: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2856: ibdiag += 4;
2857: break;
2858: case 3:
2859: v2 = a->a + ii[row+1];
2860: v3 = a->a + ii[row+2];
2861: sum1 = b[row];
2862: sum2 = b[row+1];
2863: sum3 = b[row+2];
2864: for (n = 0; n<sz-1; n+=2) {
2865: i1 = idx[0];
2866: i2 = idx[1];
2867: idx += 2;
2868: tmp0 = x[i1];
2869: tmp1 = x[i2];
2870: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2871: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2872: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2873: }
2875: if (n == sz-1) {
2876: tmp0 = x[*idx];
2877: sum1 -= v1[0] * tmp0;
2878: sum2 -= v2[0] * tmp0;
2879: sum3 -= v3[0] * tmp0;
2880: }
2881: t[row] = sum1;
2882: t[row+1] = sum2;
2883: t[row+2] = sum3;
2884: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2885: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2886: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2887: ibdiag += 9;
2888: break;
2889: case 4:
2890: v2 = a->a + ii[row+1];
2891: v3 = a->a + ii[row+2];
2892: v4 = a->a + ii[row+3];
2893: sum1 = b[row];
2894: sum2 = b[row+1];
2895: sum3 = b[row+2];
2896: sum4 = b[row+3];
2897: for (n = 0; n<sz-1; n+=2) {
2898: i1 = idx[0];
2899: i2 = idx[1];
2900: idx += 2;
2901: tmp0 = x[i1];
2902: tmp1 = x[i2];
2903: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2904: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2905: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2906: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2907: }
2909: if (n == sz-1) {
2910: tmp0 = x[*idx];
2911: sum1 -= v1[0] * tmp0;
2912: sum2 -= v2[0] * tmp0;
2913: sum3 -= v3[0] * tmp0;
2914: sum4 -= v4[0] * tmp0;
2915: }
2916: t[row] = sum1;
2917: t[row+1] = sum2;
2918: t[row+2] = sum3;
2919: t[row+3] = sum4;
2920: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2921: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2922: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2923: x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2924: ibdiag += 16;
2925: break;
2926: case 5:
2927: v2 = a->a + ii[row+1];
2928: v3 = a->a + ii[row+2];
2929: v4 = a->a + ii[row+3];
2930: v5 = a->a + ii[row+4];
2931: sum1 = b[row];
2932: sum2 = b[row+1];
2933: sum3 = b[row+2];
2934: sum4 = b[row+3];
2935: sum5 = b[row+4];
2936: for (n = 0; n<sz-1; n+=2) {
2937: i1 = idx[0];
2938: i2 = idx[1];
2939: idx += 2;
2940: tmp0 = x[i1];
2941: tmp1 = x[i2];
2942: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2943: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2944: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2945: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2946: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2947: }
2949: if (n == sz-1) {
2950: tmp0 = x[*idx];
2951: sum1 -= v1[0] * tmp0;
2952: sum2 -= v2[0] * tmp0;
2953: sum3 -= v3[0] * tmp0;
2954: sum4 -= v4[0] * tmp0;
2955: sum5 -= v5[0] * tmp0;
2956: }
2957: t[row] = sum1;
2958: t[row+1] = sum2;
2959: t[row+2] = sum3;
2960: t[row+3] = sum4;
2961: t[row+4] = sum5;
2962: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2963: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2964: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2965: x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2966: x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2967: ibdiag += 25;
2968: break;
2969: default:
2970: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2971: }
2972: }
2974: xb = t;
2975: PetscLogFlops(a->nz);
2976: } else xb = b;
2977: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
2979: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2980: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2981: ibdiag -= sizes[i]*sizes[i];
2982: sz = ii[row+1] - diag[row] - 1;
2983: v1 = a->a + diag[row] + 1;
2984: idx = a->j + diag[row] + 1;
2986: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2987: switch (sizes[i]) {
2988: case 1:
2990: sum1 = xb[row];
2991: for (n = 0; n<sz-1; n+=2) {
2992: i1 = idx[0];
2993: i2 = idx[1];
2994: idx += 2;
2995: tmp0 = x[i1];
2996: tmp1 = x[i2];
2997: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2998: }
3000: if (n == sz-1) {
3001: tmp0 = x[*idx];
3002: sum1 -= *v1*tmp0;
3003: }
3004: x[row--] = sum1*(*ibdiag);
3005: break;
3007: case 2:
3009: sum1 = xb[row];
3010: sum2 = xb[row-1];
3011: /* note that sum1 is associated with the second of the two rows */
3012: v2 = a->a + diag[row-1] + 2;
3013: for (n = 0; n<sz-1; n+=2) {
3014: i1 = idx[0];
3015: i2 = idx[1];
3016: idx += 2;
3017: tmp0 = x[i1];
3018: tmp1 = x[i2];
3019: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3020: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3021: }
3023: if (n == sz-1) {
3024: tmp0 = x[*idx];
3025: sum1 -= *v1*tmp0;
3026: sum2 -= *v2*tmp0;
3027: }
3028: x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3029: x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3030: break;
3031: case 3:
3033: sum1 = xb[row];
3034: sum2 = xb[row-1];
3035: sum3 = xb[row-2];
3036: v2 = a->a + diag[row-1] + 2;
3037: v3 = a->a + diag[row-2] + 3;
3038: for (n = 0; n<sz-1; n+=2) {
3039: i1 = idx[0];
3040: i2 = idx[1];
3041: idx += 2;
3042: tmp0 = x[i1];
3043: tmp1 = x[i2];
3044: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3045: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3046: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3047: }
3049: if (n == sz-1) {
3050: tmp0 = x[*idx];
3051: sum1 -= *v1*tmp0;
3052: sum2 -= *v2*tmp0;
3053: sum3 -= *v3*tmp0;
3054: }
3055: x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3056: x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3057: x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3058: break;
3059: case 4:
3061: sum1 = xb[row];
3062: sum2 = xb[row-1];
3063: sum3 = xb[row-2];
3064: sum4 = xb[row-3];
3065: v2 = a->a + diag[row-1] + 2;
3066: v3 = a->a + diag[row-2] + 3;
3067: v4 = a->a + diag[row-3] + 4;
3068: for (n = 0; n<sz-1; n+=2) {
3069: i1 = idx[0];
3070: i2 = idx[1];
3071: idx += 2;
3072: tmp0 = x[i1];
3073: tmp1 = x[i2];
3074: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3075: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3076: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3077: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3078: }
3080: if (n == sz-1) {
3081: tmp0 = x[*idx];
3082: sum1 -= *v1*tmp0;
3083: sum2 -= *v2*tmp0;
3084: sum3 -= *v3*tmp0;
3085: sum4 -= *v4*tmp0;
3086: }
3087: x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3088: x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3089: x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3090: x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3091: break;
3092: case 5:
3094: sum1 = xb[row];
3095: sum2 = xb[row-1];
3096: sum3 = xb[row-2];
3097: sum4 = xb[row-3];
3098: sum5 = xb[row-4];
3099: v2 = a->a + diag[row-1] + 2;
3100: v3 = a->a + diag[row-2] + 3;
3101: v4 = a->a + diag[row-3] + 4;
3102: v5 = a->a + diag[row-4] + 5;
3103: for (n = 0; n<sz-1; n+=2) {
3104: i1 = idx[0];
3105: i2 = idx[1];
3106: idx += 2;
3107: tmp0 = x[i1];
3108: tmp1 = x[i2];
3109: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3110: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3111: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3112: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3113: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3114: }
3116: if (n == sz-1) {
3117: tmp0 = x[*idx];
3118: sum1 -= *v1*tmp0;
3119: sum2 -= *v2*tmp0;
3120: sum3 -= *v3*tmp0;
3121: sum4 -= *v4*tmp0;
3122: sum5 -= *v5*tmp0;
3123: }
3124: x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3125: x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3126: x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3127: x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3128: x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3129: break;
3130: default:
3131: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3132: }
3133: }
3135: PetscLogFlops(a->nz);
3136: }
3137: its--;
3138: }
3139: while (its--) {
3141: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3142: for (i=0, row=0, ibdiag = a->inode.ibdiag;
3143: i<m;
3144: row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {
3146: sz = diag[row] - ii[row];
3147: v1 = a->a + ii[row];
3148: idx = a->j + ii[row];
3149: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3150: switch (sizes[i]) {
3151: case 1:
3152: sum1 = b[row];
3153: for (n = 0; n<sz-1; n+=2) {
3154: i1 = idx[0];
3155: i2 = idx[1];
3156: idx += 2;
3157: tmp0 = x[i1];
3158: tmp1 = x[i2];
3159: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3160: }
3161: if (n == sz-1) {
3162: tmp0 = x[*idx++];
3163: sum1 -= *v1 * tmp0;
3164: v1++;
3165: }
3166: t[row] = sum1;
3167: sz = ii[row+1] - diag[row] - 1;
3168: idx = a->j + diag[row] + 1;
3169: v1 += 1;
3170: for (n = 0; n<sz-1; n+=2) {
3171: i1 = idx[0];
3172: i2 = idx[1];
3173: idx += 2;
3174: tmp0 = x[i1];
3175: tmp1 = x[i2];
3176: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3177: }
3178: if (n == sz-1) {
3179: tmp0 = x[*idx++];
3180: sum1 -= *v1 * tmp0;
3181: }
3182: /* in MatSOR_SeqAIJ this line would be
3183: *
3184: * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3185: *
3186: * but omega == 1, so this becomes
3187: *
3188: * x[row] = sum1*(*ibdiag++);
3189: *
3190: */
3191: x[row] = sum1*(*ibdiag);
3192: break;
3193: case 2:
3194: v2 = a->a + ii[row+1];
3195: sum1 = b[row];
3196: sum2 = b[row+1];
3197: for (n = 0; n<sz-1; n+=2) {
3198: i1 = idx[0];
3199: i2 = idx[1];
3200: idx += 2;
3201: tmp0 = x[i1];
3202: tmp1 = x[i2];
3203: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3204: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3205: }
3206: if (n == sz-1) {
3207: tmp0 = x[*idx++];
3208: sum1 -= v1[0] * tmp0;
3209: sum2 -= v2[0] * tmp0;
3210: v1++; v2++;
3211: }
3212: t[row] = sum1;
3213: t[row+1] = sum2;
3214: sz = ii[row+1] - diag[row] - 2;
3215: idx = a->j + diag[row] + 2;
3216: v1 += 2;
3217: v2 += 2;
3218: for (n = 0; n<sz-1; n+=2) {
3219: i1 = idx[0];
3220: i2 = idx[1];
3221: idx += 2;
3222: tmp0 = x[i1];
3223: tmp1 = x[i2];
3224: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3225: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3226: }
3227: if (n == sz-1) {
3228: tmp0 = x[*idx];
3229: sum1 -= v1[0] * tmp0;
3230: sum2 -= v2[0] * tmp0;
3231: }
3232: x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3233: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3234: break;
3235: case 3:
3236: v2 = a->a + ii[row+1];
3237: v3 = a->a + ii[row+2];
3238: sum1 = b[row];
3239: sum2 = b[row+1];
3240: sum3 = b[row+2];
3241: for (n = 0; n<sz-1; n+=2) {
3242: i1 = idx[0];
3243: i2 = idx[1];
3244: idx += 2;
3245: tmp0 = x[i1];
3246: tmp1 = x[i2];
3247: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3248: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3249: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3250: }
3251: if (n == sz-1) {
3252: tmp0 = x[*idx++];
3253: sum1 -= v1[0] * tmp0;
3254: sum2 -= v2[0] * tmp0;
3255: sum3 -= v3[0] * tmp0;
3256: v1++; v2++; v3++;
3257: }
3258: t[row] = sum1;
3259: t[row+1] = sum2;
3260: t[row+2] = sum3;
3261: sz = ii[row+1] - diag[row] - 3;
3262: idx = a->j + diag[row] + 3;
3263: v1 += 3;
3264: v2 += 3;
3265: v3 += 3;
3266: for (n = 0; n<sz-1; n+=2) {
3267: i1 = idx[0];
3268: i2 = idx[1];
3269: idx += 2;
3270: tmp0 = x[i1];
3271: tmp1 = x[i2];
3272: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3273: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3274: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3275: }
3276: if (n == sz-1) {
3277: tmp0 = x[*idx];
3278: sum1 -= v1[0] * tmp0;
3279: sum2 -= v2[0] * tmp0;
3280: sum3 -= v3[0] * tmp0;
3281: }
3282: x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3283: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3284: x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3285: break;
3286: case 4:
3287: v2 = a->a + ii[row+1];
3288: v3 = a->a + ii[row+2];
3289: v4 = a->a + ii[row+3];
3290: sum1 = b[row];
3291: sum2 = b[row+1];
3292: sum3 = b[row+2];
3293: sum4 = b[row+3];
3294: for (n = 0; n<sz-1; n+=2) {
3295: i1 = idx[0];
3296: i2 = idx[1];
3297: idx += 2;
3298: tmp0 = x[i1];
3299: tmp1 = x[i2];
3300: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3301: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3302: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3303: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3304: }
3305: if (n == sz-1) {
3306: tmp0 = x[*idx++];
3307: sum1 -= v1[0] * tmp0;
3308: sum2 -= v2[0] * tmp0;
3309: sum3 -= v3[0] * tmp0;
3310: sum4 -= v4[0] * tmp0;
3311: v1++; v2++; v3++; v4++;
3312: }
3313: t[row] = sum1;
3314: t[row+1] = sum2;
3315: t[row+2] = sum3;
3316: t[row+3] = sum4;
3317: sz = ii[row+1] - diag[row] - 4;
3318: idx = a->j + diag[row] + 4;
3319: v1 += 4;
3320: v2 += 4;
3321: v3 += 4;
3322: v4 += 4;
3323: for (n = 0; n<sz-1; n+=2) {
3324: i1 = idx[0];
3325: i2 = idx[1];
3326: idx += 2;
3327: tmp0 = x[i1];
3328: tmp1 = x[i2];
3329: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3330: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3331: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3332: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3333: }
3334: if (n == sz-1) {
3335: tmp0 = x[*idx];
3336: sum1 -= v1[0] * tmp0;
3337: sum2 -= v2[0] * tmp0;
3338: sum3 -= v3[0] * tmp0;
3339: sum4 -= v4[0] * tmp0;
3340: }
3341: x[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3342: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3343: x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3344: x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3345: break;
3346: case 5:
3347: v2 = a->a + ii[row+1];
3348: v3 = a->a + ii[row+2];
3349: v4 = a->a + ii[row+3];
3350: v5 = a->a + ii[row+4];
3351: sum1 = b[row];
3352: sum2 = b[row+1];
3353: sum3 = b[row+2];
3354: sum4 = b[row+3];
3355: sum5 = b[row+4];
3356: for (n = 0; n<sz-1; n+=2) {
3357: i1 = idx[0];
3358: i2 = idx[1];
3359: idx += 2;
3360: tmp0 = x[i1];
3361: tmp1 = x[i2];
3362: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3363: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3364: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3365: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3366: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3367: }
3368: if (n == sz-1) {
3369: tmp0 = x[*idx++];
3370: sum1 -= v1[0] * tmp0;
3371: sum2 -= v2[0] * tmp0;
3372: sum3 -= v3[0] * tmp0;
3373: sum4 -= v4[0] * tmp0;
3374: sum5 -= v5[0] * tmp0;
3375: v1++; v2++; v3++; v4++; v5++;
3376: }
3377: t[row] = sum1;
3378: t[row+1] = sum2;
3379: t[row+2] = sum3;
3380: t[row+3] = sum4;
3381: t[row+4] = sum5;
3382: sz = ii[row+1] - diag[row] - 5;
3383: idx = a->j + diag[row] + 5;
3384: v1 += 5;
3385: v2 += 5;
3386: v3 += 5;
3387: v4 += 5;
3388: v5 += 5;
3389: for (n = 0; n<sz-1; n+=2) {
3390: i1 = idx[0];
3391: i2 = idx[1];
3392: idx += 2;
3393: tmp0 = x[i1];
3394: tmp1 = x[i2];
3395: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3396: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3397: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3398: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3399: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3400: }
3401: if (n == sz-1) {
3402: tmp0 = x[*idx];
3403: sum1 -= v1[0] * tmp0;
3404: sum2 -= v2[0] * tmp0;
3405: sum3 -= v3[0] * tmp0;
3406: sum4 -= v4[0] * tmp0;
3407: sum5 -= v5[0] * tmp0;
3408: }
3409: x[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3410: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3411: x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3412: x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3413: x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3414: break;
3415: default:
3416: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3417: }
3418: }
3419: xb = t;
3420: PetscLogFlops(2.0*a->nz); /* undercounts diag inverse */
3421: } else xb = b;
3423: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3425: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3426: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3427: ibdiag -= sizes[i]*sizes[i];
3429: /* set RHS */
3430: if (xb == b) {
3431: /* whole (old way) */
3432: sz = ii[row+1] - ii[row];
3433: idx = a->j + ii[row];
3434: switch (sizes[i]) {
3435: case 5:
3436: v5 = a->a + ii[row-4];
3437: case 4: /* fall through */
3438: v4 = a->a + ii[row-3];
3439: case 3:
3440: v3 = a->a + ii[row-2];
3441: case 2:
3442: v2 = a->a + ii[row-1];
3443: case 1:
3444: v1 = a->a + ii[row];
3445: break;
3446: default:
3447: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3448: }
3449: } else {
3450: /* upper, no diag */
3451: sz = ii[row+1] - diag[row] - 1;
3452: idx = a->j + diag[row] + 1;
3453: switch (sizes[i]) {
3454: case 5:
3455: v5 = a->a + diag[row-4] + 5;
3456: case 4: /* fall through */
3457: v4 = a->a + diag[row-3] + 4;
3458: case 3:
3459: v3 = a->a + diag[row-2] + 3;
3460: case 2:
3461: v2 = a->a + diag[row-1] + 2;
3462: case 1:
3463: v1 = a->a + diag[row] + 1;
3464: }
3465: }
3466: /* set sum */
3467: switch (sizes[i]) {
3468: case 5:
3469: sum5 = xb[row-4];
3470: case 4: /* fall through */
3471: sum4 = xb[row-3];
3472: case 3:
3473: sum3 = xb[row-2];
3474: case 2:
3475: sum2 = xb[row-1];
3476: case 1:
3477: /* note that sum1 is associated with the last row */
3478: sum1 = xb[row];
3479: }
3480: /* do sums */
3481: for (n = 0; n<sz-1; n+=2) {
3482: i1 = idx[0];
3483: i2 = idx[1];
3484: idx += 2;
3485: tmp0 = x[i1];
3486: tmp1 = x[i2];
3487: switch (sizes[i]) {
3488: case 5:
3489: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3490: case 4: /* fall through */
3491: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3492: case 3:
3493: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3494: case 2:
3495: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3496: case 1:
3497: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3498: }
3499: }
3500: /* ragged edge */
3501: if (n == sz-1) {
3502: tmp0 = x[*idx];
3503: switch (sizes[i]) {
3504: case 5:
3505: sum5 -= *v5*tmp0;
3506: case 4: /* fall through */
3507: sum4 -= *v4*tmp0;
3508: case 3:
3509: sum3 -= *v3*tmp0;
3510: case 2:
3511: sum2 -= *v2*tmp0;
3512: case 1:
3513: sum1 -= *v1*tmp0;
3514: }
3515: }
3516: /* update */
3517: if (xb == b) {
3518: /* whole (old way) w/ diag */
3519: switch (sizes[i]) {
3520: case 5:
3521: x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3522: x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3523: x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3524: x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3525: x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3526: break;
3527: case 4:
3528: x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3529: x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3530: x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3531: x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3532: break;
3533: case 3:
3534: x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3535: x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3536: x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3537: break;
3538: case 2:
3539: x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3540: x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3541: break;
3542: case 1:
3543: x[row--] += sum1*(*ibdiag);
3544: break;
3545: }
3546: } else {
3547: /* no diag so set = */
3548: switch (sizes[i]) {
3549: case 5:
3550: x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3551: x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3552: x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3553: x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3554: x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3555: break;
3556: case 4:
3557: x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3558: x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3559: x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3560: x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3561: break;
3562: case 3:
3563: x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3564: x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3565: x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3566: break;
3567: case 2:
3568: x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3569: x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3570: break;
3571: case 1:
3572: x[row--] = sum1*(*ibdiag);
3573: break;
3574: }
3575: }
3576: }
3577: if (xb == b) {
3578: PetscLogFlops(2.0*a->nz);
3579: } else {
3580: PetscLogFlops(a->nz); /* assumes 1/2 in upper, undercounts diag inverse */
3581: }
3582: }
3583: }
3584: if (flag & SOR_EISENSTAT) {
3585: /*
3586: Apply (U + D)^-1 where D is now the block diagonal
3587: */
3588: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3589: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3590: ibdiag -= sizes[i]*sizes[i];
3591: sz = ii[row+1] - diag[row] - 1;
3592: v1 = a->a + diag[row] + 1;
3593: idx = a->j + diag[row] + 1;
3594: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3595: switch (sizes[i]) {
3596: case 1:
3598: sum1 = b[row];
3599: for (n = 0; n<sz-1; n+=2) {
3600: i1 = idx[0];
3601: i2 = idx[1];
3602: idx += 2;
3603: tmp0 = x[i1];
3604: tmp1 = x[i2];
3605: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3606: }
3608: if (n == sz-1) {
3609: tmp0 = x[*idx];
3610: sum1 -= *v1*tmp0;
3611: }
3612: x[row] = sum1*(*ibdiag);row--;
3613: break;
3615: case 2:
3617: sum1 = b[row];
3618: sum2 = b[row-1];
3619: /* note that sum1 is associated with the second of the two rows */
3620: v2 = a->a + diag[row-1] + 2;
3621: for (n = 0; n<sz-1; n+=2) {
3622: i1 = idx[0];
3623: i2 = idx[1];
3624: idx += 2;
3625: tmp0 = x[i1];
3626: tmp1 = x[i2];
3627: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3628: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3629: }
3631: if (n == sz-1) {
3632: tmp0 = x[*idx];
3633: sum1 -= *v1*tmp0;
3634: sum2 -= *v2*tmp0;
3635: }
3636: x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
3637: x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3638: row -= 2;
3639: break;
3640: case 3:
3642: sum1 = b[row];
3643: sum2 = b[row-1];
3644: sum3 = b[row-2];
3645: v2 = a->a + diag[row-1] + 2;
3646: v3 = a->a + diag[row-2] + 3;
3647: for (n = 0; n<sz-1; n+=2) {
3648: i1 = idx[0];
3649: i2 = idx[1];
3650: idx += 2;
3651: tmp0 = x[i1];
3652: tmp1 = x[i2];
3653: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3654: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3655: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3656: }
3658: if (n == sz-1) {
3659: tmp0 = x[*idx];
3660: sum1 -= *v1*tmp0;
3661: sum2 -= *v2*tmp0;
3662: sum3 -= *v3*tmp0;
3663: }
3664: x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3665: x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3666: x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3667: row -= 3;
3668: break;
3669: case 4:
3671: sum1 = b[row];
3672: sum2 = b[row-1];
3673: sum3 = b[row-2];
3674: sum4 = b[row-3];
3675: v2 = a->a + diag[row-1] + 2;
3676: v3 = a->a + diag[row-2] + 3;
3677: v4 = a->a + diag[row-3] + 4;
3678: for (n = 0; n<sz-1; n+=2) {
3679: i1 = idx[0];
3680: i2 = idx[1];
3681: idx += 2;
3682: tmp0 = x[i1];
3683: tmp1 = x[i2];
3684: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3685: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3686: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3687: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3688: }
3690: if (n == sz-1) {
3691: tmp0 = x[*idx];
3692: sum1 -= *v1*tmp0;
3693: sum2 -= *v2*tmp0;
3694: sum3 -= *v3*tmp0;
3695: sum4 -= *v4*tmp0;
3696: }
3697: x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3698: x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3699: x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3700: x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3701: row -= 4;
3702: break;
3703: case 5:
3705: sum1 = b[row];
3706: sum2 = b[row-1];
3707: sum3 = b[row-2];
3708: sum4 = b[row-3];
3709: sum5 = b[row-4];
3710: v2 = a->a + diag[row-1] + 2;
3711: v3 = a->a + diag[row-2] + 3;
3712: v4 = a->a + diag[row-3] + 4;
3713: v5 = a->a + diag[row-4] + 5;
3714: for (n = 0; n<sz-1; n+=2) {
3715: i1 = idx[0];
3716: i2 = idx[1];
3717: idx += 2;
3718: tmp0 = x[i1];
3719: tmp1 = x[i2];
3720: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3721: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3722: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3723: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3724: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3725: }
3727: if (n == sz-1) {
3728: tmp0 = x[*idx];
3729: sum1 -= *v1*tmp0;
3730: sum2 -= *v2*tmp0;
3731: sum3 -= *v3*tmp0;
3732: sum4 -= *v4*tmp0;
3733: sum5 -= *v5*tmp0;
3734: }
3735: x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3736: x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3737: x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3738: x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3739: x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3740: row -= 5;
3741: break;
3742: default:
3743: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3744: }
3745: }
3746: PetscLogFlops(a->nz);
3748: /*
3749: t = b - D x where D is the block diagonal
3750: */
3751: cnt = 0;
3752: for (i=0, row=0; i<m; i++) {
3753: switch (sizes[i]) {
3754: case 1:
3755: t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3756: break;
3757: case 2:
3758: x1 = x[row]; x2 = x[row+1];
3759: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3760: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3761: t[row] = b[row] - tmp1;
3762: t[row+1] = b[row+1] - tmp2; row += 2;
3763: cnt += 4;
3764: break;
3765: case 3:
3766: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2];
3767: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3768: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3769: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3770: t[row] = b[row] - tmp1;
3771: t[row+1] = b[row+1] - tmp2;
3772: t[row+2] = b[row+2] - tmp3; row += 3;
3773: cnt += 9;
3774: break;
3775: case 4:
3776: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3777: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3778: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3779: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3780: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3781: t[row] = b[row] - tmp1;
3782: t[row+1] = b[row+1] - tmp2;
3783: t[row+2] = b[row+2] - tmp3;
3784: t[row+3] = b[row+3] - tmp4; row += 4;
3785: cnt += 16;
3786: break;
3787: case 5:
3788: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3789: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3790: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3791: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3792: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3793: tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3794: t[row] = b[row] - tmp1;
3795: t[row+1] = b[row+1] - tmp2;
3796: t[row+2] = b[row+2] - tmp3;
3797: t[row+3] = b[row+3] - tmp4;
3798: t[row+4] = b[row+4] - tmp5;row += 5;
3799: cnt += 25;
3800: break;
3801: default:
3802: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3803: }
3804: }
3805: PetscLogFlops(m);
3809: /*
3810: Apply (L + D)^-1 where D is the block diagonal
3811: */
3812: for (i=0, row=0; i<m; i++) {
3813: sz = diag[row] - ii[row];
3814: v1 = a->a + ii[row];
3815: idx = a->j + ii[row];
3816: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3817: switch (sizes[i]) {
3818: case 1:
3820: sum1 = t[row];
3821: for (n = 0; n<sz-1; n+=2) {
3822: i1 = idx[0];
3823: i2 = idx[1];
3824: idx += 2;
3825: tmp0 = t[i1];
3826: tmp1 = t[i2];
3827: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3828: }
3830: if (n == sz-1) {
3831: tmp0 = t[*idx];
3832: sum1 -= *v1 * tmp0;
3833: }
3834: x[row] += t[row] = sum1*(*ibdiag++); row++;
3835: break;
3836: case 2:
3837: v2 = a->a + ii[row+1];
3838: sum1 = t[row];
3839: sum2 = t[row+1];
3840: for (n = 0; n<sz-1; n+=2) {
3841: i1 = idx[0];
3842: i2 = idx[1];
3843: idx += 2;
3844: tmp0 = t[i1];
3845: tmp1 = t[i2];
3846: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3847: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3848: }
3850: if (n == sz-1) {
3851: tmp0 = t[*idx];
3852: sum1 -= v1[0] * tmp0;
3853: sum2 -= v2[0] * tmp0;
3854: }
3855: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3856: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3857: ibdiag += 4; row += 2;
3858: break;
3859: case 3:
3860: v2 = a->a + ii[row+1];
3861: v3 = a->a + ii[row+2];
3862: sum1 = t[row];
3863: sum2 = t[row+1];
3864: sum3 = t[row+2];
3865: for (n = 0; n<sz-1; n+=2) {
3866: i1 = idx[0];
3867: i2 = idx[1];
3868: idx += 2;
3869: tmp0 = t[i1];
3870: tmp1 = t[i2];
3871: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3872: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3873: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3874: }
3876: if (n == sz-1) {
3877: tmp0 = t[*idx];
3878: sum1 -= v1[0] * tmp0;
3879: sum2 -= v2[0] * tmp0;
3880: sum3 -= v3[0] * tmp0;
3881: }
3882: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3883: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3884: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3885: ibdiag += 9; row += 3;
3886: break;
3887: case 4:
3888: v2 = a->a + ii[row+1];
3889: v3 = a->a + ii[row+2];
3890: v4 = a->a + ii[row+3];
3891: sum1 = t[row];
3892: sum2 = t[row+1];
3893: sum3 = t[row+2];
3894: sum4 = t[row+3];
3895: for (n = 0; n<sz-1; n+=2) {
3896: i1 = idx[0];
3897: i2 = idx[1];
3898: idx += 2;
3899: tmp0 = t[i1];
3900: tmp1 = t[i2];
3901: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3902: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3903: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3904: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3905: }
3907: if (n == sz-1) {
3908: tmp0 = t[*idx];
3909: sum1 -= v1[0] * tmp0;
3910: sum2 -= v2[0] * tmp0;
3911: sum3 -= v3[0] * tmp0;
3912: sum4 -= v4[0] * tmp0;
3913: }
3914: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3915: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3916: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3917: x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3918: ibdiag += 16; row += 4;
3919: break;
3920: case 5:
3921: v2 = a->a + ii[row+1];
3922: v3 = a->a + ii[row+2];
3923: v4 = a->a + ii[row+3];
3924: v5 = a->a + ii[row+4];
3925: sum1 = t[row];
3926: sum2 = t[row+1];
3927: sum3 = t[row+2];
3928: sum4 = t[row+3];
3929: sum5 = t[row+4];
3930: for (n = 0; n<sz-1; n+=2) {
3931: i1 = idx[0];
3932: i2 = idx[1];
3933: idx += 2;
3934: tmp0 = t[i1];
3935: tmp1 = t[i2];
3936: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3937: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3938: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3939: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3940: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3941: }
3943: if (n == sz-1) {
3944: tmp0 = t[*idx];
3945: sum1 -= v1[0] * tmp0;
3946: sum2 -= v2[0] * tmp0;
3947: sum3 -= v3[0] * tmp0;
3948: sum4 -= v4[0] * tmp0;
3949: sum5 -= v5[0] * tmp0;
3950: }
3951: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3952: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3953: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3954: x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3955: x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3956: ibdiag += 25; row += 5;
3957: break;
3958: default:
3959: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3960: }
3961: }
3962: PetscLogFlops(a->nz);
3963: }
3964: VecRestoreArray(xx,&x);
3965: VecRestoreArrayRead(bb,&b);
3966: return(0);
3967: }
3969: PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3970: {
3971: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3972: PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3973: const MatScalar *bdiag = a->inode.bdiag;
3974: const PetscScalar *b;
3975: PetscErrorCode ierr;
3976: PetscInt m = a->inode.node_count,cnt = 0,i,row;
3977: const PetscInt *sizes = a->inode.size;
3980: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
3981: VecGetArray(xx,&x);
3982: VecGetArrayRead(bb,&b);
3983: cnt = 0;
3984: for (i=0, row=0; i<m; i++) {
3985: switch (sizes[i]) {
3986: case 1:
3987: x[row] = b[row]*bdiag[cnt++];row++;
3988: break;
3989: case 2:
3990: x1 = b[row]; x2 = b[row+1];
3991: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3992: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3993: x[row++] = tmp1;
3994: x[row++] = tmp2;
3995: cnt += 4;
3996: break;
3997: case 3:
3998: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2];
3999: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
4000: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
4001: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
4002: x[row++] = tmp1;
4003: x[row++] = tmp2;
4004: x[row++] = tmp3;
4005: cnt += 9;
4006: break;
4007: case 4:
4008: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
4009: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
4010: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
4011: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
4012: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
4013: x[row++] = tmp1;
4014: x[row++] = tmp2;
4015: x[row++] = tmp3;
4016: x[row++] = tmp4;
4017: cnt += 16;
4018: break;
4019: case 5:
4020: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
4021: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
4022: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
4023: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
4024: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4025: tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4026: x[row++] = tmp1;
4027: x[row++] = tmp2;
4028: x[row++] = tmp3;
4029: x[row++] = tmp4;
4030: x[row++] = tmp5;
4031: cnt += 25;
4032: break;
4033: default:
4034: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
4035: }
4036: }
4037: PetscLogFlops(2.0*cnt);
4038: VecRestoreArray(xx,&x);
4039: VecRestoreArrayRead(bb,&b);
4040: return(0);
4041: }
4043: static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4044: {
4045: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4048: a->inode.node_count = 0;
4049: a->inode.use = PETSC_FALSE;
4050: a->inode.checked = PETSC_FALSE;
4051: a->inode.mat_nonzerostate = -1;
4052: A->ops->getrowij = MatGetRowIJ_SeqAIJ;
4053: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ;
4054: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ;
4055: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ;
4056: A->ops->coloringpatch = NULL;
4057: A->ops->multdiagonalblock = NULL;
4058: if (A->factortype) {
4059: A->ops->solve = MatSolve_SeqAIJ_inplace;
4060: }
4061: return(0);
4062: }
4064: /*
4065: samestructure indicates that the matrix has not changed its nonzero structure so we
4066: do not need to recompute the inodes
4067: */
4068: PetscErrorCode MatSeqAIJCheckInode(Mat A)
4069: {
4070: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4072: PetscInt i,j,m,nzx,nzy,*ns,node_count,blk_size;
4073: PetscBool flag;
4074: const PetscInt *idx,*idy,*ii;
4077: if (!a->inode.use) {
4078: MatSeqAIJ_Inode_ResetOps(A);
4079: PetscFree(a->inode.size);
4080: return(0);
4081: }
4082: if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) return(0);
4084: m = A->rmap->n;
4085: if (!a->inode.size) { PetscMalloc1(m+1,&a->inode.size); }
4086: ns = a->inode.size;
4088: i = 0;
4089: node_count = 0;
4090: idx = a->j;
4091: ii = a->i;
4092: while (i < m) { /* For each row */
4093: nzx = ii[i+1] - ii[i]; /* Number of nonzeros */
4094: /* Limits the number of elements in a node to 'a->inode.limit' */
4095: for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4096: nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */
4097: if (nzy != nzx) break;
4098: idy += nzx; /* Same nonzero pattern */
4099: PetscArraycmp(idx,idy,nzx,&flag);
4100: if (!flag) break;
4101: }
4102: ns[node_count++] = blk_size;
4103: idx += blk_size*nzx;
4104: i = j;
4105: }
4107: /* If not enough inodes found,, do not use inode version of the routines */
4108: if (!m || node_count > .8*m) {
4109: MatSeqAIJ_Inode_ResetOps(A);
4110: PetscFree(a->inode.size);
4111: PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4112: } else {
4113: if (!A->factortype) {
4114: A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4115: if (A->rmap->n == A->cmap->n) {
4116: A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4117: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4118: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4119: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4120: A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4121: }
4122: } else {
4123: A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4124: }
4125: a->inode.node_count = node_count;
4126: PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4127: }
4128: a->inode.checked = PETSC_TRUE;
4129: a->inode.mat_nonzerostate = A->nonzerostate;
4130: return(0);
4131: }
4133: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4134: {
4135: Mat B =*C;
4136: Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4138: PetscInt m=A->rmap->n;
4141: c->inode.use = a->inode.use;
4142: c->inode.limit = a->inode.limit;
4143: c->inode.max_limit = a->inode.max_limit;
4144: c->inode.checked = PETSC_FALSE;
4145: c->inode.size = NULL;
4146: c->inode.node_count = 0;
4147: c->inode.ibdiagvalid = PETSC_FALSE;
4148: c->inode.ibdiag = NULL;
4149: c->inode.bdiag = NULL;
4150: c->inode.mat_nonzerostate = -1;
4151: if (a->inode.use) {
4152: if (a->inode.checked && a->inode.size) {
4153: PetscMalloc1(m+1,&c->inode.size);
4154: PetscArraycpy(c->inode.size,a->inode.size,m+1);
4156: c->inode.checked = PETSC_TRUE;
4157: c->inode.node_count = a->inode.node_count;
4158: c->inode.mat_nonzerostate = (*C)->nonzerostate;
4159: }
4160: /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4161: if (!B->factortype) {
4162: B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4163: B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4164: B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4165: B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4166: B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4167: B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4168: } else {
4169: B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4170: }
4171: }
4172: return(0);
4173: }
4175: PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4176: {
4177: PetscInt k;
4178: const PetscInt *vi;
4181: vi = aj + ai[row];
4182: for (k=0; k<nzl; k++) cols[k] = vi[k];
4183: vi = aj + adiag[row];
4184: cols[nzl] = vi[0];
4185: vi = aj + adiag[row+1]+1;
4186: for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4187: return(0);
4188: }
4189: /*
4190: MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4191: Modified from MatSeqAIJCheckInode().
4193: Input Parameters:
4194: . Mat A - ILU or LU matrix factor
4196: */
4197: PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4198: {
4199: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4201: PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4202: PetscInt *cols1,*cols2,*ns;
4203: const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4204: PetscBool flag;
4207: if (!a->inode.use) return(0);
4208: if (a->inode.checked) return(0);
4210: m = A->rmap->n;
4211: if (a->inode.size) ns = a->inode.size;
4212: else {
4213: PetscMalloc1(m+1,&ns);
4214: }
4216: i = 0;
4217: node_count = 0;
4218: PetscMalloc2(m,&cols1,m,&cols2);
4219: while (i < m) { /* For each row */
4220: nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */
4221: nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4222: nzx = nzl1 + nzu1 + 1;
4223: MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4225: /* Limits the number of elements in a node to 'a->inode.limit' */
4226: for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4227: nzl2 = ai[j+1] - ai[j];
4228: nzu2 = adiag[j] - adiag[j+1] - 1;
4229: nzy = nzl2 + nzu2 + 1;
4230: if (nzy != nzx) break;
4231: MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
4232: PetscArraycmp(cols1,cols2,nzx,&flag);
4233: if (!flag) break;
4234: }
4235: ns[node_count++] = blk_size;
4236: i = j;
4237: }
4238: PetscFree2(cols1,cols2);
4239: /* If not enough inodes found,, do not use inode version of the routines */
4240: if (!m || node_count > .8*m) {
4241: PetscFree(ns);
4243: a->inode.node_count = 0;
4244: a->inode.size = NULL;
4245: a->inode.use = PETSC_FALSE;
4247: PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4248: } else {
4249: A->ops->mult = NULL;
4250: A->ops->sor = NULL;
4251: A->ops->multadd = NULL;
4252: A->ops->getrowij = NULL;
4253: A->ops->restorerowij = NULL;
4254: A->ops->getcolumnij = NULL;
4255: A->ops->restorecolumnij = NULL;
4256: A->ops->coloringpatch = NULL;
4257: A->ops->multdiagonalblock = NULL;
4258: a->inode.node_count = node_count;
4259: a->inode.size = ns;
4261: PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4262: }
4263: a->inode.checked = PETSC_TRUE;
4264: return(0);
4265: }
4267: PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4268: {
4269: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4272: a->inode.ibdiagvalid = PETSC_FALSE;
4273: return(0);
4274: }
4276: /*
4277: This is really ugly. if inodes are used this replaces the
4278: permutations with ones that correspond to rows/cols of the matrix
4279: rather then inode blocks
4280: */
4281: PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4282: {
4286: PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));
4287: return(0);
4288: }
4290: PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4291: {
4292: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4294: PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4295: const PetscInt *ridx,*cidx;
4296: PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx;
4297: PetscInt nslim_col,*ns_col;
4298: IS ris = *rperm,cis = *cperm;
4301: if (!a->inode.size) return(0); /* no inodes so return */
4302: if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */
4304: MatCreateColInode_Private(A,&nslim_col,&ns_col);
4305: PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);
4306: PetscMalloc2(m,&permr,n,&permc);
4308: ISGetIndices(ris,&ridx);
4309: ISGetIndices(cis,&cidx);
4311: /* Form the inode structure for the rows of permuted matric using inv perm*/
4312: for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4314: /* Construct the permutations for rows*/
4315: for (i=0,row = 0; i<nslim_row; ++i) {
4316: indx = ridx[i];
4317: start_val = tns[indx];
4318: end_val = tns[indx + 1];
4319: for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4320: }
4322: /* Form the inode structure for the columns of permuted matrix using inv perm*/
4323: for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4325: /* Construct permutations for columns */
4326: for (i=0,col=0; i<nslim_col; ++i) {
4327: indx = cidx[i];
4328: start_val = tns[indx];
4329: end_val = tns[indx + 1];
4330: for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4331: }
4333: ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);
4334: ISSetPermutation(*rperm);
4335: ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);
4336: ISSetPermutation(*cperm);
4338: ISRestoreIndices(ris,&ridx);
4339: ISRestoreIndices(cis,&cidx);
4341: PetscFree(ns_col);
4342: PetscFree2(permr,permc);
4343: ISDestroy(&cis);
4344: ISDestroy(&ris);
4345: PetscFree(tns);
4346: return(0);
4347: }
4349: /*@C
4350: MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4352: Not Collective
4354: Input Parameter:
4355: . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4357: Output Parameter:
4358: + node_count - no of inodes present in the matrix.
4359: . sizes - an array of size node_count,with sizes of each inode.
4360: - limit - the max size used to generate the inodes.
4362: Level: advanced
4364: Notes:
4365: This routine returns some internal storage information
4366: of the matrix, it is intended to be used by advanced users.
4367: It should be called after the matrix is assembled.
4368: The contents of the sizes[] array should not be changed.
4369: NULL may be passed for information not requested.
4371: .seealso: MatGetInfo()
4372: @*/
4373: PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4374: {
4375: PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4378: if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4379: PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);
4380: if (f) {
4381: (*f)(A,node_count,sizes,limit);
4382: }
4383: return(0);
4384: }
4386: PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4387: {
4388: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4391: if (node_count) *node_count = a->inode.node_count;
4392: if (sizes) *sizes = a->inode.size;
4393: if (limit) *limit = a->inode.limit;
4394: return(0);
4395: }