Actual source code: inode.c
petsc-3.13.6 2020-09-29
2: /*
3: This file provides high performance routines for the Inode format (compressed sparse row)
4: by taking advantage of rows with identical nonzero structure (I-nodes).
5: */
6: #include <../src/mat/impls/aij/seq/aij.h>
8: static PetscErrorCode MatCreateColInode_Private(Mat A,PetscInt *size,PetscInt **ns)
9: {
10: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
12: PetscInt i,count,m,n,min_mn,*ns_row,*ns_col;
15: n = A->cmap->n;
16: m = A->rmap->n;
17: ns_row = a->inode.size;
19: min_mn = (m < n) ? m : n;
20: if (!ns) {
21: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
22: for (; count+1 < n; count++,i++) ;
23: if (count < n) {
24: i++;
25: }
26: *size = i;
27: return(0);
28: }
29: PetscMalloc1(n+1,&ns_col);
31: /* Use the same row structure wherever feasible. */
32: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
33: ns_col[i] = ns_row[i];
34: }
36: /* if m < n; pad up the remainder with inode_limit */
37: for (; count+1 < n; count++,i++) {
38: ns_col[i] = 1;
39: }
40: /* The last node is the odd ball. padd it up with the remaining rows; */
41: if (count < n) {
42: ns_col[i] = n - count;
43: i++;
44: } else if (count > n) {
45: /* Adjust for the over estimation */
46: ns_col[i-1] += n - count;
47: }
48: *size = i;
49: *ns = ns_col;
50: return(0);
51: }
54: /*
55: This builds symmetric version of nonzero structure,
56: */
57: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
58: {
59: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
61: PetscInt *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
62: PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
63: const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;
66: nslim_row = a->inode.node_count;
67: m = A->rmap->n;
68: n = A->cmap->n;
69: if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
71: /* Use the row_inode as column_inode */
72: nslim_col = nslim_row;
73: ns_col = ns_row;
75: /* allocate space for reformated inode structure */
76: PetscMalloc2(nslim_col+1,&tns,n+1,&tvc);
77: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
79: for (i1=0,col=0; i1<nslim_col; ++i1) {
80: nsz = ns_col[i1];
81: for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
82: }
83: /* allocate space for row pointers */
84: PetscCalloc1(nslim_row+1,&ia);
85: *iia = ia;
86: PetscMalloc1(nslim_row+1,&work);
88: /* determine the number of columns in each row */
89: ia[0] = oshift;
90: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
92: j = aj + ai[row] + ishift;
93: jmax = aj + ai[row+1] + ishift;
94: if (j==jmax) continue; /* empty row */
95: col = *j++ + ishift;
96: i2 = tvc[col];
97: while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
98: ia[i1+1]++;
99: ia[i2+1]++;
100: i2++; /* Start col of next node */
101: while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
102: i2 = tvc[col];
103: }
104: if (i2 == i1) ia[i2+1]++; /* now the diagonal element */
105: }
107: /* shift ia[i] to point to next row */
108: for (i1=1; i1<nslim_row+1; i1++) {
109: row = ia[i1-1];
110: ia[i1] += row;
111: work[i1-1] = row - oshift;
112: }
114: /* allocate space for column pointers */
115: nz = ia[nslim_row] + (!ishift);
116: PetscMalloc1(nz,&ja);
117: *jja = ja;
119: /* loop over lower triangular part putting into ja */
120: for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
121: j = aj + ai[row] + ishift;
122: jmax = aj + ai[row+1] + ishift;
123: if (j==jmax) continue; /* empty row */
124: col = *j++ + ishift;
125: i2 = tvc[col];
126: while (i2<i1 && j<jmax) {
127: ja[work[i2]++] = i1 + oshift;
128: ja[work[i1]++] = i2 + oshift;
129: ++i2;
130: while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
131: i2 = tvc[col];
132: }
133: if (i2 == i1) ja[work[i1]++] = i2 + oshift;
135: }
136: PetscFree(work);
137: PetscFree2(tns,tvc);
138: return(0);
139: }
141: /*
142: This builds nonsymmetric version of nonzero structure,
143: */
144: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
145: {
146: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
148: PetscInt *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
149: PetscInt *tns,*tvc,nsz,i1,i2;
150: const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;
153: nslim_row = a->inode.node_count;
154: n = A->cmap->n;
156: /* Create The column_inode for this matrix */
157: MatCreateColInode_Private(A,&nslim_col,&ns_col);
159: /* allocate space for reformated column_inode structure */
160: PetscMalloc2(nslim_col +1,&tns,n + 1,&tvc);
161: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
163: for (i1=0,col=0; i1<nslim_col; ++i1) {
164: nsz = ns_col[i1];
165: for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
166: }
167: /* allocate space for row pointers */
168: PetscCalloc1(nslim_row+1,&ia);
169: *iia = ia;
170: PetscMalloc1(nslim_row+1,&work);
172: /* determine the number of columns in each row */
173: ia[0] = oshift;
174: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
175: j = aj + ai[row] + ishift;
176: nz = ai[row+1] - ai[row];
177: if (!nz) continue; /* empty row */
178: col = *j++ + ishift;
179: i2 = tvc[col];
180: while (nz-- > 0) { /* off-diagonal elemets */
181: ia[i1+1]++;
182: i2++; /* Start col of next node */
183: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
184: if (nz > 0) i2 = tvc[col];
185: }
186: }
188: /* shift ia[i] to point to next row */
189: for (i1=1; i1<nslim_row+1; i1++) {
190: row = ia[i1-1];
191: ia[i1] += row;
192: work[i1-1] = row - oshift;
193: }
195: /* allocate space for column pointers */
196: nz = ia[nslim_row] + (!ishift);
197: PetscMalloc1(nz,&ja);
198: *jja = ja;
200: /* loop over matrix putting into ja */
201: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
202: j = aj + ai[row] + ishift;
203: nz = ai[row+1] - ai[row];
204: if (!nz) continue; /* empty row */
205: col = *j++ + ishift;
206: i2 = tvc[col];
207: while (nz-- > 0) {
208: ja[work[i1]++] = i2 + oshift;
209: ++i2;
210: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
211: if (nz > 0) i2 = tvc[col];
212: }
213: }
214: PetscFree(ns_col);
215: PetscFree(work);
216: PetscFree2(tns,tvc);
217: return(0);
218: }
220: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
221: {
222: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
226: *n = a->inode.node_count;
227: if (!ia) return(0);
228: if (!blockcompressed) {
229: MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);
230: } else if (symmetric) {
231: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
232: } else {
233: MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
234: }
235: return(0);
236: }
238: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
239: {
243: if (!ia) return(0);
245: if (!blockcompressed) {
246: MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);
247: } else {
248: PetscFree(*ia);
249: PetscFree(*ja);
250: }
251: return(0);
252: }
254: /* ----------------------------------------------------------- */
256: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
257: {
258: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
260: PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
261: PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
264: nslim_row = a->inode.node_count;
265: n = A->cmap->n;
267: /* Create The column_inode for this matrix */
268: MatCreateColInode_Private(A,&nslim_col,&ns_col);
270: /* allocate space for reformated column_inode structure */
271: PetscMalloc2(nslim_col + 1,&tns,n + 1,&tvc);
272: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
274: for (i1=0,col=0; i1<nslim_col; ++i1) {
275: nsz = ns_col[i1];
276: for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
277: }
278: /* allocate space for column pointers */
279: PetscCalloc1(nslim_col+1,&ia);
280: *iia = ia;
281: PetscMalloc1(nslim_col+1,&work);
283: /* determine the number of columns in each row */
284: ia[0] = oshift;
285: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
286: j = aj + ai[row] + ishift;
287: col = *j++ + ishift;
288: i2 = tvc[col];
289: nz = ai[row+1] - ai[row];
290: while (nz-- > 0) { /* off-diagonal elemets */
291: /* ia[i1+1]++; */
292: ia[i2+1]++;
293: i2++;
294: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
295: if (nz > 0) i2 = tvc[col];
296: }
297: }
299: /* shift ia[i] to point to next col */
300: for (i1=1; i1<nslim_col+1; i1++) {
301: col = ia[i1-1];
302: ia[i1] += col;
303: work[i1-1] = col - oshift;
304: }
306: /* allocate space for column pointers */
307: nz = ia[nslim_col] + (!ishift);
308: PetscMalloc1(nz,&ja);
309: *jja = ja;
311: /* loop over matrix putting into ja */
312: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
313: j = aj + ai[row] + ishift;
314: col = *j++ + ishift;
315: i2 = tvc[col];
316: nz = ai[row+1] - ai[row];
317: while (nz-- > 0) {
318: /* ja[work[i1]++] = i2 + oshift; */
319: ja[work[i2]++] = i1 + oshift;
320: i2++;
321: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
322: if (nz > 0) i2 = tvc[col];
323: }
324: }
325: PetscFree(ns_col);
326: PetscFree(work);
327: PetscFree2(tns,tvc);
328: return(0);
329: }
331: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
332: {
336: MatCreateColInode_Private(A,n,NULL);
337: if (!ia) return(0);
339: if (!blockcompressed) {
340: MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);
341: } else if (symmetric) {
342: /* Since the indices are symmetric it does'nt matter */
343: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
344: } else {
345: MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
346: }
347: return(0);
348: }
350: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
351: {
355: if (!ia) return(0);
356: if (!blockcompressed) {
357: MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);
358: } else {
359: PetscFree(*ia);
360: PetscFree(*ja);
361: }
362: return(0);
363: }
365: /* ----------------------------------------------------------- */
367: static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
368: {
369: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
370: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
371: PetscScalar *y;
372: const PetscScalar *x;
373: const MatScalar *v1,*v2,*v3,*v4,*v5;
374: PetscErrorCode ierr;
375: PetscInt i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
376: const PetscInt *idx,*ns,*ii;
378: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
379: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
380: #endif
383: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
384: node_max = a->inode.node_count;
385: ns = a->inode.size; /* Node Size array */
386: VecGetArrayRead(xx,&x);
387: VecGetArray(yy,&y);
388: idx = a->j;
389: v1 = a->a;
390: ii = a->i;
392: for (i = 0,row = 0; i< node_max; ++i) {
393: nsz = ns[i];
394: n = ii[1] - ii[0];
395: nonzerorow += (n>0)*nsz;
396: ii += nsz;
397: PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */
398: PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */
399: sz = n; /* No of non zeros in this row */
400: /* Switch on the size of Node */
401: switch (nsz) { /* Each loop in 'case' is unrolled */
402: case 1:
403: sum1 = 0.;
405: for (n = 0; n< sz-1; n+=2) {
406: i1 = idx[0]; /* The instructions are ordered to */
407: i2 = idx[1]; /* make the compiler's job easy */
408: idx += 2;
409: tmp0 = x[i1];
410: tmp1 = x[i2];
411: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
412: }
414: if (n == sz-1) { /* Take care of the last nonzero */
415: tmp0 = x[*idx++];
416: sum1 += *v1++ *tmp0;
417: }
418: y[row++]=sum1;
419: break;
420: case 2:
421: sum1 = 0.;
422: sum2 = 0.;
423: v2 = v1 + n;
425: for (n = 0; n< sz-1; n+=2) {
426: i1 = idx[0];
427: i2 = idx[1];
428: idx += 2;
429: tmp0 = x[i1];
430: tmp1 = x[i2];
431: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
432: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
433: }
434: if (n == sz-1) {
435: tmp0 = x[*idx++];
436: sum1 += *v1++ * tmp0;
437: sum2 += *v2++ * tmp0;
438: }
439: y[row++]=sum1;
440: y[row++]=sum2;
441: v1 =v2; /* Since the next block to be processed starts there*/
442: idx +=sz;
443: break;
444: case 3:
445: sum1 = 0.;
446: sum2 = 0.;
447: sum3 = 0.;
448: v2 = v1 + n;
449: v3 = v2 + n;
451: for (n = 0; n< sz-1; n+=2) {
452: i1 = idx[0];
453: i2 = idx[1];
454: idx += 2;
455: tmp0 = x[i1];
456: tmp1 = x[i2];
457: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
458: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
459: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
460: }
461: if (n == sz-1) {
462: tmp0 = x[*idx++];
463: sum1 += *v1++ * tmp0;
464: sum2 += *v2++ * tmp0;
465: sum3 += *v3++ * tmp0;
466: }
467: y[row++]=sum1;
468: y[row++]=sum2;
469: y[row++]=sum3;
470: v1 =v3; /* Since the next block to be processed starts there*/
471: idx +=2*sz;
472: break;
473: case 4:
474: sum1 = 0.;
475: sum2 = 0.;
476: sum3 = 0.;
477: sum4 = 0.;
478: v2 = v1 + n;
479: v3 = v2 + n;
480: v4 = v3 + n;
482: for (n = 0; n< sz-1; n+=2) {
483: i1 = idx[0];
484: i2 = idx[1];
485: idx += 2;
486: tmp0 = x[i1];
487: tmp1 = x[i2];
488: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
489: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
490: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
491: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
492: }
493: if (n == sz-1) {
494: tmp0 = x[*idx++];
495: sum1 += *v1++ * tmp0;
496: sum2 += *v2++ * tmp0;
497: sum3 += *v3++ * tmp0;
498: sum4 += *v4++ * tmp0;
499: }
500: y[row++]=sum1;
501: y[row++]=sum2;
502: y[row++]=sum3;
503: y[row++]=sum4;
504: v1 =v4; /* Since the next block to be processed starts there*/
505: idx +=3*sz;
506: break;
507: case 5:
508: sum1 = 0.;
509: sum2 = 0.;
510: sum3 = 0.;
511: sum4 = 0.;
512: sum5 = 0.;
513: v2 = v1 + n;
514: v3 = v2 + n;
515: v4 = v3 + n;
516: v5 = v4 + n;
518: for (n = 0; n<sz-1; n+=2) {
519: i1 = idx[0];
520: i2 = idx[1];
521: idx += 2;
522: tmp0 = x[i1];
523: tmp1 = x[i2];
524: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
525: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
526: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
527: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
528: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
529: }
530: if (n == sz-1) {
531: tmp0 = x[*idx++];
532: sum1 += *v1++ * tmp0;
533: sum2 += *v2++ * tmp0;
534: sum3 += *v3++ * tmp0;
535: sum4 += *v4++ * tmp0;
536: sum5 += *v5++ * tmp0;
537: }
538: y[row++]=sum1;
539: y[row++]=sum2;
540: y[row++]=sum3;
541: y[row++]=sum4;
542: y[row++]=sum5;
543: v1 =v5; /* Since the next block to be processed starts there */
544: idx +=4*sz;
545: break;
546: default:
547: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
548: }
549: }
550: VecRestoreArrayRead(xx,&x);
551: VecRestoreArray(yy,&y);
552: PetscLogFlops(2.0*a->nz - nonzerorow);
553: return(0);
554: }
555: /* ----------------------------------------------------------- */
556: /* Almost same code as the MatMult_SeqAIJ_Inode() */
557: static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
558: {
559: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
560: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
561: const MatScalar *v1,*v2,*v3,*v4,*v5;
562: const PetscScalar *x;
563: PetscScalar *y,*z,*zt;
564: PetscErrorCode ierr;
565: PetscInt i1,i2,n,i,row,node_max,nsz,sz;
566: const PetscInt *idx,*ns,*ii;
569: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
570: node_max = a->inode.node_count;
571: ns = a->inode.size; /* Node Size array */
573: VecGetArrayRead(xx,&x);
574: VecGetArrayPair(zz,yy,&z,&y);
575: zt = z;
577: idx = a->j;
578: v1 = a->a;
579: ii = a->i;
581: for (i = 0,row = 0; i< node_max; ++i) {
582: nsz = ns[i];
583: n = ii[1] - ii[0];
584: ii += nsz;
585: sz = n; /* No of non zeros in this row */
586: /* Switch on the size of Node */
587: switch (nsz) { /* Each loop in 'case' is unrolled */
588: case 1:
589: sum1 = *zt++;
591: for (n = 0; n< sz-1; n+=2) {
592: i1 = idx[0]; /* The instructions are ordered to */
593: i2 = idx[1]; /* make the compiler's job easy */
594: idx += 2;
595: tmp0 = x[i1];
596: tmp1 = x[i2];
597: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
598: }
600: if (n == sz-1) { /* Take care of the last nonzero */
601: tmp0 = x[*idx++];
602: sum1 += *v1++ * tmp0;
603: }
604: y[row++]=sum1;
605: break;
606: case 2:
607: sum1 = *zt++;
608: sum2 = *zt++;
609: v2 = v1 + n;
611: for (n = 0; n< sz-1; n+=2) {
612: i1 = idx[0];
613: i2 = idx[1];
614: idx += 2;
615: tmp0 = x[i1];
616: tmp1 = x[i2];
617: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
618: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
619: }
620: if (n == sz-1) {
621: tmp0 = x[*idx++];
622: sum1 += *v1++ * tmp0;
623: sum2 += *v2++ * tmp0;
624: }
625: y[row++]=sum1;
626: y[row++]=sum2;
627: v1 =v2; /* Since the next block to be processed starts there*/
628: idx +=sz;
629: break;
630: case 3:
631: sum1 = *zt++;
632: sum2 = *zt++;
633: sum3 = *zt++;
634: v2 = v1 + n;
635: v3 = v2 + n;
637: for (n = 0; n< sz-1; n+=2) {
638: i1 = idx[0];
639: i2 = idx[1];
640: idx += 2;
641: tmp0 = x[i1];
642: tmp1 = x[i2];
643: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
644: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
645: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
646: }
647: if (n == sz-1) {
648: tmp0 = x[*idx++];
649: sum1 += *v1++ * tmp0;
650: sum2 += *v2++ * tmp0;
651: sum3 += *v3++ * tmp0;
652: }
653: y[row++]=sum1;
654: y[row++]=sum2;
655: y[row++]=sum3;
656: v1 =v3; /* Since the next block to be processed starts there*/
657: idx +=2*sz;
658: break;
659: case 4:
660: sum1 = *zt++;
661: sum2 = *zt++;
662: sum3 = *zt++;
663: sum4 = *zt++;
664: v2 = v1 + n;
665: v3 = v2 + n;
666: v4 = v3 + n;
668: for (n = 0; n< sz-1; n+=2) {
669: i1 = idx[0];
670: i2 = idx[1];
671: idx += 2;
672: tmp0 = x[i1];
673: tmp1 = x[i2];
674: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
675: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
676: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
677: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
678: }
679: if (n == sz-1) {
680: tmp0 = x[*idx++];
681: sum1 += *v1++ * tmp0;
682: sum2 += *v2++ * tmp0;
683: sum3 += *v3++ * tmp0;
684: sum4 += *v4++ * tmp0;
685: }
686: y[row++]=sum1;
687: y[row++]=sum2;
688: y[row++]=sum3;
689: y[row++]=sum4;
690: v1 =v4; /* Since the next block to be processed starts there*/
691: idx +=3*sz;
692: break;
693: case 5:
694: sum1 = *zt++;
695: sum2 = *zt++;
696: sum3 = *zt++;
697: sum4 = *zt++;
698: sum5 = *zt++;
699: v2 = v1 + n;
700: v3 = v2 + n;
701: v4 = v3 + n;
702: v5 = v4 + n;
704: for (n = 0; n<sz-1; n+=2) {
705: i1 = idx[0];
706: i2 = idx[1];
707: idx += 2;
708: tmp0 = x[i1];
709: tmp1 = x[i2];
710: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
711: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
712: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
713: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
714: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
715: }
716: if (n == sz-1) {
717: tmp0 = x[*idx++];
718: sum1 += *v1++ * tmp0;
719: sum2 += *v2++ * tmp0;
720: sum3 += *v3++ * tmp0;
721: sum4 += *v4++ * tmp0;
722: sum5 += *v5++ * tmp0;
723: }
724: y[row++]=sum1;
725: y[row++]=sum2;
726: y[row++]=sum3;
727: y[row++]=sum4;
728: y[row++]=sum5;
729: v1 =v5; /* Since the next block to be processed starts there */
730: idx +=4*sz;
731: break;
732: default:
733: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
734: }
735: }
736: VecRestoreArrayRead(xx,&x);
737: VecRestoreArrayPair(zz,yy,&z,&y);
738: PetscLogFlops(2.0*a->nz);
739: return(0);
740: }
742: /* ----------------------------------------------------------- */
743: PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
744: {
745: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
746: IS iscol = a->col,isrow = a->row;
747: PetscErrorCode ierr;
748: const PetscInt *r,*c,*rout,*cout;
749: PetscInt i,j,n = A->rmap->n,nz;
750: PetscInt node_max,*ns,row,nsz,aii,i0,i1;
751: const PetscInt *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
752: PetscScalar *x,*tmp,*tmps,tmp0,tmp1;
753: PetscScalar sum1,sum2,sum3,sum4,sum5;
754: const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
755: const PetscScalar *b;
758: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
759: node_max = a->inode.node_count;
760: ns = a->inode.size; /* Node Size array */
762: VecGetArrayRead(bb,&b);
763: VecGetArrayWrite(xx,&x);
764: tmp = a->solve_work;
766: ISGetIndices(isrow,&rout); r = rout;
767: ISGetIndices(iscol,&cout); c = cout + (n-1);
769: /* forward solve the lower triangular */
770: tmps = tmp;
771: aa = a_a;
772: aj = a_j;
773: ad = a->diag;
775: for (i = 0,row = 0; i< node_max; ++i) {
776: nsz = ns[i];
777: aii = ai[row];
778: v1 = aa + aii;
779: vi = aj + aii;
780: nz = ad[row]- aii;
781: if (i < node_max-1) {
782: /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
783: * but our indexing to determine it's size could. */
784: PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
785: /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
786: PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
787: /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
788: }
790: switch (nsz) { /* Each loop in 'case' is unrolled */
791: case 1:
792: sum1 = b[*r++];
793: for (j=0; j<nz-1; j+=2) {
794: i0 = vi[0];
795: i1 = vi[1];
796: vi +=2;
797: tmp0 = tmps[i0];
798: tmp1 = tmps[i1];
799: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
800: }
801: if (j == nz-1) {
802: tmp0 = tmps[*vi++];
803: sum1 -= *v1++ *tmp0;
804: }
805: tmp[row++]=sum1;
806: break;
807: case 2:
808: sum1 = b[*r++];
809: sum2 = b[*r++];
810: v2 = aa + ai[row+1];
812: for (j=0; j<nz-1; j+=2) {
813: i0 = vi[0];
814: i1 = vi[1];
815: vi +=2;
816: tmp0 = tmps[i0];
817: tmp1 = tmps[i1];
818: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
819: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
820: }
821: if (j == nz-1) {
822: tmp0 = tmps[*vi++];
823: sum1 -= *v1++ *tmp0;
824: sum2 -= *v2++ *tmp0;
825: }
826: sum2 -= *v2++ *sum1;
827: tmp[row++]=sum1;
828: tmp[row++]=sum2;
829: break;
830: case 3:
831: sum1 = b[*r++];
832: sum2 = b[*r++];
833: sum3 = b[*r++];
834: v2 = aa + ai[row+1];
835: v3 = aa + ai[row+2];
837: for (j=0; j<nz-1; j+=2) {
838: i0 = vi[0];
839: i1 = vi[1];
840: vi +=2;
841: tmp0 = tmps[i0];
842: tmp1 = tmps[i1];
843: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
844: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
845: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
846: }
847: if (j == nz-1) {
848: tmp0 = tmps[*vi++];
849: sum1 -= *v1++ *tmp0;
850: sum2 -= *v2++ *tmp0;
851: sum3 -= *v3++ *tmp0;
852: }
853: sum2 -= *v2++ * sum1;
854: sum3 -= *v3++ * sum1;
855: sum3 -= *v3++ * sum2;
857: tmp[row++]=sum1;
858: tmp[row++]=sum2;
859: tmp[row++]=sum3;
860: break;
862: case 4:
863: sum1 = b[*r++];
864: sum2 = b[*r++];
865: sum3 = b[*r++];
866: sum4 = b[*r++];
867: v2 = aa + ai[row+1];
868: v3 = aa + ai[row+2];
869: v4 = aa + ai[row+3];
871: for (j=0; j<nz-1; j+=2) {
872: i0 = vi[0];
873: i1 = vi[1];
874: vi +=2;
875: tmp0 = tmps[i0];
876: tmp1 = tmps[i1];
877: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
878: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
879: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
880: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
881: }
882: if (j == nz-1) {
883: tmp0 = tmps[*vi++];
884: sum1 -= *v1++ *tmp0;
885: sum2 -= *v2++ *tmp0;
886: sum3 -= *v3++ *tmp0;
887: sum4 -= *v4++ *tmp0;
888: }
889: sum2 -= *v2++ * sum1;
890: sum3 -= *v3++ * sum1;
891: sum4 -= *v4++ * sum1;
892: sum3 -= *v3++ * sum2;
893: sum4 -= *v4++ * sum2;
894: sum4 -= *v4++ * sum3;
896: tmp[row++]=sum1;
897: tmp[row++]=sum2;
898: tmp[row++]=sum3;
899: tmp[row++]=sum4;
900: break;
901: case 5:
902: sum1 = b[*r++];
903: sum2 = b[*r++];
904: sum3 = b[*r++];
905: sum4 = b[*r++];
906: sum5 = b[*r++];
907: v2 = aa + ai[row+1];
908: v3 = aa + ai[row+2];
909: v4 = aa + ai[row+3];
910: v5 = aa + ai[row+4];
912: for (j=0; j<nz-1; j+=2) {
913: i0 = vi[0];
914: i1 = vi[1];
915: vi +=2;
916: tmp0 = tmps[i0];
917: tmp1 = tmps[i1];
918: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
919: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
920: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
921: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
922: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
923: }
924: if (j == nz-1) {
925: tmp0 = tmps[*vi++];
926: sum1 -= *v1++ *tmp0;
927: sum2 -= *v2++ *tmp0;
928: sum3 -= *v3++ *tmp0;
929: sum4 -= *v4++ *tmp0;
930: sum5 -= *v5++ *tmp0;
931: }
933: sum2 -= *v2++ * sum1;
934: sum3 -= *v3++ * sum1;
935: sum4 -= *v4++ * sum1;
936: sum5 -= *v5++ * sum1;
937: sum3 -= *v3++ * sum2;
938: sum4 -= *v4++ * sum2;
939: sum5 -= *v5++ * sum2;
940: sum4 -= *v4++ * sum3;
941: sum5 -= *v5++ * sum3;
942: sum5 -= *v5++ * sum4;
944: tmp[row++]=sum1;
945: tmp[row++]=sum2;
946: tmp[row++]=sum3;
947: tmp[row++]=sum4;
948: tmp[row++]=sum5;
949: break;
950: default:
951: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
952: }
953: }
954: /* backward solve the upper triangular */
955: for (i=node_max -1,row = n-1; i>=0; i--) {
956: nsz = ns[i];
957: aii = ai[row+1] -1;
958: v1 = aa + aii;
959: vi = aj + aii;
960: nz = aii- ad[row];
961: switch (nsz) { /* Each loop in 'case' is unrolled */
962: case 1:
963: sum1 = tmp[row];
965: for (j=nz; j>1; j-=2) {
966: vi -=2;
967: i0 = vi[2];
968: i1 = vi[1];
969: tmp0 = tmps[i0];
970: tmp1 = tmps[i1];
971: v1 -= 2;
972: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
973: }
974: if (j==1) {
975: tmp0 = tmps[*vi--];
976: sum1 -= *v1-- * tmp0;
977: }
978: x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
979: break;
980: case 2:
981: sum1 = tmp[row];
982: sum2 = tmp[row -1];
983: v2 = aa + ai[row]-1;
984: for (j=nz; j>1; j-=2) {
985: vi -=2;
986: i0 = vi[2];
987: i1 = vi[1];
988: tmp0 = tmps[i0];
989: tmp1 = tmps[i1];
990: v1 -= 2;
991: v2 -= 2;
992: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
993: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
994: }
995: if (j==1) {
996: tmp0 = tmps[*vi--];
997: sum1 -= *v1-- * tmp0;
998: sum2 -= *v2-- * tmp0;
999: }
1001: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1002: sum2 -= *v2-- * tmp0;
1003: x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1004: break;
1005: case 3:
1006: sum1 = tmp[row];
1007: sum2 = tmp[row -1];
1008: sum3 = tmp[row -2];
1009: v2 = aa + ai[row]-1;
1010: v3 = aa + ai[row -1]-1;
1011: for (j=nz; j>1; j-=2) {
1012: vi -=2;
1013: i0 = vi[2];
1014: i1 = vi[1];
1015: tmp0 = tmps[i0];
1016: tmp1 = tmps[i1];
1017: v1 -= 2;
1018: v2 -= 2;
1019: v3 -= 2;
1020: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1021: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1022: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1023: }
1024: if (j==1) {
1025: tmp0 = tmps[*vi--];
1026: sum1 -= *v1-- * tmp0;
1027: sum2 -= *v2-- * tmp0;
1028: sum3 -= *v3-- * tmp0;
1029: }
1030: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1031: sum2 -= *v2-- * tmp0;
1032: sum3 -= *v3-- * tmp0;
1033: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1034: sum3 -= *v3-- * tmp0;
1035: x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1037: break;
1038: case 4:
1039: sum1 = tmp[row];
1040: sum2 = tmp[row -1];
1041: sum3 = tmp[row -2];
1042: sum4 = tmp[row -3];
1043: v2 = aa + ai[row]-1;
1044: v3 = aa + ai[row -1]-1;
1045: v4 = aa + ai[row -2]-1;
1047: for (j=nz; j>1; j-=2) {
1048: vi -=2;
1049: i0 = vi[2];
1050: i1 = vi[1];
1051: tmp0 = tmps[i0];
1052: tmp1 = tmps[i1];
1053: v1 -= 2;
1054: v2 -= 2;
1055: v3 -= 2;
1056: v4 -= 2;
1057: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1058: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1059: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1060: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1061: }
1062: if (j==1) {
1063: tmp0 = tmps[*vi--];
1064: sum1 -= *v1-- * tmp0;
1065: sum2 -= *v2-- * tmp0;
1066: sum3 -= *v3-- * tmp0;
1067: sum4 -= *v4-- * tmp0;
1068: }
1070: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1071: sum2 -= *v2-- * tmp0;
1072: sum3 -= *v3-- * tmp0;
1073: sum4 -= *v4-- * tmp0;
1074: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1075: sum3 -= *v3-- * tmp0;
1076: sum4 -= *v4-- * tmp0;
1077: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1078: sum4 -= *v4-- * tmp0;
1079: x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1080: break;
1081: case 5:
1082: sum1 = tmp[row];
1083: sum2 = tmp[row -1];
1084: sum3 = tmp[row -2];
1085: sum4 = tmp[row -3];
1086: sum5 = tmp[row -4];
1087: v2 = aa + ai[row]-1;
1088: v3 = aa + ai[row -1]-1;
1089: v4 = aa + ai[row -2]-1;
1090: v5 = aa + ai[row -3]-1;
1091: for (j=nz; j>1; j-=2) {
1092: vi -= 2;
1093: i0 = vi[2];
1094: i1 = vi[1];
1095: tmp0 = tmps[i0];
1096: tmp1 = tmps[i1];
1097: v1 -= 2;
1098: v2 -= 2;
1099: v3 -= 2;
1100: v4 -= 2;
1101: v5 -= 2;
1102: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1103: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1104: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1105: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1106: sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1107: }
1108: if (j==1) {
1109: tmp0 = tmps[*vi--];
1110: sum1 -= *v1-- * tmp0;
1111: sum2 -= *v2-- * tmp0;
1112: sum3 -= *v3-- * tmp0;
1113: sum4 -= *v4-- * tmp0;
1114: sum5 -= *v5-- * tmp0;
1115: }
1117: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1118: sum2 -= *v2-- * tmp0;
1119: sum3 -= *v3-- * tmp0;
1120: sum4 -= *v4-- * tmp0;
1121: sum5 -= *v5-- * tmp0;
1122: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1123: sum3 -= *v3-- * tmp0;
1124: sum4 -= *v4-- * tmp0;
1125: sum5 -= *v5-- * tmp0;
1126: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1127: sum4 -= *v4-- * tmp0;
1128: sum5 -= *v5-- * tmp0;
1129: tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1130: sum5 -= *v5-- * tmp0;
1131: x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1132: break;
1133: default:
1134: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1135: }
1136: }
1137: ISRestoreIndices(isrow,&rout);
1138: ISRestoreIndices(iscol,&cout);
1139: VecRestoreArrayRead(bb,&b);
1140: VecRestoreArrayWrite(xx,&x);
1141: PetscLogFlops(2.0*a->nz - A->cmap->n);
1142: return(0);
1143: }
1145: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1146: {
1147: Mat C =B;
1148: Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1149: IS isrow = b->row,isicol = b->icol;
1150: PetscErrorCode ierr;
1151: const PetscInt *r,*ic,*ics;
1152: const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1153: PetscInt i,j,k,nz,nzL,row,*pj;
1154: const PetscInt *ajtmp,*bjtmp;
1155: MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1156: const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1157: FactorShiftCtx sctx;
1158: const PetscInt *ddiag;
1159: PetscReal rs;
1160: MatScalar d;
1161: PetscInt inod,nodesz,node_max,col;
1162: const PetscInt *ns;
1163: PetscInt *tmp_vec1,*tmp_vec2,*nsmap;
1166: /* MatPivotSetUp(): initialize shift context sctx */
1167: PetscMemzero(&sctx,sizeof(FactorShiftCtx));
1169: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1170: ddiag = a->diag;
1171: sctx.shift_top = info->zeropivot;
1172: for (i=0; i<n; i++) {
1173: /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1174: d = (aa)[ddiag[i]];
1175: rs = -PetscAbsScalar(d) - PetscRealPart(d);
1176: v = aa+ai[i];
1177: nz = ai[i+1] - ai[i];
1178: for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1179: if (rs>sctx.shift_top) sctx.shift_top = rs;
1180: }
1181: sctx.shift_top *= 1.1;
1182: sctx.nshift_max = 5;
1183: sctx.shift_lo = 0.;
1184: sctx.shift_hi = 1.;
1185: }
1187: ISGetIndices(isrow,&r);
1188: ISGetIndices(isicol,&ic);
1190: PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);
1191: ics = ic;
1193: node_max = a->inode.node_count;
1194: ns = a->inode.size;
1195: if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1197: /* If max inode size > 4, split it into two inodes.*/
1198: /* also map the inode sizes according to the ordering */
1199: PetscMalloc1(n+1,&tmp_vec1);
1200: for (i=0,j=0; i<node_max; ++i,++j) {
1201: if (ns[i] > 4) {
1202: tmp_vec1[j] = 4;
1203: ++j;
1204: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1205: } else {
1206: tmp_vec1[j] = ns[i];
1207: }
1208: }
1209: /* Use the correct node_max */
1210: node_max = j;
1212: /* Now reorder the inode info based on mat re-ordering info */
1213: /* First create a row -> inode_size_array_index map */
1214: PetscMalloc1(n+1,&nsmap);
1215: PetscMalloc1(node_max+1,&tmp_vec2);
1216: for (i=0,row=0; i<node_max; i++) {
1217: nodesz = tmp_vec1[i];
1218: for (j=0; j<nodesz; j++,row++) {
1219: nsmap[row] = i;
1220: }
1221: }
1222: /* Using nsmap, create a reordered ns structure */
1223: for (i=0,j=0; i< node_max; i++) {
1224: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1225: tmp_vec2[i] = nodesz;
1226: j += nodesz;
1227: }
1228: PetscFree(nsmap);
1229: PetscFree(tmp_vec1);
1231: /* Now use the correct ns */
1232: ns = tmp_vec2;
1234: do {
1235: sctx.newshift = PETSC_FALSE;
1236: /* Now loop over each block-row, and do the factorization */
1237: for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1238: nodesz = ns[inod];
1240: switch (nodesz) {
1241: case 1:
1242: /*----------*/
1243: /* zero rtmp1 */
1244: /* L part */
1245: nz = bi[i+1] - bi[i];
1246: bjtmp = bj + bi[i];
1247: for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1249: /* U part */
1250: nz = bdiag[i]-bdiag[i+1];
1251: bjtmp = bj + bdiag[i+1]+1;
1252: for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1254: /* load in initial (unfactored row) */
1255: nz = ai[r[i]+1] - ai[r[i]];
1256: ajtmp = aj + ai[r[i]];
1257: v = aa + ai[r[i]];
1258: for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1260: /* ZeropivotApply() */
1261: rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1263: /* elimination */
1264: bjtmp = bj + bi[i];
1265: row = *bjtmp++;
1266: nzL = bi[i+1] - bi[i];
1267: for (k=0; k < nzL; k++) {
1268: pc = rtmp1 + row;
1269: if (*pc != 0.0) {
1270: pv = b->a + bdiag[row];
1271: mul1 = *pc * (*pv);
1272: *pc = mul1;
1273: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1274: pv = b->a + bdiag[row+1]+1;
1275: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1276: for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1277: PetscLogFlops(1+2.0*nz);
1278: }
1279: row = *bjtmp++;
1280: }
1282: /* finished row so stick it into b->a */
1283: rs = 0.0;
1284: /* L part */
1285: pv = b->a + bi[i];
1286: pj = b->j + bi[i];
1287: nz = bi[i+1] - bi[i];
1288: for (j=0; j<nz; j++) {
1289: pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1290: }
1292: /* U part */
1293: pv = b->a + bdiag[i+1]+1;
1294: pj = b->j + bdiag[i+1]+1;
1295: nz = bdiag[i] - bdiag[i+1]-1;
1296: for (j=0; j<nz; j++) {
1297: pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1298: }
1300: /* Check zero pivot */
1301: sctx.rs = rs;
1302: sctx.pv = rtmp1[i];
1303: MatPivotCheck(B,A,info,&sctx,i);
1304: if (sctx.newshift) break;
1306: /* Mark diagonal and invert diagonal for simplier triangular solves */
1307: pv = b->a + bdiag[i];
1308: *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1309: break;
1311: case 2:
1312: /*----------*/
1313: /* zero rtmp1 and rtmp2 */
1314: /* L part */
1315: nz = bi[i+1] - bi[i];
1316: bjtmp = bj + bi[i];
1317: for (j=0; j<nz; j++) {
1318: col = bjtmp[j];
1319: rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1320: }
1322: /* U part */
1323: nz = bdiag[i]-bdiag[i+1];
1324: bjtmp = bj + bdiag[i+1]+1;
1325: for (j=0; j<nz; j++) {
1326: col = bjtmp[j];
1327: rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1328: }
1330: /* load in initial (unfactored row) */
1331: nz = ai[r[i]+1] - ai[r[i]];
1332: ajtmp = aj + ai[r[i]];
1333: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1334: for (j=0; j<nz; j++) {
1335: col = ics[ajtmp[j]];
1336: rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1337: }
1338: /* ZeropivotApply(): shift the diagonal of the matrix */
1339: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1341: /* elimination */
1342: bjtmp = bj + bi[i];
1343: row = *bjtmp++; /* pivot row */
1344: nzL = bi[i+1] - bi[i];
1345: for (k=0; k < nzL; k++) {
1346: pc1 = rtmp1 + row;
1347: pc2 = rtmp2 + row;
1348: if (*pc1 != 0.0 || *pc2 != 0.0) {
1349: pv = b->a + bdiag[row];
1350: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1351: *pc1 = mul1; *pc2 = mul2;
1353: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1354: pv = b->a + bdiag[row+1]+1;
1355: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1356: for (j=0; j<nz; j++) {
1357: col = pj[j];
1358: rtmp1[col] -= mul1 * pv[j];
1359: rtmp2[col] -= mul2 * pv[j];
1360: }
1361: PetscLogFlops(2+4.0*nz);
1362: }
1363: row = *bjtmp++;
1364: }
1366: /* finished row i; check zero pivot, then stick row i into b->a */
1367: rs = 0.0;
1368: /* L part */
1369: pc1 = b->a + bi[i];
1370: pj = b->j + bi[i];
1371: nz = bi[i+1] - bi[i];
1372: for (j=0; j<nz; j++) {
1373: col = pj[j];
1374: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1375: }
1376: /* U part */
1377: pc1 = b->a + bdiag[i+1]+1;
1378: pj = b->j + bdiag[i+1]+1;
1379: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1380: for (j=0; j<nz; j++) {
1381: col = pj[j];
1382: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1383: }
1385: sctx.rs = rs;
1386: sctx.pv = rtmp1[i];
1387: MatPivotCheck(B,A,info,&sctx,i);
1388: if (sctx.newshift) break;
1389: pc1 = b->a + bdiag[i]; /* Mark diagonal */
1390: *pc1 = 1.0/sctx.pv;
1392: /* Now take care of diagonal 2x2 block. */
1393: pc2 = rtmp2 + i;
1394: if (*pc2 != 0.0) {
1395: mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1396: *pc2 = mul1; /* insert L entry */
1397: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1398: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1399: for (j=0; j<nz; j++) {
1400: col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1401: }
1402: PetscLogFlops(1+2.0*nz);
1403: }
1405: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1406: rs = 0.0;
1407: /* L part */
1408: pc2 = b->a + bi[i+1];
1409: pj = b->j + bi[i+1];
1410: nz = bi[i+2] - bi[i+1];
1411: for (j=0; j<nz; j++) {
1412: col = pj[j];
1413: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1414: }
1415: /* U part */
1416: pc2 = b->a + bdiag[i+2]+1;
1417: pj = b->j + bdiag[i+2]+1;
1418: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1419: for (j=0; j<nz; j++) {
1420: col = pj[j];
1421: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1422: }
1424: sctx.rs = rs;
1425: sctx.pv = rtmp2[i+1];
1426: MatPivotCheck(B,A,info,&sctx,i+1);
1427: if (sctx.newshift) break;
1428: pc2 = b->a + bdiag[i+1];
1429: *pc2 = 1.0/sctx.pv;
1430: break;
1432: case 3:
1433: /*----------*/
1434: /* zero rtmp */
1435: /* L part */
1436: nz = bi[i+1] - bi[i];
1437: bjtmp = bj + bi[i];
1438: for (j=0; j<nz; j++) {
1439: col = bjtmp[j];
1440: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1441: }
1443: /* U part */
1444: nz = bdiag[i]-bdiag[i+1];
1445: bjtmp = bj + bdiag[i+1]+1;
1446: for (j=0; j<nz; j++) {
1447: col = bjtmp[j];
1448: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1449: }
1451: /* load in initial (unfactored row) */
1452: nz = ai[r[i]+1] - ai[r[i]];
1453: ajtmp = aj + ai[r[i]];
1454: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1455: for (j=0; j<nz; j++) {
1456: col = ics[ajtmp[j]];
1457: rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1458: }
1459: /* ZeropivotApply(): shift the diagonal of the matrix */
1460: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1462: /* elimination */
1463: bjtmp = bj + bi[i];
1464: row = *bjtmp++; /* pivot row */
1465: nzL = bi[i+1] - bi[i];
1466: for (k=0; k < nzL; k++) {
1467: pc1 = rtmp1 + row;
1468: pc2 = rtmp2 + row;
1469: pc3 = rtmp3 + row;
1470: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1471: pv = b->a + bdiag[row];
1472: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1473: *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1475: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1476: pv = b->a + bdiag[row+1]+1;
1477: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1478: for (j=0; j<nz; j++) {
1479: col = pj[j];
1480: rtmp1[col] -= mul1 * pv[j];
1481: rtmp2[col] -= mul2 * pv[j];
1482: rtmp3[col] -= mul3 * pv[j];
1483: }
1484: PetscLogFlops(3+6.0*nz);
1485: }
1486: row = *bjtmp++;
1487: }
1489: /* finished row i; check zero pivot, then stick row i into b->a */
1490: rs = 0.0;
1491: /* L part */
1492: pc1 = b->a + bi[i];
1493: pj = b->j + bi[i];
1494: nz = bi[i+1] - bi[i];
1495: for (j=0; j<nz; j++) {
1496: col = pj[j];
1497: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1498: }
1499: /* U part */
1500: pc1 = b->a + bdiag[i+1]+1;
1501: pj = b->j + bdiag[i+1]+1;
1502: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1503: for (j=0; j<nz; j++) {
1504: col = pj[j];
1505: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1506: }
1508: sctx.rs = rs;
1509: sctx.pv = rtmp1[i];
1510: MatPivotCheck(B,A,info,&sctx,i);
1511: if (sctx.newshift) break;
1512: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1513: *pc1 = 1.0/sctx.pv;
1515: /* Now take care of 1st column of diagonal 3x3 block. */
1516: pc2 = rtmp2 + i;
1517: pc3 = rtmp3 + i;
1518: if (*pc2 != 0.0 || *pc3 != 0.0) {
1519: mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1520: mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1521: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1522: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1523: for (j=0; j<nz; j++) {
1524: col = pj[j];
1525: rtmp2[col] -= mul2 * rtmp1[col];
1526: rtmp3[col] -= mul3 * rtmp1[col];
1527: }
1528: PetscLogFlops(2+4.0*nz);
1529: }
1531: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1532: rs = 0.0;
1533: /* L part */
1534: pc2 = b->a + bi[i+1];
1535: pj = b->j + bi[i+1];
1536: nz = bi[i+2] - bi[i+1];
1537: for (j=0; j<nz; j++) {
1538: col = pj[j];
1539: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1540: }
1541: /* U part */
1542: pc2 = b->a + bdiag[i+2]+1;
1543: pj = b->j + bdiag[i+2]+1;
1544: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1545: for (j=0; j<nz; j++) {
1546: col = pj[j];
1547: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1548: }
1550: sctx.rs = rs;
1551: sctx.pv = rtmp2[i+1];
1552: MatPivotCheck(B,A,info,&sctx,i+1);
1553: if (sctx.newshift) break;
1554: pc2 = b->a + bdiag[i+1];
1555: *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1557: /* Now take care of 2nd column of diagonal 3x3 block. */
1558: pc3 = rtmp3 + i+1;
1559: if (*pc3 != 0.0) {
1560: mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1561: pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */
1562: nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1563: for (j=0; j<nz; j++) {
1564: col = pj[j];
1565: rtmp3[col] -= mul3 * rtmp2[col];
1566: }
1567: PetscLogFlops(1+2.0*nz);
1568: }
1570: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1571: rs = 0.0;
1572: /* L part */
1573: pc3 = b->a + bi[i+2];
1574: pj = b->j + bi[i+2];
1575: nz = bi[i+3] - bi[i+2];
1576: for (j=0; j<nz; j++) {
1577: col = pj[j];
1578: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1579: }
1580: /* U part */
1581: pc3 = b->a + bdiag[i+3]+1;
1582: pj = b->j + bdiag[i+3]+1;
1583: nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1584: for (j=0; j<nz; j++) {
1585: col = pj[j];
1586: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1587: }
1589: sctx.rs = rs;
1590: sctx.pv = rtmp3[i+2];
1591: MatPivotCheck(B,A,info,&sctx,i+2);
1592: if (sctx.newshift) break;
1593: pc3 = b->a + bdiag[i+2];
1594: *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1595: break;
1596: case 4:
1597: /*----------*/
1598: /* zero rtmp */
1599: /* L part */
1600: nz = bi[i+1] - bi[i];
1601: bjtmp = bj + bi[i];
1602: for (j=0; j<nz; j++) {
1603: col = bjtmp[j];
1604: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1605: }
1607: /* U part */
1608: nz = bdiag[i]-bdiag[i+1];
1609: bjtmp = bj + bdiag[i+1]+1;
1610: for (j=0; j<nz; j++) {
1611: col = bjtmp[j];
1612: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1613: }
1615: /* load in initial (unfactored row) */
1616: nz = ai[r[i]+1] - ai[r[i]];
1617: ajtmp = aj + ai[r[i]];
1618: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1619: for (j=0; j<nz; j++) {
1620: col = ics[ajtmp[j]];
1621: rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1622: }
1623: /* ZeropivotApply(): shift the diagonal of the matrix */
1624: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1626: /* elimination */
1627: bjtmp = bj + bi[i];
1628: row = *bjtmp++; /* pivot row */
1629: nzL = bi[i+1] - bi[i];
1630: for (k=0; k < nzL; k++) {
1631: pc1 = rtmp1 + row;
1632: pc2 = rtmp2 + row;
1633: pc3 = rtmp3 + row;
1634: pc4 = rtmp4 + row;
1635: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1636: pv = b->a + bdiag[row];
1637: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1638: *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1640: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1641: pv = b->a + bdiag[row+1]+1;
1642: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1643: for (j=0; j<nz; j++) {
1644: col = pj[j];
1645: rtmp1[col] -= mul1 * pv[j];
1646: rtmp2[col] -= mul2 * pv[j];
1647: rtmp3[col] -= mul3 * pv[j];
1648: rtmp4[col] -= mul4 * pv[j];
1649: }
1650: PetscLogFlops(4+8.0*nz);
1651: }
1652: row = *bjtmp++;
1653: }
1655: /* finished row i; check zero pivot, then stick row i into b->a */
1656: rs = 0.0;
1657: /* L part */
1658: pc1 = b->a + bi[i];
1659: pj = b->j + bi[i];
1660: nz = bi[i+1] - bi[i];
1661: for (j=0; j<nz; j++) {
1662: col = pj[j];
1663: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1664: }
1665: /* U part */
1666: pc1 = b->a + bdiag[i+1]+1;
1667: pj = b->j + bdiag[i+1]+1;
1668: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1669: for (j=0; j<nz; j++) {
1670: col = pj[j];
1671: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1672: }
1674: sctx.rs = rs;
1675: sctx.pv = rtmp1[i];
1676: MatPivotCheck(B,A,info,&sctx,i);
1677: if (sctx.newshift) break;
1678: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1679: *pc1 = 1.0/sctx.pv;
1681: /* Now take care of 1st column of diagonal 4x4 block. */
1682: pc2 = rtmp2 + i;
1683: pc3 = rtmp3 + i;
1684: pc4 = rtmp4 + i;
1685: if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1686: mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1687: mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1688: mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1689: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1690: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1691: for (j=0; j<nz; j++) {
1692: col = pj[j];
1693: rtmp2[col] -= mul2 * rtmp1[col];
1694: rtmp3[col] -= mul3 * rtmp1[col];
1695: rtmp4[col] -= mul4 * rtmp1[col];
1696: }
1697: PetscLogFlops(3+6.0*nz);
1698: }
1700: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1701: rs = 0.0;
1702: /* L part */
1703: pc2 = b->a + bi[i+1];
1704: pj = b->j + bi[i+1];
1705: nz = bi[i+2] - bi[i+1];
1706: for (j=0; j<nz; j++) {
1707: col = pj[j];
1708: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1709: }
1710: /* U part */
1711: pc2 = b->a + bdiag[i+2]+1;
1712: pj = b->j + bdiag[i+2]+1;
1713: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1714: for (j=0; j<nz; j++) {
1715: col = pj[j];
1716: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1717: }
1719: sctx.rs = rs;
1720: sctx.pv = rtmp2[i+1];
1721: MatPivotCheck(B,A,info,&sctx,i+1);
1722: if (sctx.newshift) break;
1723: pc2 = b->a + bdiag[i+1];
1724: *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1726: /* Now take care of 2nd column of diagonal 4x4 block. */
1727: pc3 = rtmp3 + i+1;
1728: pc4 = rtmp4 + i+1;
1729: if (*pc3 != 0.0 || *pc4 != 0.0) {
1730: mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1731: mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1732: pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */
1733: nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1734: for (j=0; j<nz; j++) {
1735: col = pj[j];
1736: rtmp3[col] -= mul3 * rtmp2[col];
1737: rtmp4[col] -= mul4 * rtmp2[col];
1738: }
1739: PetscLogFlops(4.0*nz);
1740: }
1742: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1743: rs = 0.0;
1744: /* L part */
1745: pc3 = b->a + bi[i+2];
1746: pj = b->j + bi[i+2];
1747: nz = bi[i+3] - bi[i+2];
1748: for (j=0; j<nz; j++) {
1749: col = pj[j];
1750: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1751: }
1752: /* U part */
1753: pc3 = b->a + bdiag[i+3]+1;
1754: pj = b->j + bdiag[i+3]+1;
1755: nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1756: for (j=0; j<nz; j++) {
1757: col = pj[j];
1758: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1759: }
1761: sctx.rs = rs;
1762: sctx.pv = rtmp3[i+2];
1763: MatPivotCheck(B,A,info,&sctx,i+2);
1764: if (sctx.newshift) break;
1765: pc3 = b->a + bdiag[i+2];
1766: *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1768: /* Now take care of 3rd column of diagonal 4x4 block. */
1769: pc4 = rtmp4 + i+2;
1770: if (*pc4 != 0.0) {
1771: mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1772: pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */
1773: nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1774: for (j=0; j<nz; j++) {
1775: col = pj[j];
1776: rtmp4[col] -= mul4 * rtmp3[col];
1777: }
1778: PetscLogFlops(1+2.0*nz);
1779: }
1781: /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1782: rs = 0.0;
1783: /* L part */
1784: pc4 = b->a + bi[i+3];
1785: pj = b->j + bi[i+3];
1786: nz = bi[i+4] - bi[i+3];
1787: for (j=0; j<nz; j++) {
1788: col = pj[j];
1789: pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1790: }
1791: /* U part */
1792: pc4 = b->a + bdiag[i+4]+1;
1793: pj = b->j + bdiag[i+4]+1;
1794: nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1795: for (j=0; j<nz; j++) {
1796: col = pj[j];
1797: pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1798: }
1800: sctx.rs = rs;
1801: sctx.pv = rtmp4[i+3];
1802: MatPivotCheck(B,A,info,&sctx,i+3);
1803: if (sctx.newshift) break;
1804: pc4 = b->a + bdiag[i+3];
1805: *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1806: break;
1808: default:
1809: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1810: }
1811: if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1812: i += nodesz; /* Update the row */
1813: }
1815: /* MatPivotRefine() */
1816: if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1817: /*
1818: * if no shift in this attempt & shifting & started shifting & can refine,
1819: * then try lower shift
1820: */
1821: sctx.shift_hi = sctx.shift_fraction;
1822: sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1823: sctx.shift_amount = sctx.shift_fraction * sctx.shift_top;
1824: sctx.newshift = PETSC_TRUE;
1825: sctx.nshift++;
1826: }
1827: } while (sctx.newshift);
1829: PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);
1830: PetscFree(tmp_vec2);
1831: ISRestoreIndices(isicol,&ic);
1832: ISRestoreIndices(isrow,&r);
1834: if (b->inode.size) {
1835: C->ops->solve = MatSolve_SeqAIJ_Inode;
1836: } else {
1837: C->ops->solve = MatSolve_SeqAIJ;
1838: }
1839: C->ops->solveadd = MatSolveAdd_SeqAIJ;
1840: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ;
1841: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1842: C->ops->matsolve = MatMatSolve_SeqAIJ;
1843: C->assembled = PETSC_TRUE;
1844: C->preallocated = PETSC_TRUE;
1846: PetscLogFlops(C->cmap->n);
1848: /* MatShiftView(A,info,&sctx) */
1849: if (sctx.nshift) {
1850: if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1851: PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
1852: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1853: PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
1854: } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1855: PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);
1856: }
1857: }
1858: return(0);
1859: }
1861: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1862: {
1863: Mat C = B;
1864: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1865: IS iscol = b->col,isrow = b->row,isicol = b->icol;
1866: PetscErrorCode ierr;
1867: const PetscInt *r,*ic,*c,*ics;
1868: PetscInt n = A->rmap->n,*bi = b->i;
1869: PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1870: PetscInt i,j,idx,*bd = b->diag,node_max,nodesz;
1871: PetscInt *ai = a->i,*aj = a->j;
1872: PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1873: PetscScalar mul1,mul2,mul3,tmp;
1874: MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1875: const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1876: PetscReal rs=0.0;
1877: FactorShiftCtx sctx;
1880: sctx.shift_top = 0;
1881: sctx.nshift_max = 0;
1882: sctx.shift_lo = 0;
1883: sctx.shift_hi = 0;
1884: sctx.shift_fraction = 0;
1886: /* if both shift schemes are chosen by user, only use info->shiftpd */
1887: if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1888: sctx.shift_top = 0;
1889: for (i=0; i<n; i++) {
1890: /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1891: rs = 0.0;
1892: ajtmp = aj + ai[i];
1893: rtmp1 = aa + ai[i];
1894: nz = ai[i+1] - ai[i];
1895: for (j=0; j<nz; j++) {
1896: if (*ajtmp != i) {
1897: rs += PetscAbsScalar(*rtmp1++);
1898: } else {
1899: rs -= PetscRealPart(*rtmp1++);
1900: }
1901: ajtmp++;
1902: }
1903: if (rs>sctx.shift_top) sctx.shift_top = rs;
1904: }
1905: if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1906: sctx.shift_top *= 1.1;
1907: sctx.nshift_max = 5;
1908: sctx.shift_lo = 0.;
1909: sctx.shift_hi = 1.;
1910: }
1911: sctx.shift_amount = 0;
1912: sctx.nshift = 0;
1914: ISGetIndices(isrow,&r);
1915: ISGetIndices(iscol,&c);
1916: ISGetIndices(isicol,&ic);
1917: PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);
1918: ics = ic;
1920: node_max = a->inode.node_count;
1921: ns = a->inode.size;
1922: if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1924: /* If max inode size > 3, split it into two inodes.*/
1925: /* also map the inode sizes according to the ordering */
1926: PetscMalloc1(n+1,&tmp_vec1);
1927: for (i=0,j=0; i<node_max; ++i,++j) {
1928: if (ns[i]>3) {
1929: tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */
1930: ++j;
1931: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1932: } else {
1933: tmp_vec1[j] = ns[i];
1934: }
1935: }
1936: /* Use the correct node_max */
1937: node_max = j;
1939: /* Now reorder the inode info based on mat re-ordering info */
1940: /* First create a row -> inode_size_array_index map */
1941: PetscMalloc2(n+1,&nsmap,node_max+1,&tmp_vec2);
1942: for (i=0,row=0; i<node_max; i++) {
1943: nodesz = tmp_vec1[i];
1944: for (j=0; j<nodesz; j++,row++) {
1945: nsmap[row] = i;
1946: }
1947: }
1948: /* Using nsmap, create a reordered ns structure */
1949: for (i=0,j=0; i< node_max; i++) {
1950: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1951: tmp_vec2[i] = nodesz;
1952: j += nodesz;
1953: }
1954: PetscFree2(nsmap,tmp_vec1);
1955: /* Now use the correct ns */
1956: ns = tmp_vec2;
1958: do {
1959: sctx.newshift = PETSC_FALSE;
1960: /* Now loop over each block-row, and do the factorization */
1961: for (i=0,row=0; i<node_max; i++) {
1962: nodesz = ns[i];
1963: nz = bi[row+1] - bi[row];
1964: bjtmp = bj + bi[row];
1966: switch (nodesz) {
1967: case 1:
1968: for (j=0; j<nz; j++) {
1969: idx = bjtmp[j];
1970: rtmp11[idx] = 0.0;
1971: }
1973: /* load in initial (unfactored row) */
1974: idx = r[row];
1975: nz_tmp = ai[idx+1] - ai[idx];
1976: ajtmp = aj + ai[idx];
1977: v1 = aa + ai[idx];
1979: for (j=0; j<nz_tmp; j++) {
1980: idx = ics[ajtmp[j]];
1981: rtmp11[idx] = v1[j];
1982: }
1983: rtmp11[ics[r[row]]] += sctx.shift_amount;
1985: prow = *bjtmp++;
1986: while (prow < row) {
1987: pc1 = rtmp11 + prow;
1988: if (*pc1 != 0.0) {
1989: pv = ba + bd[prow];
1990: pj = nbj + bd[prow];
1991: mul1 = *pc1 * *pv++;
1992: *pc1 = mul1;
1993: nz_tmp = bi[prow+1] - bd[prow] - 1;
1994: PetscLogFlops(1+2.0*nz_tmp);
1995: for (j=0; j<nz_tmp; j++) {
1996: tmp = pv[j];
1997: idx = pj[j];
1998: rtmp11[idx] -= mul1 * tmp;
1999: }
2000: }
2001: prow = *bjtmp++;
2002: }
2003: pj = bj + bi[row];
2004: pc1 = ba + bi[row];
2006: sctx.pv = rtmp11[row];
2007: rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2008: rs = 0.0;
2009: for (j=0; j<nz; j++) {
2010: idx = pj[j];
2011: pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2012: if (idx != row) rs += PetscAbsScalar(pc1[j]);
2013: }
2014: sctx.rs = rs;
2015: MatPivotCheck(B,A,info,&sctx,row);
2016: if (sctx.newshift) goto endofwhile;
2017: break;
2019: case 2:
2020: for (j=0; j<nz; j++) {
2021: idx = bjtmp[j];
2022: rtmp11[idx] = 0.0;
2023: rtmp22[idx] = 0.0;
2024: }
2026: /* load in initial (unfactored row) */
2027: idx = r[row];
2028: nz_tmp = ai[idx+1] - ai[idx];
2029: ajtmp = aj + ai[idx];
2030: v1 = aa + ai[idx];
2031: v2 = aa + ai[idx+1];
2032: for (j=0; j<nz_tmp; j++) {
2033: idx = ics[ajtmp[j]];
2034: rtmp11[idx] = v1[j];
2035: rtmp22[idx] = v2[j];
2036: }
2037: rtmp11[ics[r[row]]] += sctx.shift_amount;
2038: rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2040: prow = *bjtmp++;
2041: while (prow < row) {
2042: pc1 = rtmp11 + prow;
2043: pc2 = rtmp22 + prow;
2044: if (*pc1 != 0.0 || *pc2 != 0.0) {
2045: pv = ba + bd[prow];
2046: pj = nbj + bd[prow];
2047: mul1 = *pc1 * *pv;
2048: mul2 = *pc2 * *pv;
2049: ++pv;
2050: *pc1 = mul1;
2051: *pc2 = mul2;
2053: nz_tmp = bi[prow+1] - bd[prow] - 1;
2054: for (j=0; j<nz_tmp; j++) {
2055: tmp = pv[j];
2056: idx = pj[j];
2057: rtmp11[idx] -= mul1 * tmp;
2058: rtmp22[idx] -= mul2 * tmp;
2059: }
2060: PetscLogFlops(2+4.0*nz_tmp);
2061: }
2062: prow = *bjtmp++;
2063: }
2065: /* Now take care of diagonal 2x2 block. Note: prow = row here */
2066: pc1 = rtmp11 + prow;
2067: pc2 = rtmp22 + prow;
2069: sctx.pv = *pc1;
2070: pj = bj + bi[prow];
2071: rs = 0.0;
2072: for (j=0; j<nz; j++) {
2073: idx = pj[j];
2074: if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2075: }
2076: sctx.rs = rs;
2077: MatPivotCheck(B,A,info,&sctx,row);
2078: if (sctx.newshift) goto endofwhile;
2080: if (*pc2 != 0.0) {
2081: pj = nbj + bd[prow];
2082: mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2083: *pc2 = mul2;
2084: nz_tmp = bi[prow+1] - bd[prow] - 1;
2085: for (j=0; j<nz_tmp; j++) {
2086: idx = pj[j];
2087: tmp = rtmp11[idx];
2088: rtmp22[idx] -= mul2 * tmp;
2089: }
2090: PetscLogFlops(1+2.0*nz_tmp);
2091: }
2093: pj = bj + bi[row];
2094: pc1 = ba + bi[row];
2095: pc2 = ba + bi[row+1];
2097: sctx.pv = rtmp22[row+1];
2098: rs = 0.0;
2099: rtmp11[row] = 1.0/rtmp11[row];
2100: rtmp22[row+1] = 1.0/rtmp22[row+1];
2101: /* copy row entries from dense representation to sparse */
2102: for (j=0; j<nz; j++) {
2103: idx = pj[j];
2104: pc1[j] = rtmp11[idx];
2105: pc2[j] = rtmp22[idx];
2106: if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2107: }
2108: sctx.rs = rs;
2109: MatPivotCheck(B,A,info,&sctx,row+1);
2110: if (sctx.newshift) goto endofwhile;
2111: break;
2113: case 3:
2114: for (j=0; j<nz; j++) {
2115: idx = bjtmp[j];
2116: rtmp11[idx] = 0.0;
2117: rtmp22[idx] = 0.0;
2118: rtmp33[idx] = 0.0;
2119: }
2120: /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2121: idx = r[row];
2122: nz_tmp = ai[idx+1] - ai[idx];
2123: ajtmp = aj + ai[idx];
2124: v1 = aa + ai[idx];
2125: v2 = aa + ai[idx+1];
2126: v3 = aa + ai[idx+2];
2127: for (j=0; j<nz_tmp; j++) {
2128: idx = ics[ajtmp[j]];
2129: rtmp11[idx] = v1[j];
2130: rtmp22[idx] = v2[j];
2131: rtmp33[idx] = v3[j];
2132: }
2133: rtmp11[ics[r[row]]] += sctx.shift_amount;
2134: rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2135: rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2137: /* loop over all pivot row blocks above this row block */
2138: prow = *bjtmp++;
2139: while (prow < row) {
2140: pc1 = rtmp11 + prow;
2141: pc2 = rtmp22 + prow;
2142: pc3 = rtmp33 + prow;
2143: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2144: pv = ba + bd[prow];
2145: pj = nbj + bd[prow];
2146: mul1 = *pc1 * *pv;
2147: mul2 = *pc2 * *pv;
2148: mul3 = *pc3 * *pv;
2149: ++pv;
2150: *pc1 = mul1;
2151: *pc2 = mul2;
2152: *pc3 = mul3;
2154: nz_tmp = bi[prow+1] - bd[prow] - 1;
2155: /* update this row based on pivot row */
2156: for (j=0; j<nz_tmp; j++) {
2157: tmp = pv[j];
2158: idx = pj[j];
2159: rtmp11[idx] -= mul1 * tmp;
2160: rtmp22[idx] -= mul2 * tmp;
2161: rtmp33[idx] -= mul3 * tmp;
2162: }
2163: PetscLogFlops(3+6.0*nz_tmp);
2164: }
2165: prow = *bjtmp++;
2166: }
2168: /* Now take care of diagonal 3x3 block in this set of rows */
2169: /* note: prow = row here */
2170: pc1 = rtmp11 + prow;
2171: pc2 = rtmp22 + prow;
2172: pc3 = rtmp33 + prow;
2174: sctx.pv = *pc1;
2175: pj = bj + bi[prow];
2176: rs = 0.0;
2177: for (j=0; j<nz; j++) {
2178: idx = pj[j];
2179: if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2180: }
2181: sctx.rs = rs;
2182: MatPivotCheck(B,A,info,&sctx,row);
2183: if (sctx.newshift) goto endofwhile;
2185: if (*pc2 != 0.0 || *pc3 != 0.0) {
2186: mul2 = (*pc2)/(*pc1);
2187: mul3 = (*pc3)/(*pc1);
2188: *pc2 = mul2;
2189: *pc3 = mul3;
2190: nz_tmp = bi[prow+1] - bd[prow] - 1;
2191: pj = nbj + bd[prow];
2192: for (j=0; j<nz_tmp; j++) {
2193: idx = pj[j];
2194: tmp = rtmp11[idx];
2195: rtmp22[idx] -= mul2 * tmp;
2196: rtmp33[idx] -= mul3 * tmp;
2197: }
2198: PetscLogFlops(2+4.0*nz_tmp);
2199: }
2200: ++prow;
2202: pc2 = rtmp22 + prow;
2203: pc3 = rtmp33 + prow;
2204: sctx.pv = *pc2;
2205: pj = bj + bi[prow];
2206: rs = 0.0;
2207: for (j=0; j<nz; j++) {
2208: idx = pj[j];
2209: if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2210: }
2211: sctx.rs = rs;
2212: MatPivotCheck(B,A,info,&sctx,row+1);
2213: if (sctx.newshift) goto endofwhile;
2215: if (*pc3 != 0.0) {
2216: mul3 = (*pc3)/(*pc2);
2217: *pc3 = mul3;
2218: pj = nbj + bd[prow];
2219: nz_tmp = bi[prow+1] - bd[prow] - 1;
2220: for (j=0; j<nz_tmp; j++) {
2221: idx = pj[j];
2222: tmp = rtmp22[idx];
2223: rtmp33[idx] -= mul3 * tmp;
2224: }
2225: PetscLogFlops(1+2.0*nz_tmp);
2226: }
2228: pj = bj + bi[row];
2229: pc1 = ba + bi[row];
2230: pc2 = ba + bi[row+1];
2231: pc3 = ba + bi[row+2];
2233: sctx.pv = rtmp33[row+2];
2234: rs = 0.0;
2235: rtmp11[row] = 1.0/rtmp11[row];
2236: rtmp22[row+1] = 1.0/rtmp22[row+1];
2237: rtmp33[row+2] = 1.0/rtmp33[row+2];
2238: /* copy row entries from dense representation to sparse */
2239: for (j=0; j<nz; j++) {
2240: idx = pj[j];
2241: pc1[j] = rtmp11[idx];
2242: pc2[j] = rtmp22[idx];
2243: pc3[j] = rtmp33[idx];
2244: if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2245: }
2247: sctx.rs = rs;
2248: MatPivotCheck(B,A,info,&sctx,row+2);
2249: if (sctx.newshift) goto endofwhile;
2250: break;
2252: default:
2253: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2254: }
2255: row += nodesz; /* Update the row */
2256: }
2257: endofwhile:;
2258: } while (sctx.newshift);
2259: PetscFree3(rtmp11,rtmp22,rtmp33);
2260: PetscFree(tmp_vec2);
2261: ISRestoreIndices(isicol,&ic);
2262: ISRestoreIndices(isrow,&r);
2263: ISRestoreIndices(iscol,&c);
2265: (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2266: /* do not set solve add, since MatSolve_Inode + Add is faster */
2267: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace;
2268: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2269: C->assembled = PETSC_TRUE;
2270: C->preallocated = PETSC_TRUE;
2271: if (sctx.nshift) {
2272: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2273: PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
2274: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2275: PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
2276: }
2277: }
2278: PetscLogFlops(C->cmap->n);
2279: MatSeqAIJCheckInode(C);
2280: return(0);
2281: }
2284: /* ----------------------------------------------------------- */
2285: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2286: {
2287: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2288: IS iscol = a->col,isrow = a->row;
2289: PetscErrorCode ierr;
2290: const PetscInt *r,*c,*rout,*cout;
2291: PetscInt i,j,n = A->rmap->n;
2292: PetscInt node_max,row,nsz,aii,i0,i1,nz;
2293: const PetscInt *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2294: PetscScalar *x,*tmp,*tmps,tmp0,tmp1;
2295: PetscScalar sum1,sum2,sum3,sum4,sum5;
2296: const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2297: const PetscScalar *b;
2300: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2301: node_max = a->inode.node_count;
2302: ns = a->inode.size; /* Node Size array */
2304: VecGetArrayRead(bb,&b);
2305: VecGetArrayWrite(xx,&x);
2306: tmp = a->solve_work;
2308: ISGetIndices(isrow,&rout); r = rout;
2309: ISGetIndices(iscol,&cout); c = cout;
2311: /* forward solve the lower triangular */
2312: tmps = tmp;
2313: aa = a_a;
2314: aj = a_j;
2315: ad = a->diag;
2317: for (i = 0,row = 0; i< node_max; ++i) {
2318: nsz = ns[i];
2319: aii = ai[row];
2320: v1 = aa + aii;
2321: vi = aj + aii;
2322: nz = ai[row+1]- ai[row];
2324: if (i < node_max-1) {
2325: /* Prefetch the indices for the next block */
2326: PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2327: /* Prefetch the data for the next block */
2328: PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2329: }
2331: switch (nsz) { /* Each loop in 'case' is unrolled */
2332: case 1:
2333: sum1 = b[r[row]];
2334: for (j=0; j<nz-1; j+=2) {
2335: i0 = vi[j];
2336: i1 = vi[j+1];
2337: tmp0 = tmps[i0];
2338: tmp1 = tmps[i1];
2339: sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2340: }
2341: if (j == nz-1) {
2342: tmp0 = tmps[vi[j]];
2343: sum1 -= v1[j]*tmp0;
2344: }
2345: tmp[row++]=sum1;
2346: break;
2347: case 2:
2348: sum1 = b[r[row]];
2349: sum2 = b[r[row+1]];
2350: v2 = aa + ai[row+1];
2352: for (j=0; j<nz-1; j+=2) {
2353: i0 = vi[j];
2354: i1 = vi[j+1];
2355: tmp0 = tmps[i0];
2356: tmp1 = tmps[i1];
2357: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2358: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2359: }
2360: if (j == nz-1) {
2361: tmp0 = tmps[vi[j]];
2362: sum1 -= v1[j] *tmp0;
2363: sum2 -= v2[j] *tmp0;
2364: }
2365: sum2 -= v2[nz] * sum1;
2366: tmp[row++]=sum1;
2367: tmp[row++]=sum2;
2368: break;
2369: case 3:
2370: sum1 = b[r[row]];
2371: sum2 = b[r[row+1]];
2372: sum3 = b[r[row+2]];
2373: v2 = aa + ai[row+1];
2374: v3 = aa + ai[row+2];
2376: for (j=0; j<nz-1; j+=2) {
2377: i0 = vi[j];
2378: i1 = vi[j+1];
2379: tmp0 = tmps[i0];
2380: tmp1 = tmps[i1];
2381: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2382: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2383: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2384: }
2385: if (j == nz-1) {
2386: tmp0 = tmps[vi[j]];
2387: sum1 -= v1[j] *tmp0;
2388: sum2 -= v2[j] *tmp0;
2389: sum3 -= v3[j] *tmp0;
2390: }
2391: sum2 -= v2[nz] * sum1;
2392: sum3 -= v3[nz] * sum1;
2393: sum3 -= v3[nz+1] * sum2;
2394: tmp[row++]=sum1;
2395: tmp[row++]=sum2;
2396: tmp[row++]=sum3;
2397: break;
2399: case 4:
2400: sum1 = b[r[row]];
2401: sum2 = b[r[row+1]];
2402: sum3 = b[r[row+2]];
2403: sum4 = b[r[row+3]];
2404: v2 = aa + ai[row+1];
2405: v3 = aa + ai[row+2];
2406: v4 = aa + ai[row+3];
2408: for (j=0; j<nz-1; j+=2) {
2409: i0 = vi[j];
2410: i1 = vi[j+1];
2411: tmp0 = tmps[i0];
2412: tmp1 = tmps[i1];
2413: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2414: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2415: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2416: sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2417: }
2418: if (j == nz-1) {
2419: tmp0 = tmps[vi[j]];
2420: sum1 -= v1[j] *tmp0;
2421: sum2 -= v2[j] *tmp0;
2422: sum3 -= v3[j] *tmp0;
2423: sum4 -= v4[j] *tmp0;
2424: }
2425: sum2 -= v2[nz] * sum1;
2426: sum3 -= v3[nz] * sum1;
2427: sum4 -= v4[nz] * sum1;
2428: sum3 -= v3[nz+1] * sum2;
2429: sum4 -= v4[nz+1] * sum2;
2430: sum4 -= v4[nz+2] * sum3;
2432: tmp[row++]=sum1;
2433: tmp[row++]=sum2;
2434: tmp[row++]=sum3;
2435: tmp[row++]=sum4;
2436: break;
2437: case 5:
2438: sum1 = b[r[row]];
2439: sum2 = b[r[row+1]];
2440: sum3 = b[r[row+2]];
2441: sum4 = b[r[row+3]];
2442: sum5 = b[r[row+4]];
2443: v2 = aa + ai[row+1];
2444: v3 = aa + ai[row+2];
2445: v4 = aa + ai[row+3];
2446: v5 = aa + ai[row+4];
2448: for (j=0; j<nz-1; j+=2) {
2449: i0 = vi[j];
2450: i1 = vi[j+1];
2451: tmp0 = tmps[i0];
2452: tmp1 = tmps[i1];
2453: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2454: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2455: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2456: sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2457: sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2458: }
2459: if (j == nz-1) {
2460: tmp0 = tmps[vi[j]];
2461: sum1 -= v1[j] *tmp0;
2462: sum2 -= v2[j] *tmp0;
2463: sum3 -= v3[j] *tmp0;
2464: sum4 -= v4[j] *tmp0;
2465: sum5 -= v5[j] *tmp0;
2466: }
2468: sum2 -= v2[nz] * sum1;
2469: sum3 -= v3[nz] * sum1;
2470: sum4 -= v4[nz] * sum1;
2471: sum5 -= v5[nz] * sum1;
2472: sum3 -= v3[nz+1] * sum2;
2473: sum4 -= v4[nz+1] * sum2;
2474: sum5 -= v5[nz+1] * sum2;
2475: sum4 -= v4[nz+2] * sum3;
2476: sum5 -= v5[nz+2] * sum3;
2477: sum5 -= v5[nz+3] * sum4;
2479: tmp[row++]=sum1;
2480: tmp[row++]=sum2;
2481: tmp[row++]=sum3;
2482: tmp[row++]=sum4;
2483: tmp[row++]=sum5;
2484: break;
2485: default:
2486: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2487: }
2488: }
2489: /* backward solve the upper triangular */
2490: for (i=node_max -1,row = n-1; i>=0; i--) {
2491: nsz = ns[i];
2492: aii = ad[row+1] + 1;
2493: v1 = aa + aii;
2494: vi = aj + aii;
2495: nz = ad[row]- ad[row+1] - 1;
2497: if (i > 0) {
2498: /* Prefetch the indices for the next block */
2499: PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2500: /* Prefetch the data for the next block */
2501: PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2502: }
2504: switch (nsz) { /* Each loop in 'case' is unrolled */
2505: case 1:
2506: sum1 = tmp[row];
2508: for (j=0; j<nz-1; j+=2) {
2509: i0 = vi[j];
2510: i1 = vi[j+1];
2511: tmp0 = tmps[i0];
2512: tmp1 = tmps[i1];
2513: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2514: }
2515: if (j == nz-1) {
2516: tmp0 = tmps[vi[j]];
2517: sum1 -= v1[j]*tmp0;
2518: }
2519: x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2520: break;
2521: case 2:
2522: sum1 = tmp[row];
2523: sum2 = tmp[row-1];
2524: v2 = aa + ad[row] + 1;
2525: for (j=0; j<nz-1; j+=2) {
2526: i0 = vi[j];
2527: i1 = vi[j+1];
2528: tmp0 = tmps[i0];
2529: tmp1 = tmps[i1];
2530: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2531: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2532: }
2533: if (j == nz-1) {
2534: tmp0 = tmps[vi[j]];
2535: sum1 -= v1[j]* tmp0;
2536: sum2 -= v2[j+1]* tmp0;
2537: }
2539: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2540: sum2 -= v2[0] * tmp0;
2541: x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2542: break;
2543: case 3:
2544: sum1 = tmp[row];
2545: sum2 = tmp[row -1];
2546: sum3 = tmp[row -2];
2547: v2 = aa + ad[row] + 1;
2548: v3 = aa + ad[row -1] + 1;
2549: for (j=0; j<nz-1; j+=2) {
2550: i0 = vi[j];
2551: i1 = vi[j+1];
2552: tmp0 = tmps[i0];
2553: tmp1 = tmps[i1];
2554: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2555: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2556: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2557: }
2558: if (j== nz-1) {
2559: tmp0 = tmps[vi[j]];
2560: sum1 -= v1[j] * tmp0;
2561: sum2 -= v2[j+1] * tmp0;
2562: sum3 -= v3[j+2] * tmp0;
2563: }
2564: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2565: sum2 -= v2[0]* tmp0;
2566: sum3 -= v3[1] * tmp0;
2567: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2568: sum3 -= v3[0]* tmp0;
2569: x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2571: break;
2572: case 4:
2573: sum1 = tmp[row];
2574: sum2 = tmp[row -1];
2575: sum3 = tmp[row -2];
2576: sum4 = tmp[row -3];
2577: v2 = aa + ad[row]+1;
2578: v3 = aa + ad[row -1]+1;
2579: v4 = aa + ad[row -2]+1;
2581: for (j=0; j<nz-1; j+=2) {
2582: i0 = vi[j];
2583: i1 = vi[j+1];
2584: tmp0 = tmps[i0];
2585: tmp1 = tmps[i1];
2586: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2587: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2588: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2589: sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2590: }
2591: if (j== nz-1) {
2592: tmp0 = tmps[vi[j]];
2593: sum1 -= v1[j] * tmp0;
2594: sum2 -= v2[j+1] * tmp0;
2595: sum3 -= v3[j+2] * tmp0;
2596: sum4 -= v4[j+3] * tmp0;
2597: }
2599: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2600: sum2 -= v2[0] * tmp0;
2601: sum3 -= v3[1] * tmp0;
2602: sum4 -= v4[2] * tmp0;
2603: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2604: sum3 -= v3[0] * tmp0;
2605: sum4 -= v4[1] * tmp0;
2606: tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2607: sum4 -= v4[0] * tmp0;
2608: x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2609: break;
2610: case 5:
2611: sum1 = tmp[row];
2612: sum2 = tmp[row -1];
2613: sum3 = tmp[row -2];
2614: sum4 = tmp[row -3];
2615: sum5 = tmp[row -4];
2616: v2 = aa + ad[row]+1;
2617: v3 = aa + ad[row -1]+1;
2618: v4 = aa + ad[row -2]+1;
2619: v5 = aa + ad[row -3]+1;
2620: for (j=0; j<nz-1; j+=2) {
2621: i0 = vi[j];
2622: i1 = vi[j+1];
2623: tmp0 = tmps[i0];
2624: tmp1 = tmps[i1];
2625: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2626: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2627: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2628: sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2629: sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2630: }
2631: if (j==nz-1) {
2632: tmp0 = tmps[vi[j]];
2633: sum1 -= v1[j] * tmp0;
2634: sum2 -= v2[j+1] * tmp0;
2635: sum3 -= v3[j+2] * tmp0;
2636: sum4 -= v4[j+3] * tmp0;
2637: sum5 -= v5[j+4] * tmp0;
2638: }
2640: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2641: sum2 -= v2[0] * tmp0;
2642: sum3 -= v3[1] * tmp0;
2643: sum4 -= v4[2] * tmp0;
2644: sum5 -= v5[3] * tmp0;
2645: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2646: sum3 -= v3[0] * tmp0;
2647: sum4 -= v4[1] * tmp0;
2648: sum5 -= v5[2] * tmp0;
2649: tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2650: sum4 -= v4[0] * tmp0;
2651: sum5 -= v5[1] * tmp0;
2652: tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2653: sum5 -= v5[0] * tmp0;
2654: x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2655: break;
2656: default:
2657: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2658: }
2659: }
2660: ISRestoreIndices(isrow,&rout);
2661: ISRestoreIndices(iscol,&cout);
2662: VecRestoreArrayRead(bb,&b);
2663: VecRestoreArrayWrite(xx,&x);
2664: PetscLogFlops(2.0*a->nz - A->cmap->n);
2665: return(0);
2666: }
2669: /*
2670: Makes a longer coloring[] array and calls the usual code with that
2671: */
2672: PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2673: {
2674: Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data;
2675: PetscErrorCode ierr;
2676: PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2677: PetscInt *colorused,i;
2678: ISColoringValue *newcolor;
2681: PetscMalloc1(n+1,&newcolor);
2682: /* loop over inodes, marking a color for each column*/
2683: row = 0;
2684: for (i=0; i<m; i++) {
2685: for (j=0; j<ns[i]; j++) {
2686: newcolor[row++] = coloring[i] + j*ncolors;
2687: }
2688: }
2690: /* eliminate unneeded colors */
2691: PetscCalloc1(5*ncolors,&colorused);
2692: for (i=0; i<n; i++) {
2693: colorused[newcolor[i]] = 1;
2694: }
2696: for (i=1; i<5*ncolors; i++) {
2697: colorused[i] += colorused[i-1];
2698: }
2699: ncolors = colorused[5*ncolors-1];
2700: for (i=0; i<n; i++) {
2701: newcolor[i] = colorused[newcolor[i]]-1;
2702: }
2703: PetscFree(colorused);
2704: ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);
2705: PetscFree(coloring);
2706: return(0);
2707: }
2709: #include <petsc/private/kernels/blockinvert.h>
2711: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2712: {
2713: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2714: PetscScalar sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2715: MatScalar *ibdiag,*bdiag,work[25],*t;
2716: PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2717: const MatScalar *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2718: const PetscScalar *xb, *b;
2719: PetscReal zeropivot = 100.*PETSC_MACHINE_EPSILON, shift = 0.0;
2720: PetscErrorCode ierr;
2721: PetscInt n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2722: PetscInt sz,k,ipvt[5];
2723: PetscBool allowzeropivot,zeropivotdetected;
2724: const PetscInt *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;
2727: allowzeropivot = PetscNot(A->erroriffailure);
2728: if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2729: if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2731: if (!a->inode.ibdiagvalid) {
2732: if (!a->inode.ibdiag) {
2733: /* calculate space needed for diagonal blocks */
2734: for (i=0; i<m; i++) {
2735: cnt += sizes[i]*sizes[i];
2736: }
2737: a->inode.bdiagsize = cnt;
2739: PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);
2740: }
2742: /* copy over the diagonal blocks and invert them */
2743: ibdiag = a->inode.ibdiag;
2744: bdiag = a->inode.bdiag;
2745: cnt = 0;
2746: for (i=0, row = 0; i<m; i++) {
2747: for (j=0; j<sizes[i]; j++) {
2748: for (k=0; k<sizes[i]; k++) {
2749: bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2750: }
2751: }
2752: PetscArraycpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]);
2754: switch (sizes[i]) {
2755: case 1:
2756: /* Create matrix data structure */
2757: if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2758: if (allowzeropivot) {
2759: A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2760: A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2761: A->factorerror_zeropivot_row = row;
2762: PetscInfo1(A,"Zero pivot, row %D\n",row);
2763: } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2764: }
2765: ibdiag[cnt] = 1.0/ibdiag[cnt];
2766: break;
2767: case 2:
2768: PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2769: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2770: break;
2771: case 3:
2772: PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2773: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2774: break;
2775: case 4:
2776: PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2777: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2778: break;
2779: case 5:
2780: PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);
2781: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2782: break;
2783: default:
2784: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2785: }
2786: cnt += sizes[i]*sizes[i];
2787: row += sizes[i];
2788: }
2789: a->inode.ibdiagvalid = PETSC_TRUE;
2790: }
2791: ibdiag = a->inode.ibdiag;
2792: bdiag = a->inode.bdiag;
2793: t = a->inode.ssor_work;
2795: VecGetArray(xx,&x);
2796: VecGetArrayRead(bb,&b);
2797: /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2798: if (flag & SOR_ZERO_INITIAL_GUESS) {
2799: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2801: for (i=0, row=0; i<m; i++) {
2802: sz = diag[row] - ii[row];
2803: v1 = a->a + ii[row];
2804: idx = a->j + ii[row];
2806: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2807: switch (sizes[i]) {
2808: case 1:
2810: sum1 = b[row];
2811: for (n = 0; n<sz-1; n+=2) {
2812: i1 = idx[0];
2813: i2 = idx[1];
2814: idx += 2;
2815: tmp0 = x[i1];
2816: tmp1 = x[i2];
2817: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2818: }
2820: if (n == sz-1) {
2821: tmp0 = x[*idx];
2822: sum1 -= *v1 * tmp0;
2823: }
2824: t[row] = sum1;
2825: x[row++] = sum1*(*ibdiag++);
2826: break;
2827: case 2:
2828: v2 = a->a + ii[row+1];
2829: sum1 = b[row];
2830: sum2 = b[row+1];
2831: for (n = 0; n<sz-1; n+=2) {
2832: i1 = idx[0];
2833: i2 = idx[1];
2834: idx += 2;
2835: tmp0 = x[i1];
2836: tmp1 = x[i2];
2837: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2838: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2839: }
2841: if (n == sz-1) {
2842: tmp0 = x[*idx];
2843: sum1 -= v1[0] * tmp0;
2844: sum2 -= v2[0] * tmp0;
2845: }
2846: t[row] = sum1;
2847: t[row+1] = sum2;
2848: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2849: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2850: ibdiag += 4;
2851: break;
2852: case 3:
2853: v2 = a->a + ii[row+1];
2854: v3 = a->a + ii[row+2];
2855: sum1 = b[row];
2856: sum2 = b[row+1];
2857: sum3 = b[row+2];
2858: for (n = 0; n<sz-1; n+=2) {
2859: i1 = idx[0];
2860: i2 = idx[1];
2861: idx += 2;
2862: tmp0 = x[i1];
2863: tmp1 = x[i2];
2864: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2865: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2866: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2867: }
2869: if (n == sz-1) {
2870: tmp0 = x[*idx];
2871: sum1 -= v1[0] * tmp0;
2872: sum2 -= v2[0] * tmp0;
2873: sum3 -= v3[0] * tmp0;
2874: }
2875: t[row] = sum1;
2876: t[row+1] = sum2;
2877: t[row+2] = sum3;
2878: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2879: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2880: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2881: ibdiag += 9;
2882: break;
2883: case 4:
2884: v2 = a->a + ii[row+1];
2885: v3 = a->a + ii[row+2];
2886: v4 = a->a + ii[row+3];
2887: sum1 = b[row];
2888: sum2 = b[row+1];
2889: sum3 = b[row+2];
2890: sum4 = b[row+3];
2891: for (n = 0; n<sz-1; n+=2) {
2892: i1 = idx[0];
2893: i2 = idx[1];
2894: idx += 2;
2895: tmp0 = x[i1];
2896: tmp1 = x[i2];
2897: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2898: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2899: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2900: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2901: }
2903: if (n == sz-1) {
2904: tmp0 = x[*idx];
2905: sum1 -= v1[0] * tmp0;
2906: sum2 -= v2[0] * tmp0;
2907: sum3 -= v3[0] * tmp0;
2908: sum4 -= v4[0] * tmp0;
2909: }
2910: t[row] = sum1;
2911: t[row+1] = sum2;
2912: t[row+2] = sum3;
2913: t[row+3] = sum4;
2914: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2915: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2916: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2917: x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2918: ibdiag += 16;
2919: break;
2920: case 5:
2921: v2 = a->a + ii[row+1];
2922: v3 = a->a + ii[row+2];
2923: v4 = a->a + ii[row+3];
2924: v5 = a->a + ii[row+4];
2925: sum1 = b[row];
2926: sum2 = b[row+1];
2927: sum3 = b[row+2];
2928: sum4 = b[row+3];
2929: sum5 = b[row+4];
2930: for (n = 0; n<sz-1; n+=2) {
2931: i1 = idx[0];
2932: i2 = idx[1];
2933: idx += 2;
2934: tmp0 = x[i1];
2935: tmp1 = x[i2];
2936: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2937: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2938: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2939: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2940: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2941: }
2943: if (n == sz-1) {
2944: tmp0 = x[*idx];
2945: sum1 -= v1[0] * tmp0;
2946: sum2 -= v2[0] * tmp0;
2947: sum3 -= v3[0] * tmp0;
2948: sum4 -= v4[0] * tmp0;
2949: sum5 -= v5[0] * tmp0;
2950: }
2951: t[row] = sum1;
2952: t[row+1] = sum2;
2953: t[row+2] = sum3;
2954: t[row+3] = sum4;
2955: t[row+4] = sum5;
2956: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2957: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2958: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2959: x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2960: x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2961: ibdiag += 25;
2962: break;
2963: default:
2964: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2965: }
2966: }
2968: xb = t;
2969: PetscLogFlops(a->nz);
2970: } else xb = b;
2971: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
2973: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2974: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2975: ibdiag -= sizes[i]*sizes[i];
2976: sz = ii[row+1] - diag[row] - 1;
2977: v1 = a->a + diag[row] + 1;
2978: idx = a->j + diag[row] + 1;
2980: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2981: switch (sizes[i]) {
2982: case 1:
2984: sum1 = xb[row];
2985: for (n = 0; n<sz-1; n+=2) {
2986: i1 = idx[0];
2987: i2 = idx[1];
2988: idx += 2;
2989: tmp0 = x[i1];
2990: tmp1 = x[i2];
2991: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2992: }
2994: if (n == sz-1) {
2995: tmp0 = x[*idx];
2996: sum1 -= *v1*tmp0;
2997: }
2998: x[row--] = sum1*(*ibdiag);
2999: break;
3001: case 2:
3003: sum1 = xb[row];
3004: sum2 = xb[row-1];
3005: /* note that sum1 is associated with the second of the two rows */
3006: v2 = a->a + diag[row-1] + 2;
3007: for (n = 0; n<sz-1; n+=2) {
3008: i1 = idx[0];
3009: i2 = idx[1];
3010: idx += 2;
3011: tmp0 = x[i1];
3012: tmp1 = x[i2];
3013: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3014: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3015: }
3017: if (n == sz-1) {
3018: tmp0 = x[*idx];
3019: sum1 -= *v1*tmp0;
3020: sum2 -= *v2*tmp0;
3021: }
3022: x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3023: x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3024: break;
3025: case 3:
3027: sum1 = xb[row];
3028: sum2 = xb[row-1];
3029: sum3 = xb[row-2];
3030: v2 = a->a + diag[row-1] + 2;
3031: v3 = a->a + diag[row-2] + 3;
3032: for (n = 0; n<sz-1; n+=2) {
3033: i1 = idx[0];
3034: i2 = idx[1];
3035: idx += 2;
3036: tmp0 = x[i1];
3037: tmp1 = x[i2];
3038: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3039: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3040: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3041: }
3043: if (n == sz-1) {
3044: tmp0 = x[*idx];
3045: sum1 -= *v1*tmp0;
3046: sum2 -= *v2*tmp0;
3047: sum3 -= *v3*tmp0;
3048: }
3049: x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3050: x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3051: x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3052: break;
3053: case 4:
3055: sum1 = xb[row];
3056: sum2 = xb[row-1];
3057: sum3 = xb[row-2];
3058: sum4 = xb[row-3];
3059: v2 = a->a + diag[row-1] + 2;
3060: v3 = a->a + diag[row-2] + 3;
3061: v4 = a->a + diag[row-3] + 4;
3062: for (n = 0; n<sz-1; n+=2) {
3063: i1 = idx[0];
3064: i2 = idx[1];
3065: idx += 2;
3066: tmp0 = x[i1];
3067: tmp1 = x[i2];
3068: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3069: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3070: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3071: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3072: }
3074: if (n == sz-1) {
3075: tmp0 = x[*idx];
3076: sum1 -= *v1*tmp0;
3077: sum2 -= *v2*tmp0;
3078: sum3 -= *v3*tmp0;
3079: sum4 -= *v4*tmp0;
3080: }
3081: x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3082: x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3083: x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3084: x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3085: break;
3086: case 5:
3088: sum1 = xb[row];
3089: sum2 = xb[row-1];
3090: sum3 = xb[row-2];
3091: sum4 = xb[row-3];
3092: sum5 = xb[row-4];
3093: v2 = a->a + diag[row-1] + 2;
3094: v3 = a->a + diag[row-2] + 3;
3095: v4 = a->a + diag[row-3] + 4;
3096: v5 = a->a + diag[row-4] + 5;
3097: for (n = 0; n<sz-1; n+=2) {
3098: i1 = idx[0];
3099: i2 = idx[1];
3100: idx += 2;
3101: tmp0 = x[i1];
3102: tmp1 = x[i2];
3103: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3104: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3105: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3106: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3107: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3108: }
3110: if (n == sz-1) {
3111: tmp0 = x[*idx];
3112: sum1 -= *v1*tmp0;
3113: sum2 -= *v2*tmp0;
3114: sum3 -= *v3*tmp0;
3115: sum4 -= *v4*tmp0;
3116: sum5 -= *v5*tmp0;
3117: }
3118: x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3119: x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3120: x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3121: x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3122: x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3123: break;
3124: default:
3125: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3126: }
3127: }
3129: PetscLogFlops(a->nz);
3130: }
3131: its--;
3132: }
3133: while (its--) {
3135: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3136: for (i=0, row=0, ibdiag = a->inode.ibdiag;
3137: i<m;
3138: row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {
3140: sz = diag[row] - ii[row];
3141: v1 = a->a + ii[row];
3142: idx = a->j + ii[row];
3143: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3144: switch (sizes[i]) {
3145: case 1:
3146: sum1 = b[row];
3147: for (n = 0; n<sz-1; n+=2) {
3148: i1 = idx[0];
3149: i2 = idx[1];
3150: idx += 2;
3151: tmp0 = x[i1];
3152: tmp1 = x[i2];
3153: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3154: }
3155: if (n == sz-1) {
3156: tmp0 = x[*idx++];
3157: sum1 -= *v1 * tmp0;
3158: v1++;
3159: }
3160: t[row] = sum1;
3161: sz = ii[row+1] - diag[row] - 1;
3162: idx = a->j + diag[row] + 1;
3163: v1 += 1;
3164: for (n = 0; n<sz-1; n+=2) {
3165: i1 = idx[0];
3166: i2 = idx[1];
3167: idx += 2;
3168: tmp0 = x[i1];
3169: tmp1 = x[i2];
3170: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3171: }
3172: if (n == sz-1) {
3173: tmp0 = x[*idx++];
3174: sum1 -= *v1 * tmp0;
3175: }
3176: /* in MatSOR_SeqAIJ this line would be
3177: *
3178: * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3179: *
3180: * but omega == 1, so this becomes
3181: *
3182: * x[row] = sum1*(*ibdiag++);
3183: *
3184: */
3185: x[row] = sum1*(*ibdiag);
3186: break;
3187: case 2:
3188: v2 = a->a + ii[row+1];
3189: sum1 = b[row];
3190: sum2 = b[row+1];
3191: for (n = 0; n<sz-1; n+=2) {
3192: i1 = idx[0];
3193: i2 = idx[1];
3194: idx += 2;
3195: tmp0 = x[i1];
3196: tmp1 = x[i2];
3197: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3198: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3199: }
3200: if (n == sz-1) {
3201: tmp0 = x[*idx++];
3202: sum1 -= v1[0] * tmp0;
3203: sum2 -= v2[0] * tmp0;
3204: v1++; v2++;
3205: }
3206: t[row] = sum1;
3207: t[row+1] = sum2;
3208: sz = ii[row+1] - diag[row] - 2;
3209: idx = a->j + diag[row] + 2;
3210: v1 += 2;
3211: v2 += 2;
3212: for (n = 0; n<sz-1; n+=2) {
3213: i1 = idx[0];
3214: i2 = idx[1];
3215: idx += 2;
3216: tmp0 = x[i1];
3217: tmp1 = x[i2];
3218: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3219: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3220: }
3221: if (n == sz-1) {
3222: tmp0 = x[*idx];
3223: sum1 -= v1[0] * tmp0;
3224: sum2 -= v2[0] * tmp0;
3225: }
3226: x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3227: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3228: break;
3229: case 3:
3230: v2 = a->a + ii[row+1];
3231: v3 = a->a + ii[row+2];
3232: sum1 = b[row];
3233: sum2 = b[row+1];
3234: sum3 = b[row+2];
3235: for (n = 0; n<sz-1; n+=2) {
3236: i1 = idx[0];
3237: i2 = idx[1];
3238: idx += 2;
3239: tmp0 = x[i1];
3240: tmp1 = x[i2];
3241: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3242: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3243: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3244: }
3245: if (n == sz-1) {
3246: tmp0 = x[*idx++];
3247: sum1 -= v1[0] * tmp0;
3248: sum2 -= v2[0] * tmp0;
3249: sum3 -= v3[0] * tmp0;
3250: v1++; v2++; v3++;
3251: }
3252: t[row] = sum1;
3253: t[row+1] = sum2;
3254: t[row+2] = sum3;
3255: sz = ii[row+1] - diag[row] - 3;
3256: idx = a->j + diag[row] + 3;
3257: v1 += 3;
3258: v2 += 3;
3259: v3 += 3;
3260: for (n = 0; n<sz-1; n+=2) {
3261: i1 = idx[0];
3262: i2 = idx[1];
3263: idx += 2;
3264: tmp0 = x[i1];
3265: tmp1 = x[i2];
3266: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3267: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3268: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3269: }
3270: if (n == sz-1) {
3271: tmp0 = x[*idx];
3272: sum1 -= v1[0] * tmp0;
3273: sum2 -= v2[0] * tmp0;
3274: sum3 -= v3[0] * tmp0;
3275: }
3276: x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3277: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3278: x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3279: break;
3280: case 4:
3281: v2 = a->a + ii[row+1];
3282: v3 = a->a + ii[row+2];
3283: v4 = a->a + ii[row+3];
3284: sum1 = b[row];
3285: sum2 = b[row+1];
3286: sum3 = b[row+2];
3287: sum4 = b[row+3];
3288: for (n = 0; n<sz-1; n+=2) {
3289: i1 = idx[0];
3290: i2 = idx[1];
3291: idx += 2;
3292: tmp0 = x[i1];
3293: tmp1 = x[i2];
3294: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3295: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3296: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3297: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3298: }
3299: if (n == sz-1) {
3300: tmp0 = x[*idx++];
3301: sum1 -= v1[0] * tmp0;
3302: sum2 -= v2[0] * tmp0;
3303: sum3 -= v3[0] * tmp0;
3304: sum4 -= v4[0] * tmp0;
3305: v1++; v2++; v3++; v4++;
3306: }
3307: t[row] = sum1;
3308: t[row+1] = sum2;
3309: t[row+2] = sum3;
3310: t[row+3] = sum4;
3311: sz = ii[row+1] - diag[row] - 4;
3312: idx = a->j + diag[row] + 4;
3313: v1 += 4;
3314: v2 += 4;
3315: v3 += 4;
3316: v4 += 4;
3317: for (n = 0; n<sz-1; n+=2) {
3318: i1 = idx[0];
3319: i2 = idx[1];
3320: idx += 2;
3321: tmp0 = x[i1];
3322: tmp1 = x[i2];
3323: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3324: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3325: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3326: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3327: }
3328: if (n == sz-1) {
3329: tmp0 = x[*idx];
3330: sum1 -= v1[0] * tmp0;
3331: sum2 -= v2[0] * tmp0;
3332: sum3 -= v3[0] * tmp0;
3333: sum4 -= v4[0] * tmp0;
3334: }
3335: x[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3336: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3337: x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3338: x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3339: break;
3340: case 5:
3341: v2 = a->a + ii[row+1];
3342: v3 = a->a + ii[row+2];
3343: v4 = a->a + ii[row+3];
3344: v5 = a->a + ii[row+4];
3345: sum1 = b[row];
3346: sum2 = b[row+1];
3347: sum3 = b[row+2];
3348: sum4 = b[row+3];
3349: sum5 = b[row+4];
3350: for (n = 0; n<sz-1; n+=2) {
3351: i1 = idx[0];
3352: i2 = idx[1];
3353: idx += 2;
3354: tmp0 = x[i1];
3355: tmp1 = x[i2];
3356: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3357: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3358: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3359: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3360: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3361: }
3362: if (n == sz-1) {
3363: tmp0 = x[*idx++];
3364: sum1 -= v1[0] * tmp0;
3365: sum2 -= v2[0] * tmp0;
3366: sum3 -= v3[0] * tmp0;
3367: sum4 -= v4[0] * tmp0;
3368: sum5 -= v5[0] * tmp0;
3369: v1++; v2++; v3++; v4++; v5++;
3370: }
3371: t[row] = sum1;
3372: t[row+1] = sum2;
3373: t[row+2] = sum3;
3374: t[row+3] = sum4;
3375: t[row+4] = sum5;
3376: sz = ii[row+1] - diag[row] - 5;
3377: idx = a->j + diag[row] + 5;
3378: v1 += 5;
3379: v2 += 5;
3380: v3 += 5;
3381: v4 += 5;
3382: v5 += 5;
3383: for (n = 0; n<sz-1; n+=2) {
3384: i1 = idx[0];
3385: i2 = idx[1];
3386: idx += 2;
3387: tmp0 = x[i1];
3388: tmp1 = x[i2];
3389: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3390: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3391: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3392: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3393: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3394: }
3395: if (n == sz-1) {
3396: tmp0 = x[*idx];
3397: sum1 -= v1[0] * tmp0;
3398: sum2 -= v2[0] * tmp0;
3399: sum3 -= v3[0] * tmp0;
3400: sum4 -= v4[0] * tmp0;
3401: sum5 -= v5[0] * tmp0;
3402: }
3403: x[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3404: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3405: x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3406: x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3407: x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3408: break;
3409: default:
3410: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3411: }
3412: }
3413: xb = t;
3414: PetscLogFlops(2.0*a->nz); /* undercounts diag inverse */
3415: } else xb = b;
3417: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3419: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3420: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3421: ibdiag -= sizes[i]*sizes[i];
3423: /* set RHS */
3424: if (xb == b) {
3425: /* whole (old way) */
3426: sz = ii[row+1] - ii[row];
3427: idx = a->j + ii[row];
3428: switch (sizes[i]) {
3429: case 5:
3430: v5 = a->a + ii[row-4];
3431: case 4: /* fall through */
3432: v4 = a->a + ii[row-3];
3433: case 3:
3434: v3 = a->a + ii[row-2];
3435: case 2:
3436: v2 = a->a + ii[row-1];
3437: case 1:
3438: v1 = a->a + ii[row];
3439: break;
3440: default:
3441: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3442: }
3443: } else {
3444: /* upper, no diag */
3445: sz = ii[row+1] - diag[row] - 1;
3446: idx = a->j + diag[row] + 1;
3447: switch (sizes[i]) {
3448: case 5:
3449: v5 = a->a + diag[row-4] + 5;
3450: case 4: /* fall through */
3451: v4 = a->a + diag[row-3] + 4;
3452: case 3:
3453: v3 = a->a + diag[row-2] + 3;
3454: case 2:
3455: v2 = a->a + diag[row-1] + 2;
3456: case 1:
3457: v1 = a->a + diag[row] + 1;
3458: }
3459: }
3460: /* set sum */
3461: switch (sizes[i]) {
3462: case 5:
3463: sum5 = xb[row-4];
3464: case 4: /* fall through */
3465: sum4 = xb[row-3];
3466: case 3:
3467: sum3 = xb[row-2];
3468: case 2:
3469: sum2 = xb[row-1];
3470: case 1:
3471: /* note that sum1 is associated with the last row */
3472: sum1 = xb[row];
3473: }
3474: /* do sums */
3475: for (n = 0; n<sz-1; n+=2) {
3476: i1 = idx[0];
3477: i2 = idx[1];
3478: idx += 2;
3479: tmp0 = x[i1];
3480: tmp1 = x[i2];
3481: switch (sizes[i]) {
3482: case 5:
3483: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3484: case 4: /* fall through */
3485: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3486: case 3:
3487: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3488: case 2:
3489: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3490: case 1:
3491: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3492: }
3493: }
3494: /* ragged edge */
3495: if (n == sz-1) {
3496: tmp0 = x[*idx];
3497: switch (sizes[i]) {
3498: case 5:
3499: sum5 -= *v5*tmp0;
3500: case 4: /* fall through */
3501: sum4 -= *v4*tmp0;
3502: case 3:
3503: sum3 -= *v3*tmp0;
3504: case 2:
3505: sum2 -= *v2*tmp0;
3506: case 1:
3507: sum1 -= *v1*tmp0;
3508: }
3509: }
3510: /* update */
3511: if (xb == b) {
3512: /* whole (old way) w/ diag */
3513: switch (sizes[i]) {
3514: case 5:
3515: x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3516: x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3517: x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3518: x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3519: x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3520: break;
3521: case 4:
3522: x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3523: x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3524: x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3525: x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3526: break;
3527: case 3:
3528: x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3529: x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3530: x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3531: break;
3532: case 2:
3533: x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3534: x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3535: break;
3536: case 1:
3537: x[row--] += sum1*(*ibdiag);
3538: break;
3539: }
3540: } else {
3541: /* no diag so set = */
3542: switch (sizes[i]) {
3543: case 5:
3544: x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3545: x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3546: x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3547: x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3548: x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3549: break;
3550: case 4:
3551: x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3552: x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3553: x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3554: x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3555: break;
3556: case 3:
3557: x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3558: x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3559: x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3560: break;
3561: case 2:
3562: x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3563: x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3564: break;
3565: case 1:
3566: x[row--] = sum1*(*ibdiag);
3567: break;
3568: }
3569: }
3570: }
3571: if (xb == b) {
3572: PetscLogFlops(2.0*a->nz);
3573: } else {
3574: PetscLogFlops(a->nz); /* assumes 1/2 in upper, undercounts diag inverse */
3575: }
3576: }
3577: }
3578: if (flag & SOR_EISENSTAT) {
3579: /*
3580: Apply (U + D)^-1 where D is now the block diagonal
3581: */
3582: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3583: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3584: ibdiag -= sizes[i]*sizes[i];
3585: sz = ii[row+1] - diag[row] - 1;
3586: v1 = a->a + diag[row] + 1;
3587: idx = a->j + diag[row] + 1;
3588: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3589: switch (sizes[i]) {
3590: case 1:
3592: sum1 = b[row];
3593: for (n = 0; n<sz-1; n+=2) {
3594: i1 = idx[0];
3595: i2 = idx[1];
3596: idx += 2;
3597: tmp0 = x[i1];
3598: tmp1 = x[i2];
3599: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3600: }
3602: if (n == sz-1) {
3603: tmp0 = x[*idx];
3604: sum1 -= *v1*tmp0;
3605: }
3606: x[row] = sum1*(*ibdiag);row--;
3607: break;
3609: case 2:
3611: sum1 = b[row];
3612: sum2 = b[row-1];
3613: /* note that sum1 is associated with the second of the two rows */
3614: v2 = a->a + diag[row-1] + 2;
3615: for (n = 0; n<sz-1; n+=2) {
3616: i1 = idx[0];
3617: i2 = idx[1];
3618: idx += 2;
3619: tmp0 = x[i1];
3620: tmp1 = x[i2];
3621: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3622: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3623: }
3625: if (n == sz-1) {
3626: tmp0 = x[*idx];
3627: sum1 -= *v1*tmp0;
3628: sum2 -= *v2*tmp0;
3629: }
3630: x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
3631: x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3632: row -= 2;
3633: break;
3634: case 3:
3636: sum1 = b[row];
3637: sum2 = b[row-1];
3638: sum3 = b[row-2];
3639: v2 = a->a + diag[row-1] + 2;
3640: v3 = a->a + diag[row-2] + 3;
3641: for (n = 0; n<sz-1; n+=2) {
3642: i1 = idx[0];
3643: i2 = idx[1];
3644: idx += 2;
3645: tmp0 = x[i1];
3646: tmp1 = x[i2];
3647: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3648: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3649: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3650: }
3652: if (n == sz-1) {
3653: tmp0 = x[*idx];
3654: sum1 -= *v1*tmp0;
3655: sum2 -= *v2*tmp0;
3656: sum3 -= *v3*tmp0;
3657: }
3658: x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3659: x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3660: x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3661: row -= 3;
3662: break;
3663: case 4:
3665: sum1 = b[row];
3666: sum2 = b[row-1];
3667: sum3 = b[row-2];
3668: sum4 = b[row-3];
3669: v2 = a->a + diag[row-1] + 2;
3670: v3 = a->a + diag[row-2] + 3;
3671: v4 = a->a + diag[row-3] + 4;
3672: for (n = 0; n<sz-1; n+=2) {
3673: i1 = idx[0];
3674: i2 = idx[1];
3675: idx += 2;
3676: tmp0 = x[i1];
3677: tmp1 = x[i2];
3678: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3679: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3680: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3681: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3682: }
3684: if (n == sz-1) {
3685: tmp0 = x[*idx];
3686: sum1 -= *v1*tmp0;
3687: sum2 -= *v2*tmp0;
3688: sum3 -= *v3*tmp0;
3689: sum4 -= *v4*tmp0;
3690: }
3691: x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3692: x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3693: x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3694: x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3695: row -= 4;
3696: break;
3697: case 5:
3699: sum1 = b[row];
3700: sum2 = b[row-1];
3701: sum3 = b[row-2];
3702: sum4 = b[row-3];
3703: sum5 = b[row-4];
3704: v2 = a->a + diag[row-1] + 2;
3705: v3 = a->a + diag[row-2] + 3;
3706: v4 = a->a + diag[row-3] + 4;
3707: v5 = a->a + diag[row-4] + 5;
3708: for (n = 0; n<sz-1; n+=2) {
3709: i1 = idx[0];
3710: i2 = idx[1];
3711: idx += 2;
3712: tmp0 = x[i1];
3713: tmp1 = x[i2];
3714: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3715: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3716: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3717: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3718: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3719: }
3721: if (n == sz-1) {
3722: tmp0 = x[*idx];
3723: sum1 -= *v1*tmp0;
3724: sum2 -= *v2*tmp0;
3725: sum3 -= *v3*tmp0;
3726: sum4 -= *v4*tmp0;
3727: sum5 -= *v5*tmp0;
3728: }
3729: x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3730: x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3731: x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3732: x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3733: x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3734: row -= 5;
3735: break;
3736: default:
3737: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3738: }
3739: }
3740: PetscLogFlops(a->nz);
3742: /*
3743: t = b - D x where D is the block diagonal
3744: */
3745: cnt = 0;
3746: for (i=0, row=0; i<m; i++) {
3747: switch (sizes[i]) {
3748: case 1:
3749: t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3750: break;
3751: case 2:
3752: x1 = x[row]; x2 = x[row+1];
3753: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3754: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3755: t[row] = b[row] - tmp1;
3756: t[row+1] = b[row+1] - tmp2; row += 2;
3757: cnt += 4;
3758: break;
3759: case 3:
3760: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2];
3761: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3762: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3763: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3764: t[row] = b[row] - tmp1;
3765: t[row+1] = b[row+1] - tmp2;
3766: t[row+2] = b[row+2] - tmp3; row += 3;
3767: cnt += 9;
3768: break;
3769: case 4:
3770: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3771: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3772: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3773: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3774: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3775: t[row] = b[row] - tmp1;
3776: t[row+1] = b[row+1] - tmp2;
3777: t[row+2] = b[row+2] - tmp3;
3778: t[row+3] = b[row+3] - tmp4; row += 4;
3779: cnt += 16;
3780: break;
3781: case 5:
3782: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3783: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3784: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3785: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3786: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3787: tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3788: t[row] = b[row] - tmp1;
3789: t[row+1] = b[row+1] - tmp2;
3790: t[row+2] = b[row+2] - tmp3;
3791: t[row+3] = b[row+3] - tmp4;
3792: t[row+4] = b[row+4] - tmp5;row += 5;
3793: cnt += 25;
3794: break;
3795: default:
3796: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3797: }
3798: }
3799: PetscLogFlops(m);
3803: /*
3804: Apply (L + D)^-1 where D is the block diagonal
3805: */
3806: for (i=0, row=0; i<m; i++) {
3807: sz = diag[row] - ii[row];
3808: v1 = a->a + ii[row];
3809: idx = a->j + ii[row];
3810: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3811: switch (sizes[i]) {
3812: case 1:
3814: sum1 = t[row];
3815: for (n = 0; n<sz-1; n+=2) {
3816: i1 = idx[0];
3817: i2 = idx[1];
3818: idx += 2;
3819: tmp0 = t[i1];
3820: tmp1 = t[i2];
3821: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3822: }
3824: if (n == sz-1) {
3825: tmp0 = t[*idx];
3826: sum1 -= *v1 * tmp0;
3827: }
3828: x[row] += t[row] = sum1*(*ibdiag++); row++;
3829: break;
3830: case 2:
3831: v2 = a->a + ii[row+1];
3832: sum1 = t[row];
3833: sum2 = t[row+1];
3834: for (n = 0; n<sz-1; n+=2) {
3835: i1 = idx[0];
3836: i2 = idx[1];
3837: idx += 2;
3838: tmp0 = t[i1];
3839: tmp1 = t[i2];
3840: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3841: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3842: }
3844: if (n == sz-1) {
3845: tmp0 = t[*idx];
3846: sum1 -= v1[0] * tmp0;
3847: sum2 -= v2[0] * tmp0;
3848: }
3849: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3850: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3851: ibdiag += 4; row += 2;
3852: break;
3853: case 3:
3854: v2 = a->a + ii[row+1];
3855: v3 = a->a + ii[row+2];
3856: sum1 = t[row];
3857: sum2 = t[row+1];
3858: sum3 = t[row+2];
3859: for (n = 0; n<sz-1; n+=2) {
3860: i1 = idx[0];
3861: i2 = idx[1];
3862: idx += 2;
3863: tmp0 = t[i1];
3864: tmp1 = t[i2];
3865: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3866: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3867: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3868: }
3870: if (n == sz-1) {
3871: tmp0 = t[*idx];
3872: sum1 -= v1[0] * tmp0;
3873: sum2 -= v2[0] * tmp0;
3874: sum3 -= v3[0] * tmp0;
3875: }
3876: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3877: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3878: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3879: ibdiag += 9; row += 3;
3880: break;
3881: case 4:
3882: v2 = a->a + ii[row+1];
3883: v3 = a->a + ii[row+2];
3884: v4 = a->a + ii[row+3];
3885: sum1 = t[row];
3886: sum2 = t[row+1];
3887: sum3 = t[row+2];
3888: sum4 = t[row+3];
3889: for (n = 0; n<sz-1; n+=2) {
3890: i1 = idx[0];
3891: i2 = idx[1];
3892: idx += 2;
3893: tmp0 = t[i1];
3894: tmp1 = t[i2];
3895: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3896: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3897: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3898: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3899: }
3901: if (n == sz-1) {
3902: tmp0 = t[*idx];
3903: sum1 -= v1[0] * tmp0;
3904: sum2 -= v2[0] * tmp0;
3905: sum3 -= v3[0] * tmp0;
3906: sum4 -= v4[0] * tmp0;
3907: }
3908: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3909: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3910: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3911: x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3912: ibdiag += 16; row += 4;
3913: break;
3914: case 5:
3915: v2 = a->a + ii[row+1];
3916: v3 = a->a + ii[row+2];
3917: v4 = a->a + ii[row+3];
3918: v5 = a->a + ii[row+4];
3919: sum1 = t[row];
3920: sum2 = t[row+1];
3921: sum3 = t[row+2];
3922: sum4 = t[row+3];
3923: sum5 = t[row+4];
3924: for (n = 0; n<sz-1; n+=2) {
3925: i1 = idx[0];
3926: i2 = idx[1];
3927: idx += 2;
3928: tmp0 = t[i1];
3929: tmp1 = t[i2];
3930: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3931: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3932: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3933: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3934: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3935: }
3937: if (n == sz-1) {
3938: tmp0 = t[*idx];
3939: sum1 -= v1[0] * tmp0;
3940: sum2 -= v2[0] * tmp0;
3941: sum3 -= v3[0] * tmp0;
3942: sum4 -= v4[0] * tmp0;
3943: sum5 -= v5[0] * tmp0;
3944: }
3945: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3946: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3947: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3948: x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3949: x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3950: ibdiag += 25; row += 5;
3951: break;
3952: default:
3953: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3954: }
3955: }
3956: PetscLogFlops(a->nz);
3957: }
3958: VecRestoreArray(xx,&x);
3959: VecRestoreArrayRead(bb,&b);
3960: return(0);
3961: }
3963: PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3964: {
3965: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3966: PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3967: const MatScalar *bdiag = a->inode.bdiag;
3968: const PetscScalar *b;
3969: PetscErrorCode ierr;
3970: PetscInt m = a->inode.node_count,cnt = 0,i,row;
3971: const PetscInt *sizes = a->inode.size;
3974: VecGetArray(xx,&x);
3975: VecGetArrayRead(bb,&b);
3976: cnt = 0;
3977: for (i=0, row=0; i<m; i++) {
3978: switch (sizes[i]) {
3979: case 1:
3980: x[row] = b[row]*bdiag[cnt++];row++;
3981: break;
3982: case 2:
3983: x1 = b[row]; x2 = b[row+1];
3984: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3985: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3986: x[row++] = tmp1;
3987: x[row++] = tmp2;
3988: cnt += 4;
3989: break;
3990: case 3:
3991: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2];
3992: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3993: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3994: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3995: x[row++] = tmp1;
3996: x[row++] = tmp2;
3997: x[row++] = tmp3;
3998: cnt += 9;
3999: break;
4000: case 4:
4001: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
4002: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
4003: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
4004: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
4005: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
4006: x[row++] = tmp1;
4007: x[row++] = tmp2;
4008: x[row++] = tmp3;
4009: x[row++] = tmp4;
4010: cnt += 16;
4011: break;
4012: case 5:
4013: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
4014: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
4015: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
4016: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
4017: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4018: tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4019: x[row++] = tmp1;
4020: x[row++] = tmp2;
4021: x[row++] = tmp3;
4022: x[row++] = tmp4;
4023: x[row++] = tmp5;
4024: cnt += 25;
4025: break;
4026: default:
4027: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
4028: }
4029: }
4030: PetscLogFlops(2.0*cnt);
4031: VecRestoreArray(xx,&x);
4032: VecRestoreArrayRead(bb,&b);
4033: return(0);
4034: }
4036: /*
4037: samestructure indicates that the matrix has not changed its nonzero structure so we
4038: do not need to recompute the inodes
4039: */
4040: PetscErrorCode MatSeqAIJCheckInode(Mat A)
4041: {
4042: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4044: PetscInt i,j,m,nzx,nzy,*ns,node_count,blk_size;
4045: PetscBool flag;
4046: const PetscInt *idx,*idy,*ii;
4049: if (!a->inode.use) return(0);
4050: if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) return(0);
4052: m = A->rmap->n;
4053: if (a->inode.size) ns = a->inode.size;
4054: else {
4055: PetscMalloc1(m+1,&ns);
4056: }
4058: i = 0;
4059: node_count = 0;
4060: idx = a->j;
4061: ii = a->i;
4062: while (i < m) { /* For each row */
4063: nzx = ii[i+1] - ii[i]; /* Number of nonzeros */
4064: /* Limits the number of elements in a node to 'a->inode.limit' */
4065: for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4066: nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */
4067: if (nzy != nzx) break;
4068: idy += nzx; /* Same nonzero pattern */
4069: PetscArraycmp(idx,idy,nzx,&flag);
4070: if (!flag) break;
4071: }
4072: ns[node_count++] = blk_size;
4073: idx += blk_size*nzx;
4074: i = j;
4075: }
4077: {
4078: PetscBool is_cudatype;
4079: PetscObjectTypeCompareAny((PetscObject)A,&is_cudatype, MATAIJCUSPARSE, MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MATAIJVIENNACL, MATSEQAIJVIENNACL, MATMPIAIJVIENNACL,"");
4080: if (is_cudatype) {
4081: PetscInfo(A,"Not using Inode routines on GPU matrix\n");
4082: PetscFree(ns);
4083: a->inode.node_count = 0;
4084: a->inode.size = NULL;
4085: a->inode.use = PETSC_FALSE;
4086: a->inode.checked = PETSC_TRUE;
4087: a->inode.mat_nonzerostate = A->nonzerostate;
4088: return(0);
4089: }
4090: }
4092: /* If not enough inodes found,, do not use inode version of the routines */
4093: if (!m || node_count > .8*m) {
4094: PetscFree(ns);
4096: a->inode.node_count = 0;
4097: a->inode.size = NULL;
4098: a->inode.use = PETSC_FALSE;
4099: A->ops->mult = MatMult_SeqAIJ;
4100: A->ops->sor = MatSOR_SeqAIJ;
4101: A->ops->multadd = MatMultAdd_SeqAIJ;
4102: A->ops->getrowij = MatGetRowIJ_SeqAIJ;
4103: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ;
4104: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ;
4105: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ;
4106: A->ops->coloringpatch = 0;
4107: A->ops->multdiagonalblock = 0;
4109: PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4110: } else {
4111: if (!A->factortype) {
4112: A->ops->mult = MatMult_SeqAIJ_Inode;
4113: A->ops->sor = MatSOR_SeqAIJ_Inode;
4114: A->ops->multadd = MatMultAdd_SeqAIJ_Inode;
4115: A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4116: if (A->rmap->n == A->cmap->n) {
4117: A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4118: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4119: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4120: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4121: A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4122: }
4123: } else {
4124: A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4125: }
4126: a->inode.node_count = node_count;
4127: a->inode.size = ns;
4128: PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4129: }
4130: a->inode.checked = PETSC_TRUE;
4131: a->inode.mat_nonzerostate = A->nonzerostate;
4132: return(0);
4133: }
4135: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4136: {
4137: Mat B =*C;
4138: Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4140: PetscInt m=A->rmap->n;
4143: c->inode.use = a->inode.use;
4144: c->inode.limit = a->inode.limit;
4145: c->inode.max_limit = a->inode.max_limit;
4146: if (a->inode.size) {
4147: PetscMalloc1(m+1,&c->inode.size);
4148: c->inode.node_count = a->inode.node_count;
4149: PetscArraycpy(c->inode.size,a->inode.size,m+1);
4150: /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4151: if (!B->factortype) {
4152: B->ops->mult = MatMult_SeqAIJ_Inode;
4153: B->ops->sor = MatSOR_SeqAIJ_Inode;
4154: B->ops->multadd = MatMultAdd_SeqAIJ_Inode;
4155: B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4156: B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4157: B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4158: B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4159: B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4160: B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4161: } else {
4162: B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4163: }
4164: } else {
4165: c->inode.size = 0;
4166: c->inode.node_count = 0;
4167: }
4168: c->inode.ibdiagvalid = PETSC_FALSE;
4169: c->inode.ibdiag = 0;
4170: c->inode.bdiag = 0;
4171: return(0);
4172: }
4174: PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4175: {
4176: PetscInt k;
4177: const PetscInt *vi;
4180: vi = aj + ai[row];
4181: for (k=0; k<nzl; k++) cols[k] = vi[k];
4182: vi = aj + adiag[row];
4183: cols[nzl] = vi[0];
4184: vi = aj + adiag[row+1]+1;
4185: for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4186: return(0);
4187: }
4188: /*
4189: MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4190: Modified from MatSeqAIJCheckInode().
4192: Input Parameters:
4193: . Mat A - ILU or LU matrix factor
4195: */
4196: PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4197: {
4198: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4200: PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4201: PetscInt *cols1,*cols2,*ns;
4202: const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4203: PetscBool flag;
4206: if (!a->inode.use) return(0);
4207: if (a->inode.checked) return(0);
4209: m = A->rmap->n;
4210: if (a->inode.size) ns = a->inode.size;
4211: else {
4212: PetscMalloc1(m+1,&ns);
4213: }
4215: i = 0;
4216: node_count = 0;
4217: PetscMalloc2(m,&cols1,m,&cols2);
4218: while (i < m) { /* For each row */
4219: nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */
4220: nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4221: nzx = nzl1 + nzu1 + 1;
4222: MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4224: /* Limits the number of elements in a node to 'a->inode.limit' */
4225: for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4226: nzl2 = ai[j+1] - ai[j];
4227: nzu2 = adiag[j] - adiag[j+1] - 1;
4228: nzy = nzl2 + nzu2 + 1;
4229: if (nzy != nzx) break;
4230: MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
4231: PetscArraycmp(cols1,cols2,nzx,&flag);
4232: if (!flag) break;
4233: }
4234: ns[node_count++] = blk_size;
4235: i = j;
4236: }
4237: PetscFree2(cols1,cols2);
4238: /* If not enough inodes found,, do not use inode version of the routines */
4239: if (!m || node_count > .8*m) {
4240: PetscFree(ns);
4242: a->inode.node_count = 0;
4243: a->inode.size = NULL;
4244: a->inode.use = PETSC_FALSE;
4246: PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4247: } else {
4248: A->ops->mult = 0;
4249: A->ops->sor = 0;
4250: A->ops->multadd = 0;
4251: A->ops->getrowij = 0;
4252: A->ops->restorerowij = 0;
4253: A->ops->getcolumnij = 0;
4254: A->ops->restorecolumnij = 0;
4255: A->ops->coloringpatch = 0;
4256: A->ops->multdiagonalblock = 0;
4257: a->inode.node_count = node_count;
4258: a->inode.size = ns;
4260: PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4261: }
4262: a->inode.checked = PETSC_TRUE;
4263: return(0);
4264: }
4266: PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4267: {
4268: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4271: a->inode.ibdiagvalid = PETSC_FALSE;
4272: return(0);
4273: }
4275: /*
4276: This is really ugly. if inodes are used this replaces the
4277: permutations with ones that correspond to rows/cols of the matrix
4278: rather then inode blocks
4279: */
4280: PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4281: {
4285: PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));
4286: return(0);
4287: }
4289: PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4290: {
4291: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4293: PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4294: const PetscInt *ridx,*cidx;
4295: PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx;
4296: PetscInt nslim_col,*ns_col;
4297: IS ris = *rperm,cis = *cperm;
4300: if (!a->inode.size) return(0); /* no inodes so return */
4301: if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */
4303: MatCreateColInode_Private(A,&nslim_col,&ns_col);
4304: PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);
4305: PetscMalloc2(m,&permr,n,&permc);
4307: ISGetIndices(ris,&ridx);
4308: ISGetIndices(cis,&cidx);
4310: /* Form the inode structure for the rows of permuted matric using inv perm*/
4311: for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4313: /* Construct the permutations for rows*/
4314: for (i=0,row = 0; i<nslim_row; ++i) {
4315: indx = ridx[i];
4316: start_val = tns[indx];
4317: end_val = tns[indx + 1];
4318: for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4319: }
4321: /* Form the inode structure for the columns of permuted matrix using inv perm*/
4322: for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4324: /* Construct permutations for columns */
4325: for (i=0,col=0; i<nslim_col; ++i) {
4326: indx = cidx[i];
4327: start_val = tns[indx];
4328: end_val = tns[indx + 1];
4329: for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4330: }
4332: ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);
4333: ISSetPermutation(*rperm);
4334: ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);
4335: ISSetPermutation(*cperm);
4337: ISRestoreIndices(ris,&ridx);
4338: ISRestoreIndices(cis,&cidx);
4340: PetscFree(ns_col);
4341: PetscFree2(permr,permc);
4342: ISDestroy(&cis);
4343: ISDestroy(&ris);
4344: PetscFree(tns);
4345: return(0);
4346: }
4348: /*@C
4349: MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4351: Not Collective
4353: Input Parameter:
4354: . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4356: Output Parameter:
4357: + node_count - no of inodes present in the matrix.
4358: . sizes - an array of size node_count,with sizes of each inode.
4359: - limit - the max size used to generate the inodes.
4361: Level: advanced
4363: Notes:
4364: This routine returns some internal storage information
4365: of the matrix, it is intended to be used by advanced users.
4366: It should be called after the matrix is assembled.
4367: The contents of the sizes[] array should not be changed.
4368: NULL may be passed for information not requested.
4370: .seealso: MatGetInfo()
4371: @*/
4372: PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4373: {
4374: PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4377: if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4378: PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);
4379: if (f) {
4380: (*f)(A,node_count,sizes,limit);
4381: }
4382: return(0);
4383: }
4385: PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4386: {
4387: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4390: if (node_count) *node_count = a->inode.node_count;
4391: if (sizes) *sizes = a->inode.size;
4392: if (limit) *limit = a->inode.limit;
4393: return(0);
4394: }