Actual source code: inode.c
petsc-3.8.4 2018-03-24
2: /*
3: This file provides high performance routines for the Inode format (compressed sparse row)
4: by taking advantage of rows with identical nonzero structure (I-nodes).
5: */
6: #include <../src/mat/impls/aij/seq/aij.h>
8: static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns)
9: {
10: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
12: PetscInt i,count,m,n,min_mn,*ns_row,*ns_col;
15: n = A->cmap->n;
16: m = A->rmap->n;
17: ns_row = a->inode.size;
19: min_mn = (m < n) ? m : n;
20: if (!ns) {
21: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
22: for (; count+1 < n; count++,i++) ;
23: if (count < n) {
24: i++;
25: }
26: *size = i;
27: return(0);
28: }
29: PetscMalloc1(n+1,&ns_col);
31: /* Use the same row structure wherever feasible. */
32: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
33: ns_col[i] = ns_row[i];
34: }
36: /* if m < n; pad up the remainder with inode_limit */
37: for (; count+1 < n; count++,i++) {
38: ns_col[i] = 1;
39: }
40: /* The last node is the odd ball. padd it up with the remaining rows; */
41: if (count < n) {
42: ns_col[i] = n - count;
43: i++;
44: } else if (count > n) {
45: /* Adjust for the over estimation */
46: ns_col[i-1] += n - count;
47: }
48: *size = i;
49: *ns = ns_col;
50: return(0);
51: }
54: /*
55: This builds symmetric version of nonzero structure,
56: */
57: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
58: {
59: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
61: PetscInt *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
62: PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
63: const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;
66: nslim_row = a->inode.node_count;
67: m = A->rmap->n;
68: n = A->cmap->n;
69: if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
71: /* Use the row_inode as column_inode */
72: nslim_col = nslim_row;
73: ns_col = ns_row;
75: /* allocate space for reformated inode structure */
76: PetscMalloc1(nslim_col+1,&tns);
77: PetscMalloc1(n+1,&tvc);
78: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
80: for (i1=0,col=0; i1<nslim_col; ++i1) {
81: nsz = ns_col[i1];
82: for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
83: }
84: /* allocate space for row pointers */
85: PetscMalloc1(nslim_row+1,&ia);
86: *iia = ia;
87: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
88: PetscMalloc1(nslim_row+1,&work);
90: /* determine the number of columns in each row */
91: ia[0] = oshift;
92: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
94: j = aj + ai[row] + ishift;
95: jmax = aj + ai[row+1] + ishift;
96: if (j==jmax) continue; /* empty row */
97: col = *j++ + ishift;
98: i2 = tvc[col];
99: while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
100: ia[i1+1]++;
101: ia[i2+1]++;
102: i2++; /* Start col of next node */
103: while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
104: i2 = tvc[col];
105: }
106: if (i2 == i1) ia[i2+1]++; /* now the diagonal element */
107: }
109: /* shift ia[i] to point to next row */
110: for (i1=1; i1<nslim_row+1; i1++) {
111: row = ia[i1-1];
112: ia[i1] += row;
113: work[i1-1] = row - oshift;
114: }
116: /* allocate space for column pointers */
117: nz = ia[nslim_row] + (!ishift);
118: PetscMalloc1(nz,&ja);
119: *jja = ja;
121: /* loop over lower triangular part putting into ja */
122: for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
123: j = aj + ai[row] + ishift;
124: jmax = aj + ai[row+1] + ishift;
125: if (j==jmax) continue; /* empty row */
126: col = *j++ + ishift;
127: i2 = tvc[col];
128: while (i2<i1 && j<jmax) {
129: ja[work[i2]++] = i1 + oshift;
130: ja[work[i1]++] = i2 + oshift;
131: ++i2;
132: while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
133: i2 = tvc[col];
134: }
135: if (i2 == i1) ja[work[i1]++] = i2 + oshift;
137: }
138: PetscFree(work);
139: PetscFree(tns);
140: PetscFree(tvc);
141: return(0);
142: }
144: /*
145: This builds nonsymmetric version of nonzero structure,
146: */
147: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
148: {
149: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
151: PetscInt *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
152: PetscInt *tns,*tvc,nsz,i1,i2;
153: const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;
156: nslim_row = a->inode.node_count;
157: n = A->cmap->n;
159: /* Create The column_inode for this matrix */
160: Mat_CreateColInode(A,&nslim_col,&ns_col);
162: /* allocate space for reformated column_inode structure */
163: PetscMalloc1(nslim_col +1,&tns);
164: PetscMalloc1(n + 1,&tvc);
165: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
167: for (i1=0,col=0; i1<nslim_col; ++i1) {
168: nsz = ns_col[i1];
169: for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
170: }
171: /* allocate space for row pointers */
172: PetscMalloc1(nslim_row+1,&ia);
173: *iia = ia;
174: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
175: PetscMalloc1(nslim_row+1,&work);
177: /* determine the number of columns in each row */
178: ia[0] = oshift;
179: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
180: j = aj + ai[row] + ishift;
181: nz = ai[row+1] - ai[row];
182: if (!nz) continue; /* empty row */
183: col = *j++ + ishift;
184: i2 = tvc[col];
185: while (nz-- > 0) { /* off-diagonal elemets */
186: ia[i1+1]++;
187: i2++; /* Start col of next node */
188: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
189: if (nz > 0) i2 = tvc[col];
190: }
191: }
193: /* shift ia[i] to point to next row */
194: for (i1=1; i1<nslim_row+1; i1++) {
195: row = ia[i1-1];
196: ia[i1] += row;
197: work[i1-1] = row - oshift;
198: }
200: /* allocate space for column pointers */
201: nz = ia[nslim_row] + (!ishift);
202: PetscMalloc1(nz,&ja);
203: *jja = ja;
205: /* loop over matrix putting into ja */
206: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
207: j = aj + ai[row] + ishift;
208: nz = ai[row+1] - ai[row];
209: if (!nz) continue; /* empty row */
210: col = *j++ + ishift;
211: i2 = tvc[col];
212: while (nz-- > 0) {
213: ja[work[i1]++] = i2 + oshift;
214: ++i2;
215: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
216: if (nz > 0) i2 = tvc[col];
217: }
218: }
219: PetscFree(ns_col);
220: PetscFree(work);
221: PetscFree(tns);
222: PetscFree(tvc);
223: return(0);
224: }
226: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
227: {
228: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
232: *n = a->inode.node_count;
233: if (!ia) return(0);
234: if (!blockcompressed) {
235: MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
236: } else if (symmetric) {
237: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
238: } else {
239: MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
240: }
241: return(0);
242: }
244: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
245: {
249: if (!ia) return(0);
251: if (!blockcompressed) {
252: MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
253: } else {
254: PetscFree(*ia);
255: PetscFree(*ja);
256: }
257: return(0);
258: }
260: /* ----------------------------------------------------------- */
262: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
263: {
264: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
266: PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
267: PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
270: nslim_row = a->inode.node_count;
271: n = A->cmap->n;
273: /* Create The column_inode for this matrix */
274: Mat_CreateColInode(A,&nslim_col,&ns_col);
276: /* allocate space for reformated column_inode structure */
277: PetscMalloc1(nslim_col + 1,&tns);
278: PetscMalloc1(n + 1,&tvc);
279: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
281: for (i1=0,col=0; i1<nslim_col; ++i1) {
282: nsz = ns_col[i1];
283: for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
284: }
285: /* allocate space for column pointers */
286: PetscMalloc1(nslim_col+1,&ia);
287: *iia = ia;
288: PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));
289: PetscMalloc1(nslim_col+1,&work);
291: /* determine the number of columns in each row */
292: ia[0] = oshift;
293: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
294: j = aj + ai[row] + ishift;
295: col = *j++ + ishift;
296: i2 = tvc[col];
297: nz = ai[row+1] - ai[row];
298: while (nz-- > 0) { /* off-diagonal elemets */
299: /* ia[i1+1]++; */
300: ia[i2+1]++;
301: i2++;
302: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
303: if (nz > 0) i2 = tvc[col];
304: }
305: }
307: /* shift ia[i] to point to next col */
308: for (i1=1; i1<nslim_col+1; i1++) {
309: col = ia[i1-1];
310: ia[i1] += col;
311: work[i1-1] = col - oshift;
312: }
314: /* allocate space for column pointers */
315: nz = ia[nslim_col] + (!ishift);
316: PetscMalloc1(nz,&ja);
317: *jja = ja;
319: /* loop over matrix putting into ja */
320: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
321: j = aj + ai[row] + ishift;
322: col = *j++ + ishift;
323: i2 = tvc[col];
324: nz = ai[row+1] - ai[row];
325: while (nz-- > 0) {
326: /* ja[work[i1]++] = i2 + oshift; */
327: ja[work[i2]++] = i1 + oshift;
328: i2++;
329: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
330: if (nz > 0) i2 = tvc[col];
331: }
332: }
333: PetscFree(ns_col);
334: PetscFree(work);
335: PetscFree(tns);
336: PetscFree(tvc);
337: return(0);
338: }
340: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
341: {
345: Mat_CreateColInode(A,n,NULL);
346: if (!ia) return(0);
348: if (!blockcompressed) {
349: MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
350: } else if (symmetric) {
351: /* Since the indices are symmetric it does'nt matter */
352: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
353: } else {
354: MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
355: }
356: return(0);
357: }
359: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
360: {
364: if (!ia) return(0);
365: if (!blockcompressed) {
366: MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
367: } else {
368: PetscFree(*ia);
369: PetscFree(*ja);
370: }
371: return(0);
372: }
374: /* ----------------------------------------------------------- */
376: static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
377: {
378: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
379: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
380: PetscScalar *y;
381: const PetscScalar *x;
382: const MatScalar *v1,*v2,*v3,*v4,*v5;
383: PetscErrorCode ierr;
384: PetscInt i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
385: const PetscInt *idx,*ns,*ii;
387: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
388: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
389: #endif
392: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
393: node_max = a->inode.node_count;
394: ns = a->inode.size; /* Node Size array */
395: VecGetArrayRead(xx,&x);
396: VecGetArray(yy,&y);
397: idx = a->j;
398: v1 = a->a;
399: ii = a->i;
401: for (i = 0,row = 0; i< node_max; ++i) {
402: nsz = ns[i];
403: n = ii[1] - ii[0];
404: nonzerorow += (n>0)*nsz;
405: ii += nsz;
406: PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */
407: PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */
408: sz = n; /* No of non zeros in this row */
409: /* Switch on the size of Node */
410: switch (nsz) { /* Each loop in 'case' is unrolled */
411: case 1:
412: sum1 = 0.;
414: for (n = 0; n< sz-1; n+=2) {
415: i1 = idx[0]; /* The instructions are ordered to */
416: i2 = idx[1]; /* make the compiler's job easy */
417: idx += 2;
418: tmp0 = x[i1];
419: tmp1 = x[i2];
420: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
421: }
423: if (n == sz-1) { /* Take care of the last nonzero */
424: tmp0 = x[*idx++];
425: sum1 += *v1++ *tmp0;
426: }
427: y[row++]=sum1;
428: break;
429: case 2:
430: sum1 = 0.;
431: sum2 = 0.;
432: v2 = v1 + n;
434: for (n = 0; n< sz-1; n+=2) {
435: i1 = idx[0];
436: i2 = idx[1];
437: idx += 2;
438: tmp0 = x[i1];
439: tmp1 = x[i2];
440: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
441: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
442: }
443: if (n == sz-1) {
444: tmp0 = x[*idx++];
445: sum1 += *v1++ * tmp0;
446: sum2 += *v2++ * tmp0;
447: }
448: y[row++]=sum1;
449: y[row++]=sum2;
450: v1 =v2; /* Since the next block to be processed starts there*/
451: idx +=sz;
452: break;
453: case 3:
454: sum1 = 0.;
455: sum2 = 0.;
456: sum3 = 0.;
457: v2 = v1 + n;
458: v3 = v2 + n;
460: for (n = 0; n< sz-1; n+=2) {
461: i1 = idx[0];
462: i2 = idx[1];
463: idx += 2;
464: tmp0 = x[i1];
465: tmp1 = x[i2];
466: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
467: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
468: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
469: }
470: if (n == sz-1) {
471: tmp0 = x[*idx++];
472: sum1 += *v1++ * tmp0;
473: sum2 += *v2++ * tmp0;
474: sum3 += *v3++ * tmp0;
475: }
476: y[row++]=sum1;
477: y[row++]=sum2;
478: y[row++]=sum3;
479: v1 =v3; /* Since the next block to be processed starts there*/
480: idx +=2*sz;
481: break;
482: case 4:
483: sum1 = 0.;
484: sum2 = 0.;
485: sum3 = 0.;
486: sum4 = 0.;
487: v2 = v1 + n;
488: v3 = v2 + n;
489: v4 = v3 + n;
491: for (n = 0; n< sz-1; n+=2) {
492: i1 = idx[0];
493: i2 = idx[1];
494: idx += 2;
495: tmp0 = x[i1];
496: tmp1 = x[i2];
497: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
498: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
499: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
500: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
501: }
502: if (n == sz-1) {
503: tmp0 = x[*idx++];
504: sum1 += *v1++ * tmp0;
505: sum2 += *v2++ * tmp0;
506: sum3 += *v3++ * tmp0;
507: sum4 += *v4++ * tmp0;
508: }
509: y[row++]=sum1;
510: y[row++]=sum2;
511: y[row++]=sum3;
512: y[row++]=sum4;
513: v1 =v4; /* Since the next block to be processed starts there*/
514: idx +=3*sz;
515: break;
516: case 5:
517: sum1 = 0.;
518: sum2 = 0.;
519: sum3 = 0.;
520: sum4 = 0.;
521: sum5 = 0.;
522: v2 = v1 + n;
523: v3 = v2 + n;
524: v4 = v3 + n;
525: v5 = v4 + n;
527: for (n = 0; n<sz-1; n+=2) {
528: i1 = idx[0];
529: i2 = idx[1];
530: idx += 2;
531: tmp0 = x[i1];
532: tmp1 = x[i2];
533: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
534: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
535: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
536: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
537: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
538: }
539: if (n == sz-1) {
540: tmp0 = x[*idx++];
541: sum1 += *v1++ * tmp0;
542: sum2 += *v2++ * tmp0;
543: sum3 += *v3++ * tmp0;
544: sum4 += *v4++ * tmp0;
545: sum5 += *v5++ * tmp0;
546: }
547: y[row++]=sum1;
548: y[row++]=sum2;
549: y[row++]=sum3;
550: y[row++]=sum4;
551: y[row++]=sum5;
552: v1 =v5; /* Since the next block to be processed starts there */
553: idx +=4*sz;
554: break;
555: default:
556: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
557: }
558: }
559: VecRestoreArrayRead(xx,&x);
560: VecRestoreArray(yy,&y);
561: PetscLogFlops(2.0*a->nz - nonzerorow);
562: return(0);
563: }
564: /* ----------------------------------------------------------- */
565: /* Almost same code as the MatMult_SeqAIJ_Inode() */
566: static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
567: {
568: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
569: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
570: const MatScalar *v1,*v2,*v3,*v4,*v5;
571: const PetscScalar *x;
572: PetscScalar *y,*z,*zt;
573: PetscErrorCode ierr;
574: PetscInt i1,i2,n,i,row,node_max,nsz,sz;
575: const PetscInt *idx,*ns,*ii;
578: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
579: node_max = a->inode.node_count;
580: ns = a->inode.size; /* Node Size array */
582: VecGetArrayRead(xx,&x);
583: VecGetArrayPair(zz,yy,&z,&y);
584: zt = z;
586: idx = a->j;
587: v1 = a->a;
588: ii = a->i;
590: for (i = 0,row = 0; i< node_max; ++i) {
591: nsz = ns[i];
592: n = ii[1] - ii[0];
593: ii += nsz;
594: sz = n; /* No of non zeros in this row */
595: /* Switch on the size of Node */
596: switch (nsz) { /* Each loop in 'case' is unrolled */
597: case 1:
598: sum1 = *zt++;
600: for (n = 0; n< sz-1; n+=2) {
601: i1 = idx[0]; /* The instructions are ordered to */
602: i2 = idx[1]; /* make the compiler's job easy */
603: idx += 2;
604: tmp0 = x[i1];
605: tmp1 = x[i2];
606: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
607: }
609: if (n == sz-1) { /* Take care of the last nonzero */
610: tmp0 = x[*idx++];
611: sum1 += *v1++ * tmp0;
612: }
613: y[row++]=sum1;
614: break;
615: case 2:
616: sum1 = *zt++;
617: sum2 = *zt++;
618: v2 = v1 + n;
620: for (n = 0; n< sz-1; n+=2) {
621: i1 = idx[0];
622: i2 = idx[1];
623: idx += 2;
624: tmp0 = x[i1];
625: tmp1 = x[i2];
626: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
627: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
628: }
629: if (n == sz-1) {
630: tmp0 = x[*idx++];
631: sum1 += *v1++ * tmp0;
632: sum2 += *v2++ * tmp0;
633: }
634: y[row++]=sum1;
635: y[row++]=sum2;
636: v1 =v2; /* Since the next block to be processed starts there*/
637: idx +=sz;
638: break;
639: case 3:
640: sum1 = *zt++;
641: sum2 = *zt++;
642: sum3 = *zt++;
643: v2 = v1 + n;
644: v3 = v2 + n;
646: for (n = 0; n< sz-1; n+=2) {
647: i1 = idx[0];
648: i2 = idx[1];
649: idx += 2;
650: tmp0 = x[i1];
651: tmp1 = x[i2];
652: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
653: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
654: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
655: }
656: if (n == sz-1) {
657: tmp0 = x[*idx++];
658: sum1 += *v1++ * tmp0;
659: sum2 += *v2++ * tmp0;
660: sum3 += *v3++ * tmp0;
661: }
662: y[row++]=sum1;
663: y[row++]=sum2;
664: y[row++]=sum3;
665: v1 =v3; /* Since the next block to be processed starts there*/
666: idx +=2*sz;
667: break;
668: case 4:
669: sum1 = *zt++;
670: sum2 = *zt++;
671: sum3 = *zt++;
672: sum4 = *zt++;
673: v2 = v1 + n;
674: v3 = v2 + n;
675: v4 = v3 + n;
677: for (n = 0; n< sz-1; n+=2) {
678: i1 = idx[0];
679: i2 = idx[1];
680: idx += 2;
681: tmp0 = x[i1];
682: tmp1 = x[i2];
683: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
684: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
685: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
686: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
687: }
688: if (n == sz-1) {
689: tmp0 = x[*idx++];
690: sum1 += *v1++ * tmp0;
691: sum2 += *v2++ * tmp0;
692: sum3 += *v3++ * tmp0;
693: sum4 += *v4++ * tmp0;
694: }
695: y[row++]=sum1;
696: y[row++]=sum2;
697: y[row++]=sum3;
698: y[row++]=sum4;
699: v1 =v4; /* Since the next block to be processed starts there*/
700: idx +=3*sz;
701: break;
702: case 5:
703: sum1 = *zt++;
704: sum2 = *zt++;
705: sum3 = *zt++;
706: sum4 = *zt++;
707: sum5 = *zt++;
708: v2 = v1 + n;
709: v3 = v2 + n;
710: v4 = v3 + n;
711: v5 = v4 + n;
713: for (n = 0; n<sz-1; n+=2) {
714: i1 = idx[0];
715: i2 = idx[1];
716: idx += 2;
717: tmp0 = x[i1];
718: tmp1 = x[i2];
719: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
720: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
721: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
722: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
723: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
724: }
725: if (n == sz-1) {
726: tmp0 = x[*idx++];
727: sum1 += *v1++ * tmp0;
728: sum2 += *v2++ * tmp0;
729: sum3 += *v3++ * tmp0;
730: sum4 += *v4++ * tmp0;
731: sum5 += *v5++ * tmp0;
732: }
733: y[row++]=sum1;
734: y[row++]=sum2;
735: y[row++]=sum3;
736: y[row++]=sum4;
737: y[row++]=sum5;
738: v1 =v5; /* Since the next block to be processed starts there */
739: idx +=4*sz;
740: break;
741: default:
742: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
743: }
744: }
745: VecRestoreArrayRead(xx,&x);
746: VecRestoreArrayPair(zz,yy,&z,&y);
747: PetscLogFlops(2.0*a->nz);
748: return(0);
749: }
751: /* ----------------------------------------------------------- */
752: PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
753: {
754: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
755: IS iscol = a->col,isrow = a->row;
756: PetscErrorCode ierr;
757: const PetscInt *r,*c,*rout,*cout;
758: PetscInt i,j,n = A->rmap->n,nz;
759: PetscInt node_max,*ns,row,nsz,aii,i0,i1;
760: const PetscInt *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
761: PetscScalar *x,*tmp,*tmps,tmp0,tmp1;
762: PetscScalar sum1,sum2,sum3,sum4,sum5;
763: const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
764: const PetscScalar *b;
767: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
768: node_max = a->inode.node_count;
769: ns = a->inode.size; /* Node Size array */
771: VecGetArrayRead(bb,&b);
772: VecGetArray(xx,&x);
773: tmp = a->solve_work;
775: ISGetIndices(isrow,&rout); r = rout;
776: ISGetIndices(iscol,&cout); c = cout + (n-1);
778: /* forward solve the lower triangular */
779: tmps = tmp;
780: aa = a_a;
781: aj = a_j;
782: ad = a->diag;
784: for (i = 0,row = 0; i< node_max; ++i) {
785: nsz = ns[i];
786: aii = ai[row];
787: v1 = aa + aii;
788: vi = aj + aii;
789: nz = ad[row]- aii;
790: if (i < node_max-1) {
791: /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
792: * but our indexing to determine it's size could. */
793: PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
794: /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
795: PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
796: /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
797: }
799: switch (nsz) { /* Each loop in 'case' is unrolled */
800: case 1:
801: sum1 = b[*r++];
802: for (j=0; j<nz-1; j+=2) {
803: i0 = vi[0];
804: i1 = vi[1];
805: vi +=2;
806: tmp0 = tmps[i0];
807: tmp1 = tmps[i1];
808: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
809: }
810: if (j == nz-1) {
811: tmp0 = tmps[*vi++];
812: sum1 -= *v1++ *tmp0;
813: }
814: tmp[row++]=sum1;
815: break;
816: case 2:
817: sum1 = b[*r++];
818: sum2 = b[*r++];
819: v2 = aa + ai[row+1];
821: for (j=0; j<nz-1; j+=2) {
822: i0 = vi[0];
823: i1 = vi[1];
824: vi +=2;
825: tmp0 = tmps[i0];
826: tmp1 = tmps[i1];
827: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
828: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
829: }
830: if (j == nz-1) {
831: tmp0 = tmps[*vi++];
832: sum1 -= *v1++ *tmp0;
833: sum2 -= *v2++ *tmp0;
834: }
835: sum2 -= *v2++ *sum1;
836: tmp[row++]=sum1;
837: tmp[row++]=sum2;
838: break;
839: case 3:
840: sum1 = b[*r++];
841: sum2 = b[*r++];
842: sum3 = b[*r++];
843: v2 = aa + ai[row+1];
844: v3 = aa + ai[row+2];
846: for (j=0; j<nz-1; j+=2) {
847: i0 = vi[0];
848: i1 = vi[1];
849: vi +=2;
850: tmp0 = tmps[i0];
851: tmp1 = tmps[i1];
852: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
853: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
854: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
855: }
856: if (j == nz-1) {
857: tmp0 = tmps[*vi++];
858: sum1 -= *v1++ *tmp0;
859: sum2 -= *v2++ *tmp0;
860: sum3 -= *v3++ *tmp0;
861: }
862: sum2 -= *v2++ * sum1;
863: sum3 -= *v3++ * sum1;
864: sum3 -= *v3++ * sum2;
866: tmp[row++]=sum1;
867: tmp[row++]=sum2;
868: tmp[row++]=sum3;
869: break;
871: case 4:
872: sum1 = b[*r++];
873: sum2 = b[*r++];
874: sum3 = b[*r++];
875: sum4 = b[*r++];
876: v2 = aa + ai[row+1];
877: v3 = aa + ai[row+2];
878: v4 = aa + ai[row+3];
880: for (j=0; j<nz-1; j+=2) {
881: i0 = vi[0];
882: i1 = vi[1];
883: vi +=2;
884: tmp0 = tmps[i0];
885: tmp1 = tmps[i1];
886: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
887: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
888: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
889: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
890: }
891: if (j == nz-1) {
892: tmp0 = tmps[*vi++];
893: sum1 -= *v1++ *tmp0;
894: sum2 -= *v2++ *tmp0;
895: sum3 -= *v3++ *tmp0;
896: sum4 -= *v4++ *tmp0;
897: }
898: sum2 -= *v2++ * sum1;
899: sum3 -= *v3++ * sum1;
900: sum4 -= *v4++ * sum1;
901: sum3 -= *v3++ * sum2;
902: sum4 -= *v4++ * sum2;
903: sum4 -= *v4++ * sum3;
905: tmp[row++]=sum1;
906: tmp[row++]=sum2;
907: tmp[row++]=sum3;
908: tmp[row++]=sum4;
909: break;
910: case 5:
911: sum1 = b[*r++];
912: sum2 = b[*r++];
913: sum3 = b[*r++];
914: sum4 = b[*r++];
915: sum5 = b[*r++];
916: v2 = aa + ai[row+1];
917: v3 = aa + ai[row+2];
918: v4 = aa + ai[row+3];
919: v5 = aa + ai[row+4];
921: for (j=0; j<nz-1; j+=2) {
922: i0 = vi[0];
923: i1 = vi[1];
924: vi +=2;
925: tmp0 = tmps[i0];
926: tmp1 = tmps[i1];
927: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
928: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
929: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
930: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
931: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
932: }
933: if (j == nz-1) {
934: tmp0 = tmps[*vi++];
935: sum1 -= *v1++ *tmp0;
936: sum2 -= *v2++ *tmp0;
937: sum3 -= *v3++ *tmp0;
938: sum4 -= *v4++ *tmp0;
939: sum5 -= *v5++ *tmp0;
940: }
942: sum2 -= *v2++ * sum1;
943: sum3 -= *v3++ * sum1;
944: sum4 -= *v4++ * sum1;
945: sum5 -= *v5++ * sum1;
946: sum3 -= *v3++ * sum2;
947: sum4 -= *v4++ * sum2;
948: sum5 -= *v5++ * sum2;
949: sum4 -= *v4++ * sum3;
950: sum5 -= *v5++ * sum3;
951: sum5 -= *v5++ * sum4;
953: tmp[row++]=sum1;
954: tmp[row++]=sum2;
955: tmp[row++]=sum3;
956: tmp[row++]=sum4;
957: tmp[row++]=sum5;
958: break;
959: default:
960: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
961: }
962: }
963: /* backward solve the upper triangular */
964: for (i=node_max -1,row = n-1; i>=0; i--) {
965: nsz = ns[i];
966: aii = ai[row+1] -1;
967: v1 = aa + aii;
968: vi = aj + aii;
969: nz = aii- ad[row];
970: switch (nsz) { /* Each loop in 'case' is unrolled */
971: case 1:
972: sum1 = tmp[row];
974: for (j=nz; j>1; j-=2) {
975: vi -=2;
976: i0 = vi[2];
977: i1 = vi[1];
978: tmp0 = tmps[i0];
979: tmp1 = tmps[i1];
980: v1 -= 2;
981: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
982: }
983: if (j==1) {
984: tmp0 = tmps[*vi--];
985: sum1 -= *v1-- * tmp0;
986: }
987: x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
988: break;
989: case 2:
990: sum1 = tmp[row];
991: sum2 = tmp[row -1];
992: v2 = aa + ai[row]-1;
993: for (j=nz; j>1; j-=2) {
994: vi -=2;
995: i0 = vi[2];
996: i1 = vi[1];
997: tmp0 = tmps[i0];
998: tmp1 = tmps[i1];
999: v1 -= 2;
1000: v2 -= 2;
1001: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1002: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1003: }
1004: if (j==1) {
1005: tmp0 = tmps[*vi--];
1006: sum1 -= *v1-- * tmp0;
1007: sum2 -= *v2-- * tmp0;
1008: }
1010: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1011: sum2 -= *v2-- * tmp0;
1012: x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1013: break;
1014: case 3:
1015: sum1 = tmp[row];
1016: sum2 = tmp[row -1];
1017: sum3 = tmp[row -2];
1018: v2 = aa + ai[row]-1;
1019: v3 = aa + ai[row -1]-1;
1020: for (j=nz; j>1; j-=2) {
1021: vi -=2;
1022: i0 = vi[2];
1023: i1 = vi[1];
1024: tmp0 = tmps[i0];
1025: tmp1 = tmps[i1];
1026: v1 -= 2;
1027: v2 -= 2;
1028: v3 -= 2;
1029: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1030: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1031: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1032: }
1033: if (j==1) {
1034: tmp0 = tmps[*vi--];
1035: sum1 -= *v1-- * tmp0;
1036: sum2 -= *v2-- * tmp0;
1037: sum3 -= *v3-- * tmp0;
1038: }
1039: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1040: sum2 -= *v2-- * tmp0;
1041: sum3 -= *v3-- * tmp0;
1042: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1043: sum3 -= *v3-- * tmp0;
1044: x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1046: break;
1047: case 4:
1048: sum1 = tmp[row];
1049: sum2 = tmp[row -1];
1050: sum3 = tmp[row -2];
1051: sum4 = tmp[row -3];
1052: v2 = aa + ai[row]-1;
1053: v3 = aa + ai[row -1]-1;
1054: v4 = aa + ai[row -2]-1;
1056: for (j=nz; j>1; j-=2) {
1057: vi -=2;
1058: i0 = vi[2];
1059: i1 = vi[1];
1060: tmp0 = tmps[i0];
1061: tmp1 = tmps[i1];
1062: v1 -= 2;
1063: v2 -= 2;
1064: v3 -= 2;
1065: v4 -= 2;
1066: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1067: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1068: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1069: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1070: }
1071: if (j==1) {
1072: tmp0 = tmps[*vi--];
1073: sum1 -= *v1-- * tmp0;
1074: sum2 -= *v2-- * tmp0;
1075: sum3 -= *v3-- * tmp0;
1076: sum4 -= *v4-- * tmp0;
1077: }
1079: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1080: sum2 -= *v2-- * tmp0;
1081: sum3 -= *v3-- * tmp0;
1082: sum4 -= *v4-- * tmp0;
1083: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1084: sum3 -= *v3-- * tmp0;
1085: sum4 -= *v4-- * tmp0;
1086: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1087: sum4 -= *v4-- * tmp0;
1088: x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1089: break;
1090: case 5:
1091: sum1 = tmp[row];
1092: sum2 = tmp[row -1];
1093: sum3 = tmp[row -2];
1094: sum4 = tmp[row -3];
1095: sum5 = tmp[row -4];
1096: v2 = aa + ai[row]-1;
1097: v3 = aa + ai[row -1]-1;
1098: v4 = aa + ai[row -2]-1;
1099: v5 = aa + ai[row -3]-1;
1100: for (j=nz; j>1; j-=2) {
1101: vi -= 2;
1102: i0 = vi[2];
1103: i1 = vi[1];
1104: tmp0 = tmps[i0];
1105: tmp1 = tmps[i1];
1106: v1 -= 2;
1107: v2 -= 2;
1108: v3 -= 2;
1109: v4 -= 2;
1110: v5 -= 2;
1111: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1112: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1113: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1114: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1115: sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1116: }
1117: if (j==1) {
1118: tmp0 = tmps[*vi--];
1119: sum1 -= *v1-- * tmp0;
1120: sum2 -= *v2-- * tmp0;
1121: sum3 -= *v3-- * tmp0;
1122: sum4 -= *v4-- * tmp0;
1123: sum5 -= *v5-- * tmp0;
1124: }
1126: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1127: sum2 -= *v2-- * tmp0;
1128: sum3 -= *v3-- * tmp0;
1129: sum4 -= *v4-- * tmp0;
1130: sum5 -= *v5-- * tmp0;
1131: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1132: sum3 -= *v3-- * tmp0;
1133: sum4 -= *v4-- * tmp0;
1134: sum5 -= *v5-- * tmp0;
1135: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1136: sum4 -= *v4-- * tmp0;
1137: sum5 -= *v5-- * tmp0;
1138: tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1139: sum5 -= *v5-- * tmp0;
1140: x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1141: break;
1142: default:
1143: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1144: }
1145: }
1146: ISRestoreIndices(isrow,&rout);
1147: ISRestoreIndices(iscol,&cout);
1148: VecRestoreArrayRead(bb,&b);
1149: VecRestoreArray(xx,&x);
1150: PetscLogFlops(2.0*a->nz - A->cmap->n);
1151: return(0);
1152: }
1154: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1155: {
1156: Mat C =B;
1157: Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1158: IS isrow = b->row,isicol = b->icol;
1159: PetscErrorCode ierr;
1160: const PetscInt *r,*ic,*ics;
1161: const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1162: PetscInt i,j,k,nz,nzL,row,*pj;
1163: const PetscInt *ajtmp,*bjtmp;
1164: MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1165: const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1166: FactorShiftCtx sctx;
1167: const PetscInt *ddiag;
1168: PetscReal rs;
1169: MatScalar d;
1170: PetscInt inod,nodesz,node_max,col;
1171: const PetscInt *ns;
1172: PetscInt *tmp_vec1,*tmp_vec2,*nsmap;
1175: /* MatPivotSetUp(): initialize shift context sctx */
1176: PetscMemzero(&sctx,sizeof(FactorShiftCtx));
1178: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1179: ddiag = a->diag;
1180: sctx.shift_top = info->zeropivot;
1181: for (i=0; i<n; i++) {
1182: /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1183: d = (aa)[ddiag[i]];
1184: rs = -PetscAbsScalar(d) - PetscRealPart(d);
1185: v = aa+ai[i];
1186: nz = ai[i+1] - ai[i];
1187: for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1188: if (rs>sctx.shift_top) sctx.shift_top = rs;
1189: }
1190: sctx.shift_top *= 1.1;
1191: sctx.nshift_max = 5;
1192: sctx.shift_lo = 0.;
1193: sctx.shift_hi = 1.;
1194: }
1196: ISGetIndices(isrow,&r);
1197: ISGetIndices(isicol,&ic);
1199: PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);
1200: ics = ic;
1202: node_max = a->inode.node_count;
1203: ns = a->inode.size;
1204: if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1206: /* If max inode size > 4, split it into two inodes.*/
1207: /* also map the inode sizes according to the ordering */
1208: PetscMalloc1(n+1,&tmp_vec1);
1209: for (i=0,j=0; i<node_max; ++i,++j) {
1210: if (ns[i] > 4) {
1211: tmp_vec1[j] = 4;
1212: ++j;
1213: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1214: } else {
1215: tmp_vec1[j] = ns[i];
1216: }
1217: }
1218: /* Use the correct node_max */
1219: node_max = j;
1221: /* Now reorder the inode info based on mat re-ordering info */
1222: /* First create a row -> inode_size_array_index map */
1223: PetscMalloc1(n+1,&nsmap);
1224: PetscMalloc1(node_max+1,&tmp_vec2);
1225: for (i=0,row=0; i<node_max; i++) {
1226: nodesz = tmp_vec1[i];
1227: for (j=0; j<nodesz; j++,row++) {
1228: nsmap[row] = i;
1229: }
1230: }
1231: /* Using nsmap, create a reordered ns structure */
1232: for (i=0,j=0; i< node_max; i++) {
1233: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1234: tmp_vec2[i] = nodesz;
1235: j += nodesz;
1236: }
1237: PetscFree(nsmap);
1238: PetscFree(tmp_vec1);
1240: /* Now use the correct ns */
1241: ns = tmp_vec2;
1243: do {
1244: sctx.newshift = PETSC_FALSE;
1245: /* Now loop over each block-row, and do the factorization */
1246: for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1247: nodesz = ns[inod];
1249: switch (nodesz) {
1250: case 1:
1251: /*----------*/
1252: /* zero rtmp1 */
1253: /* L part */
1254: nz = bi[i+1] - bi[i];
1255: bjtmp = bj + bi[i];
1256: for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1258: /* U part */
1259: nz = bdiag[i]-bdiag[i+1];
1260: bjtmp = bj + bdiag[i+1]+1;
1261: for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1263: /* load in initial (unfactored row) */
1264: nz = ai[r[i]+1] - ai[r[i]];
1265: ajtmp = aj + ai[r[i]];
1266: v = aa + ai[r[i]];
1267: for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1269: /* ZeropivotApply() */
1270: rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1272: /* elimination */
1273: bjtmp = bj + bi[i];
1274: row = *bjtmp++;
1275: nzL = bi[i+1] - bi[i];
1276: for (k=0; k < nzL; k++) {
1277: pc = rtmp1 + row;
1278: if (*pc != 0.0) {
1279: pv = b->a + bdiag[row];
1280: mul1 = *pc * (*pv);
1281: *pc = mul1;
1282: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1283: pv = b->a + bdiag[row+1]+1;
1284: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1285: for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1286: PetscLogFlops(1+2*nz);
1287: }
1288: row = *bjtmp++;
1289: }
1291: /* finished row so stick it into b->a */
1292: rs = 0.0;
1293: /* L part */
1294: pv = b->a + bi[i];
1295: pj = b->j + bi[i];
1296: nz = bi[i+1] - bi[i];
1297: for (j=0; j<nz; j++) {
1298: pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1299: }
1301: /* U part */
1302: pv = b->a + bdiag[i+1]+1;
1303: pj = b->j + bdiag[i+1]+1;
1304: nz = bdiag[i] - bdiag[i+1]-1;
1305: for (j=0; j<nz; j++) {
1306: pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1307: }
1309: /* Check zero pivot */
1310: sctx.rs = rs;
1311: sctx.pv = rtmp1[i];
1312: MatPivotCheck(B,A,info,&sctx,i);
1313: if (sctx.newshift) break;
1315: /* Mark diagonal and invert diagonal for simplier triangular solves */
1316: pv = b->a + bdiag[i];
1317: *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1318: break;
1320: case 2:
1321: /*----------*/
1322: /* zero rtmp1 and rtmp2 */
1323: /* L part */
1324: nz = bi[i+1] - bi[i];
1325: bjtmp = bj + bi[i];
1326: for (j=0; j<nz; j++) {
1327: col = bjtmp[j];
1328: rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1329: }
1331: /* U part */
1332: nz = bdiag[i]-bdiag[i+1];
1333: bjtmp = bj + bdiag[i+1]+1;
1334: for (j=0; j<nz; j++) {
1335: col = bjtmp[j];
1336: rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1337: }
1339: /* load in initial (unfactored row) */
1340: nz = ai[r[i]+1] - ai[r[i]];
1341: ajtmp = aj + ai[r[i]];
1342: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1343: for (j=0; j<nz; j++) {
1344: col = ics[ajtmp[j]];
1345: rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1346: }
1347: /* ZeropivotApply(): shift the diagonal of the matrix */
1348: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1350: /* elimination */
1351: bjtmp = bj + bi[i];
1352: row = *bjtmp++; /* pivot row */
1353: nzL = bi[i+1] - bi[i];
1354: for (k=0; k < nzL; k++) {
1355: pc1 = rtmp1 + row;
1356: pc2 = rtmp2 + row;
1357: if (*pc1 != 0.0 || *pc2 != 0.0) {
1358: pv = b->a + bdiag[row];
1359: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1360: *pc1 = mul1; *pc2 = mul2;
1362: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1363: pv = b->a + bdiag[row+1]+1;
1364: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1365: for (j=0; j<nz; j++) {
1366: col = pj[j];
1367: rtmp1[col] -= mul1 * pv[j];
1368: rtmp2[col] -= mul2 * pv[j];
1369: }
1370: PetscLogFlops(2+4*nz);
1371: }
1372: row = *bjtmp++;
1373: }
1375: /* finished row i; check zero pivot, then stick row i into b->a */
1376: rs = 0.0;
1377: /* L part */
1378: pc1 = b->a + bi[i];
1379: pj = b->j + bi[i];
1380: nz = bi[i+1] - bi[i];
1381: for (j=0; j<nz; j++) {
1382: col = pj[j];
1383: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1384: }
1385: /* U part */
1386: pc1 = b->a + bdiag[i+1]+1;
1387: pj = b->j + bdiag[i+1]+1;
1388: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1389: for (j=0; j<nz; j++) {
1390: col = pj[j];
1391: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1392: }
1394: sctx.rs = rs;
1395: sctx.pv = rtmp1[i];
1396: MatPivotCheck(B,A,info,&sctx,i);
1397: if (sctx.newshift) break;
1398: pc1 = b->a + bdiag[i]; /* Mark diagonal */
1399: *pc1 = 1.0/sctx.pv;
1401: /* Now take care of diagonal 2x2 block. */
1402: pc2 = rtmp2 + i;
1403: if (*pc2 != 0.0) {
1404: mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1405: *pc2 = mul1; /* insert L entry */
1406: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1407: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1408: for (j=0; j<nz; j++) {
1409: col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1410: }
1411: PetscLogFlops(1+2*nz);
1412: }
1414: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1415: rs = 0.0;
1416: /* L part */
1417: pc2 = b->a + bi[i+1];
1418: pj = b->j + bi[i+1];
1419: nz = bi[i+2] - bi[i+1];
1420: for (j=0; j<nz; j++) {
1421: col = pj[j];
1422: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1423: }
1424: /* U part */
1425: pc2 = b->a + bdiag[i+2]+1;
1426: pj = b->j + bdiag[i+2]+1;
1427: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1428: for (j=0; j<nz; j++) {
1429: col = pj[j];
1430: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1431: }
1433: sctx.rs = rs;
1434: sctx.pv = rtmp2[i+1];
1435: MatPivotCheck(B,A,info,&sctx,i+1);
1436: if (sctx.newshift) break;
1437: pc2 = b->a + bdiag[i+1];
1438: *pc2 = 1.0/sctx.pv;
1439: break;
1441: case 3:
1442: /*----------*/
1443: /* zero rtmp */
1444: /* L part */
1445: nz = bi[i+1] - bi[i];
1446: bjtmp = bj + bi[i];
1447: for (j=0; j<nz; j++) {
1448: col = bjtmp[j];
1449: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1450: }
1452: /* U part */
1453: nz = bdiag[i]-bdiag[i+1];
1454: bjtmp = bj + bdiag[i+1]+1;
1455: for (j=0; j<nz; j++) {
1456: col = bjtmp[j];
1457: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1458: }
1460: /* load in initial (unfactored row) */
1461: nz = ai[r[i]+1] - ai[r[i]];
1462: ajtmp = aj + ai[r[i]];
1463: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1464: for (j=0; j<nz; j++) {
1465: col = ics[ajtmp[j]];
1466: rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1467: }
1468: /* ZeropivotApply(): shift the diagonal of the matrix */
1469: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1471: /* elimination */
1472: bjtmp = bj + bi[i];
1473: row = *bjtmp++; /* pivot row */
1474: nzL = bi[i+1] - bi[i];
1475: for (k=0; k < nzL; k++) {
1476: pc1 = rtmp1 + row;
1477: pc2 = rtmp2 + row;
1478: pc3 = rtmp3 + row;
1479: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1480: pv = b->a + bdiag[row];
1481: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1482: *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1484: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1485: pv = b->a + bdiag[row+1]+1;
1486: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1487: for (j=0; j<nz; j++) {
1488: col = pj[j];
1489: rtmp1[col] -= mul1 * pv[j];
1490: rtmp2[col] -= mul2 * pv[j];
1491: rtmp3[col] -= mul3 * pv[j];
1492: }
1493: PetscLogFlops(3+6*nz);
1494: }
1495: row = *bjtmp++;
1496: }
1498: /* finished row i; check zero pivot, then stick row i into b->a */
1499: rs = 0.0;
1500: /* L part */
1501: pc1 = b->a + bi[i];
1502: pj = b->j + bi[i];
1503: nz = bi[i+1] - bi[i];
1504: for (j=0; j<nz; j++) {
1505: col = pj[j];
1506: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1507: }
1508: /* U part */
1509: pc1 = b->a + bdiag[i+1]+1;
1510: pj = b->j + bdiag[i+1]+1;
1511: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1512: for (j=0; j<nz; j++) {
1513: col = pj[j];
1514: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1515: }
1517: sctx.rs = rs;
1518: sctx.pv = rtmp1[i];
1519: MatPivotCheck(B,A,info,&sctx,i);
1520: if (sctx.newshift) break;
1521: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1522: *pc1 = 1.0/sctx.pv;
1524: /* Now take care of 1st column of diagonal 3x3 block. */
1525: pc2 = rtmp2 + i;
1526: pc3 = rtmp3 + i;
1527: if (*pc2 != 0.0 || *pc3 != 0.0) {
1528: mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1529: mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1530: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1531: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1532: for (j=0; j<nz; j++) {
1533: col = pj[j];
1534: rtmp2[col] -= mul2 * rtmp1[col];
1535: rtmp3[col] -= mul3 * rtmp1[col];
1536: }
1537: PetscLogFlops(2+4*nz);
1538: }
1540: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1541: rs = 0.0;
1542: /* L part */
1543: pc2 = b->a + bi[i+1];
1544: pj = b->j + bi[i+1];
1545: nz = bi[i+2] - bi[i+1];
1546: for (j=0; j<nz; j++) {
1547: col = pj[j];
1548: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1549: }
1550: /* U part */
1551: pc2 = b->a + bdiag[i+2]+1;
1552: pj = b->j + bdiag[i+2]+1;
1553: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1554: for (j=0; j<nz; j++) {
1555: col = pj[j];
1556: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1557: }
1559: sctx.rs = rs;
1560: sctx.pv = rtmp2[i+1];
1561: MatPivotCheck(B,A,info,&sctx,i+1);
1562: if (sctx.newshift) break;
1563: pc2 = b->a + bdiag[i+1];
1564: *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1566: /* Now take care of 2nd column of diagonal 3x3 block. */
1567: pc3 = rtmp3 + i+1;
1568: if (*pc3 != 0.0) {
1569: mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1570: pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */
1571: nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1572: for (j=0; j<nz; j++) {
1573: col = pj[j];
1574: rtmp3[col] -= mul3 * rtmp2[col];
1575: }
1576: PetscLogFlops(1+2*nz);
1577: }
1579: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1580: rs = 0.0;
1581: /* L part */
1582: pc3 = b->a + bi[i+2];
1583: pj = b->j + bi[i+2];
1584: nz = bi[i+3] - bi[i+2];
1585: for (j=0; j<nz; j++) {
1586: col = pj[j];
1587: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1588: }
1589: /* U part */
1590: pc3 = b->a + bdiag[i+3]+1;
1591: pj = b->j + bdiag[i+3]+1;
1592: nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1593: for (j=0; j<nz; j++) {
1594: col = pj[j];
1595: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1596: }
1598: sctx.rs = rs;
1599: sctx.pv = rtmp3[i+2];
1600: MatPivotCheck(B,A,info,&sctx,i+2);
1601: if (sctx.newshift) break;
1602: pc3 = b->a + bdiag[i+2];
1603: *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1604: break;
1605: case 4:
1606: /*----------*/
1607: /* zero rtmp */
1608: /* L part */
1609: nz = bi[i+1] - bi[i];
1610: bjtmp = bj + bi[i];
1611: for (j=0; j<nz; j++) {
1612: col = bjtmp[j];
1613: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1614: }
1616: /* U part */
1617: nz = bdiag[i]-bdiag[i+1];
1618: bjtmp = bj + bdiag[i+1]+1;
1619: for (j=0; j<nz; j++) {
1620: col = bjtmp[j];
1621: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1622: }
1624: /* load in initial (unfactored row) */
1625: nz = ai[r[i]+1] - ai[r[i]];
1626: ajtmp = aj + ai[r[i]];
1627: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1628: for (j=0; j<nz; j++) {
1629: col = ics[ajtmp[j]];
1630: rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1631: }
1632: /* ZeropivotApply(): shift the diagonal of the matrix */
1633: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1635: /* elimination */
1636: bjtmp = bj + bi[i];
1637: row = *bjtmp++; /* pivot row */
1638: nzL = bi[i+1] - bi[i];
1639: for (k=0; k < nzL; k++) {
1640: pc1 = rtmp1 + row;
1641: pc2 = rtmp2 + row;
1642: pc3 = rtmp3 + row;
1643: pc4 = rtmp4 + row;
1644: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1645: pv = b->a + bdiag[row];
1646: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1647: *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1649: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1650: pv = b->a + bdiag[row+1]+1;
1651: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1652: for (j=0; j<nz; j++) {
1653: col = pj[j];
1654: rtmp1[col] -= mul1 * pv[j];
1655: rtmp2[col] -= mul2 * pv[j];
1656: rtmp3[col] -= mul3 * pv[j];
1657: rtmp4[col] -= mul4 * pv[j];
1658: }
1659: PetscLogFlops(4+8*nz);
1660: }
1661: row = *bjtmp++;
1662: }
1664: /* finished row i; check zero pivot, then stick row i into b->a */
1665: rs = 0.0;
1666: /* L part */
1667: pc1 = b->a + bi[i];
1668: pj = b->j + bi[i];
1669: nz = bi[i+1] - bi[i];
1670: for (j=0; j<nz; j++) {
1671: col = pj[j];
1672: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1673: }
1674: /* U part */
1675: pc1 = b->a + bdiag[i+1]+1;
1676: pj = b->j + bdiag[i+1]+1;
1677: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1678: for (j=0; j<nz; j++) {
1679: col = pj[j];
1680: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1681: }
1683: sctx.rs = rs;
1684: sctx.pv = rtmp1[i];
1685: MatPivotCheck(B,A,info,&sctx,i);
1686: if (sctx.newshift) break;
1687: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1688: *pc1 = 1.0/sctx.pv;
1690: /* Now take care of 1st column of diagonal 4x4 block. */
1691: pc2 = rtmp2 + i;
1692: pc3 = rtmp3 + i;
1693: pc4 = rtmp4 + i;
1694: if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1695: mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1696: mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1697: mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1698: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1699: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1700: for (j=0; j<nz; j++) {
1701: col = pj[j];
1702: rtmp2[col] -= mul2 * rtmp1[col];
1703: rtmp3[col] -= mul3 * rtmp1[col];
1704: rtmp4[col] -= mul4 * rtmp1[col];
1705: }
1706: PetscLogFlops(3+6*nz);
1707: }
1709: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1710: rs = 0.0;
1711: /* L part */
1712: pc2 = b->a + bi[i+1];
1713: pj = b->j + bi[i+1];
1714: nz = bi[i+2] - bi[i+1];
1715: for (j=0; j<nz; j++) {
1716: col = pj[j];
1717: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1718: }
1719: /* U part */
1720: pc2 = b->a + bdiag[i+2]+1;
1721: pj = b->j + bdiag[i+2]+1;
1722: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1723: for (j=0; j<nz; j++) {
1724: col = pj[j];
1725: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1726: }
1728: sctx.rs = rs;
1729: sctx.pv = rtmp2[i+1];
1730: MatPivotCheck(B,A,info,&sctx,i+1);
1731: if (sctx.newshift) break;
1732: pc2 = b->a + bdiag[i+1];
1733: *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1735: /* Now take care of 2nd column of diagonal 4x4 block. */
1736: pc3 = rtmp3 + i+1;
1737: pc4 = rtmp4 + i+1;
1738: if (*pc3 != 0.0 || *pc4 != 0.0) {
1739: mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1740: mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1741: pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */
1742: nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1743: for (j=0; j<nz; j++) {
1744: col = pj[j];
1745: rtmp3[col] -= mul3 * rtmp2[col];
1746: rtmp4[col] -= mul4 * rtmp2[col];
1747: }
1748: PetscLogFlops(4*nz);
1749: }
1751: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1752: rs = 0.0;
1753: /* L part */
1754: pc3 = b->a + bi[i+2];
1755: pj = b->j + bi[i+2];
1756: nz = bi[i+3] - bi[i+2];
1757: for (j=0; j<nz; j++) {
1758: col = pj[j];
1759: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1760: }
1761: /* U part */
1762: pc3 = b->a + bdiag[i+3]+1;
1763: pj = b->j + bdiag[i+3]+1;
1764: nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1765: for (j=0; j<nz; j++) {
1766: col = pj[j];
1767: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1768: }
1770: sctx.rs = rs;
1771: sctx.pv = rtmp3[i+2];
1772: MatPivotCheck(B,A,info,&sctx,i+2);
1773: if (sctx.newshift) break;
1774: pc3 = b->a + bdiag[i+2];
1775: *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1777: /* Now take care of 3rd column of diagonal 4x4 block. */
1778: pc4 = rtmp4 + i+2;
1779: if (*pc4 != 0.0) {
1780: mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1781: pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */
1782: nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1783: for (j=0; j<nz; j++) {
1784: col = pj[j];
1785: rtmp4[col] -= mul4 * rtmp3[col];
1786: }
1787: PetscLogFlops(1+2*nz);
1788: }
1790: /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1791: rs = 0.0;
1792: /* L part */
1793: pc4 = b->a + bi[i+3];
1794: pj = b->j + bi[i+3];
1795: nz = bi[i+4] - bi[i+3];
1796: for (j=0; j<nz; j++) {
1797: col = pj[j];
1798: pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1799: }
1800: /* U part */
1801: pc4 = b->a + bdiag[i+4]+1;
1802: pj = b->j + bdiag[i+4]+1;
1803: nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1804: for (j=0; j<nz; j++) {
1805: col = pj[j];
1806: pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1807: }
1809: sctx.rs = rs;
1810: sctx.pv = rtmp4[i+3];
1811: MatPivotCheck(B,A,info,&sctx,i+3);
1812: if (sctx.newshift) break;
1813: pc4 = b->a + bdiag[i+3];
1814: *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1815: break;
1817: default:
1818: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1819: }
1820: if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1821: i += nodesz; /* Update the row */
1822: }
1824: /* MatPivotRefine() */
1825: if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1826: /*
1827: * if no shift in this attempt & shifting & started shifting & can refine,
1828: * then try lower shift
1829: */
1830: sctx.shift_hi = sctx.shift_fraction;
1831: sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1832: sctx.shift_amount = sctx.shift_fraction * sctx.shift_top;
1833: sctx.newshift = PETSC_TRUE;
1834: sctx.nshift++;
1835: }
1836: } while (sctx.newshift);
1838: PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);
1839: PetscFree(tmp_vec2);
1840: ISRestoreIndices(isicol,&ic);
1841: ISRestoreIndices(isrow,&r);
1843: if (b->inode.size) {
1844: C->ops->solve = MatSolve_SeqAIJ_Inode;
1845: } else {
1846: C->ops->solve = MatSolve_SeqAIJ;
1847: }
1848: C->ops->solveadd = MatSolveAdd_SeqAIJ;
1849: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ;
1850: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1851: C->ops->matsolve = MatMatSolve_SeqAIJ;
1852: C->assembled = PETSC_TRUE;
1853: C->preallocated = PETSC_TRUE;
1855: PetscLogFlops(C->cmap->n);
1857: /* MatShiftView(A,info,&sctx) */
1858: if (sctx.nshift) {
1859: if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1860: PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
1861: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1862: PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
1863: } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1864: PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);
1865: }
1866: }
1867: return(0);
1868: }
1870: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1871: {
1872: Mat C = B;
1873: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1874: IS iscol = b->col,isrow = b->row,isicol = b->icol;
1875: PetscErrorCode ierr;
1876: const PetscInt *r,*ic,*c,*ics;
1877: PetscInt n = A->rmap->n,*bi = b->i;
1878: PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1879: PetscInt i,j,idx,*bd = b->diag,node_max,nodesz;
1880: PetscInt *ai = a->i,*aj = a->j;
1881: PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1882: PetscScalar mul1,mul2,mul3,tmp;
1883: MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1884: const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1885: PetscReal rs=0.0;
1886: FactorShiftCtx sctx;
1889: sctx.shift_top = 0;
1890: sctx.nshift_max = 0;
1891: sctx.shift_lo = 0;
1892: sctx.shift_hi = 0;
1893: sctx.shift_fraction = 0;
1895: /* if both shift schemes are chosen by user, only use info->shiftpd */
1896: if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1897: sctx.shift_top = 0;
1898: for (i=0; i<n; i++) {
1899: /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1900: rs = 0.0;
1901: ajtmp = aj + ai[i];
1902: rtmp1 = aa + ai[i];
1903: nz = ai[i+1] - ai[i];
1904: for (j=0; j<nz; j++) {
1905: if (*ajtmp != i) {
1906: rs += PetscAbsScalar(*rtmp1++);
1907: } else {
1908: rs -= PetscRealPart(*rtmp1++);
1909: }
1910: ajtmp++;
1911: }
1912: if (rs>sctx.shift_top) sctx.shift_top = rs;
1913: }
1914: if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1915: sctx.shift_top *= 1.1;
1916: sctx.nshift_max = 5;
1917: sctx.shift_lo = 0.;
1918: sctx.shift_hi = 1.;
1919: }
1920: sctx.shift_amount = 0;
1921: sctx.nshift = 0;
1923: ISGetIndices(isrow,&r);
1924: ISGetIndices(iscol,&c);
1925: ISGetIndices(isicol,&ic);
1926: PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);
1927: ics = ic;
1929: node_max = a->inode.node_count;
1930: ns = a->inode.size;
1931: if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1933: /* If max inode size > 3, split it into two inodes.*/
1934: /* also map the inode sizes according to the ordering */
1935: PetscMalloc1(n+1,&tmp_vec1);
1936: for (i=0,j=0; i<node_max; ++i,++j) {
1937: if (ns[i]>3) {
1938: tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */
1939: ++j;
1940: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1941: } else {
1942: tmp_vec1[j] = ns[i];
1943: }
1944: }
1945: /* Use the correct node_max */
1946: node_max = j;
1948: /* Now reorder the inode info based on mat re-ordering info */
1949: /* First create a row -> inode_size_array_index map */
1950: PetscMalloc1(n+1,&nsmap);
1951: PetscMalloc1(node_max+1,&tmp_vec2);
1952: for (i=0,row=0; i<node_max; i++) {
1953: nodesz = tmp_vec1[i];
1954: for (j=0; j<nodesz; j++,row++) {
1955: nsmap[row] = i;
1956: }
1957: }
1958: /* Using nsmap, create a reordered ns structure */
1959: for (i=0,j=0; i< node_max; i++) {
1960: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1961: tmp_vec2[i] = nodesz;
1962: j += nodesz;
1963: }
1964: PetscFree(nsmap);
1965: PetscFree(tmp_vec1);
1966: /* Now use the correct ns */
1967: ns = tmp_vec2;
1969: do {
1970: sctx.newshift = PETSC_FALSE;
1971: /* Now loop over each block-row, and do the factorization */
1972: for (i=0,row=0; i<node_max; i++) {
1973: nodesz = ns[i];
1974: nz = bi[row+1] - bi[row];
1975: bjtmp = bj + bi[row];
1977: switch (nodesz) {
1978: case 1:
1979: for (j=0; j<nz; j++) {
1980: idx = bjtmp[j];
1981: rtmp11[idx] = 0.0;
1982: }
1984: /* load in initial (unfactored row) */
1985: idx = r[row];
1986: nz_tmp = ai[idx+1] - ai[idx];
1987: ajtmp = aj + ai[idx];
1988: v1 = aa + ai[idx];
1990: for (j=0; j<nz_tmp; j++) {
1991: idx = ics[ajtmp[j]];
1992: rtmp11[idx] = v1[j];
1993: }
1994: rtmp11[ics[r[row]]] += sctx.shift_amount;
1996: prow = *bjtmp++;
1997: while (prow < row) {
1998: pc1 = rtmp11 + prow;
1999: if (*pc1 != 0.0) {
2000: pv = ba + bd[prow];
2001: pj = nbj + bd[prow];
2002: mul1 = *pc1 * *pv++;
2003: *pc1 = mul1;
2004: nz_tmp = bi[prow+1] - bd[prow] - 1;
2005: PetscLogFlops(1+2*nz_tmp);
2006: for (j=0; j<nz_tmp; j++) {
2007: tmp = pv[j];
2008: idx = pj[j];
2009: rtmp11[idx] -= mul1 * tmp;
2010: }
2011: }
2012: prow = *bjtmp++;
2013: }
2014: pj = bj + bi[row];
2015: pc1 = ba + bi[row];
2017: sctx.pv = rtmp11[row];
2018: rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2019: rs = 0.0;
2020: for (j=0; j<nz; j++) {
2021: idx = pj[j];
2022: pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2023: if (idx != row) rs += PetscAbsScalar(pc1[j]);
2024: }
2025: sctx.rs = rs;
2026: MatPivotCheck(B,A,info,&sctx,row);
2027: if (sctx.newshift) goto endofwhile;
2028: break;
2030: case 2:
2031: for (j=0; j<nz; j++) {
2032: idx = bjtmp[j];
2033: rtmp11[idx] = 0.0;
2034: rtmp22[idx] = 0.0;
2035: }
2037: /* load in initial (unfactored row) */
2038: idx = r[row];
2039: nz_tmp = ai[idx+1] - ai[idx];
2040: ajtmp = aj + ai[idx];
2041: v1 = aa + ai[idx];
2042: v2 = aa + ai[idx+1];
2043: for (j=0; j<nz_tmp; j++) {
2044: idx = ics[ajtmp[j]];
2045: rtmp11[idx] = v1[j];
2046: rtmp22[idx] = v2[j];
2047: }
2048: rtmp11[ics[r[row]]] += sctx.shift_amount;
2049: rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2051: prow = *bjtmp++;
2052: while (prow < row) {
2053: pc1 = rtmp11 + prow;
2054: pc2 = rtmp22 + prow;
2055: if (*pc1 != 0.0 || *pc2 != 0.0) {
2056: pv = ba + bd[prow];
2057: pj = nbj + bd[prow];
2058: mul1 = *pc1 * *pv;
2059: mul2 = *pc2 * *pv;
2060: ++pv;
2061: *pc1 = mul1;
2062: *pc2 = mul2;
2064: nz_tmp = bi[prow+1] - bd[prow] - 1;
2065: for (j=0; j<nz_tmp; j++) {
2066: tmp = pv[j];
2067: idx = pj[j];
2068: rtmp11[idx] -= mul1 * tmp;
2069: rtmp22[idx] -= mul2 * tmp;
2070: }
2071: PetscLogFlops(2+4*nz_tmp);
2072: }
2073: prow = *bjtmp++;
2074: }
2076: /* Now take care of diagonal 2x2 block. Note: prow = row here */
2077: pc1 = rtmp11 + prow;
2078: pc2 = rtmp22 + prow;
2080: sctx.pv = *pc1;
2081: pj = bj + bi[prow];
2082: rs = 0.0;
2083: for (j=0; j<nz; j++) {
2084: idx = pj[j];
2085: if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2086: }
2087: sctx.rs = rs;
2088: MatPivotCheck(B,A,info,&sctx,row);
2089: if (sctx.newshift) goto endofwhile;
2091: if (*pc2 != 0.0) {
2092: pj = nbj + bd[prow];
2093: mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2094: *pc2 = mul2;
2095: nz_tmp = bi[prow+1] - bd[prow] - 1;
2096: for (j=0; j<nz_tmp; j++) {
2097: idx = pj[j];
2098: tmp = rtmp11[idx];
2099: rtmp22[idx] -= mul2 * tmp;
2100: }
2101: PetscLogFlops(1+2*nz_tmp);
2102: }
2104: pj = bj + bi[row];
2105: pc1 = ba + bi[row];
2106: pc2 = ba + bi[row+1];
2108: sctx.pv = rtmp22[row+1];
2109: rs = 0.0;
2110: rtmp11[row] = 1.0/rtmp11[row];
2111: rtmp22[row+1] = 1.0/rtmp22[row+1];
2112: /* copy row entries from dense representation to sparse */
2113: for (j=0; j<nz; j++) {
2114: idx = pj[j];
2115: pc1[j] = rtmp11[idx];
2116: pc2[j] = rtmp22[idx];
2117: if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2118: }
2119: sctx.rs = rs;
2120: MatPivotCheck(B,A,info,&sctx,row+1);
2121: if (sctx.newshift) goto endofwhile;
2122: break;
2124: case 3:
2125: for (j=0; j<nz; j++) {
2126: idx = bjtmp[j];
2127: rtmp11[idx] = 0.0;
2128: rtmp22[idx] = 0.0;
2129: rtmp33[idx] = 0.0;
2130: }
2131: /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2132: idx = r[row];
2133: nz_tmp = ai[idx+1] - ai[idx];
2134: ajtmp = aj + ai[idx];
2135: v1 = aa + ai[idx];
2136: v2 = aa + ai[idx+1];
2137: v3 = aa + ai[idx+2];
2138: for (j=0; j<nz_tmp; j++) {
2139: idx = ics[ajtmp[j]];
2140: rtmp11[idx] = v1[j];
2141: rtmp22[idx] = v2[j];
2142: rtmp33[idx] = v3[j];
2143: }
2144: rtmp11[ics[r[row]]] += sctx.shift_amount;
2145: rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2146: rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2148: /* loop over all pivot row blocks above this row block */
2149: prow = *bjtmp++;
2150: while (prow < row) {
2151: pc1 = rtmp11 + prow;
2152: pc2 = rtmp22 + prow;
2153: pc3 = rtmp33 + prow;
2154: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2155: pv = ba + bd[prow];
2156: pj = nbj + bd[prow];
2157: mul1 = *pc1 * *pv;
2158: mul2 = *pc2 * *pv;
2159: mul3 = *pc3 * *pv;
2160: ++pv;
2161: *pc1 = mul1;
2162: *pc2 = mul2;
2163: *pc3 = mul3;
2165: nz_tmp = bi[prow+1] - bd[prow] - 1;
2166: /* update this row based on pivot row */
2167: for (j=0; j<nz_tmp; j++) {
2168: tmp = pv[j];
2169: idx = pj[j];
2170: rtmp11[idx] -= mul1 * tmp;
2171: rtmp22[idx] -= mul2 * tmp;
2172: rtmp33[idx] -= mul3 * tmp;
2173: }
2174: PetscLogFlops(3+6*nz_tmp);
2175: }
2176: prow = *bjtmp++;
2177: }
2179: /* Now take care of diagonal 3x3 block in this set of rows */
2180: /* note: prow = row here */
2181: pc1 = rtmp11 + prow;
2182: pc2 = rtmp22 + prow;
2183: pc3 = rtmp33 + prow;
2185: sctx.pv = *pc1;
2186: pj = bj + bi[prow];
2187: rs = 0.0;
2188: for (j=0; j<nz; j++) {
2189: idx = pj[j];
2190: if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2191: }
2192: sctx.rs = rs;
2193: MatPivotCheck(B,A,info,&sctx,row);
2194: if (sctx.newshift) goto endofwhile;
2196: if (*pc2 != 0.0 || *pc3 != 0.0) {
2197: mul2 = (*pc2)/(*pc1);
2198: mul3 = (*pc3)/(*pc1);
2199: *pc2 = mul2;
2200: *pc3 = mul3;
2201: nz_tmp = bi[prow+1] - bd[prow] - 1;
2202: pj = nbj + bd[prow];
2203: for (j=0; j<nz_tmp; j++) {
2204: idx = pj[j];
2205: tmp = rtmp11[idx];
2206: rtmp22[idx] -= mul2 * tmp;
2207: rtmp33[idx] -= mul3 * tmp;
2208: }
2209: PetscLogFlops(2+4*nz_tmp);
2210: }
2211: ++prow;
2213: pc2 = rtmp22 + prow;
2214: pc3 = rtmp33 + prow;
2215: sctx.pv = *pc2;
2216: pj = bj + bi[prow];
2217: rs = 0.0;
2218: for (j=0; j<nz; j++) {
2219: idx = pj[j];
2220: if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2221: }
2222: sctx.rs = rs;
2223: MatPivotCheck(B,A,info,&sctx,row+1);
2224: if (sctx.newshift) goto endofwhile;
2226: if (*pc3 != 0.0) {
2227: mul3 = (*pc3)/(*pc2);
2228: *pc3 = mul3;
2229: pj = nbj + bd[prow];
2230: nz_tmp = bi[prow+1] - bd[prow] - 1;
2231: for (j=0; j<nz_tmp; j++) {
2232: idx = pj[j];
2233: tmp = rtmp22[idx];
2234: rtmp33[idx] -= mul3 * tmp;
2235: }
2236: PetscLogFlops(1+2*nz_tmp);
2237: }
2239: pj = bj + bi[row];
2240: pc1 = ba + bi[row];
2241: pc2 = ba + bi[row+1];
2242: pc3 = ba + bi[row+2];
2244: sctx.pv = rtmp33[row+2];
2245: rs = 0.0;
2246: rtmp11[row] = 1.0/rtmp11[row];
2247: rtmp22[row+1] = 1.0/rtmp22[row+1];
2248: rtmp33[row+2] = 1.0/rtmp33[row+2];
2249: /* copy row entries from dense representation to sparse */
2250: for (j=0; j<nz; j++) {
2251: idx = pj[j];
2252: pc1[j] = rtmp11[idx];
2253: pc2[j] = rtmp22[idx];
2254: pc3[j] = rtmp33[idx];
2255: if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2256: }
2258: sctx.rs = rs;
2259: MatPivotCheck(B,A,info,&sctx,row+2);
2260: if (sctx.newshift) goto endofwhile;
2261: break;
2263: default:
2264: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2265: }
2266: row += nodesz; /* Update the row */
2267: }
2268: endofwhile:;
2269: } while (sctx.newshift);
2270: PetscFree3(rtmp11,rtmp22,rtmp33);
2271: PetscFree(tmp_vec2);
2272: ISRestoreIndices(isicol,&ic);
2273: ISRestoreIndices(isrow,&r);
2274: ISRestoreIndices(iscol,&c);
2276: (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2277: /* do not set solve add, since MatSolve_Inode + Add is faster */
2278: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace;
2279: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2280: C->assembled = PETSC_TRUE;
2281: C->preallocated = PETSC_TRUE;
2282: if (sctx.nshift) {
2283: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2284: PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
2285: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2286: PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
2287: }
2288: }
2289: PetscLogFlops(C->cmap->n);
2290: MatSeqAIJCheckInode(C);
2291: return(0);
2292: }
2295: /* ----------------------------------------------------------- */
2296: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2297: {
2298: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2299: IS iscol = a->col,isrow = a->row;
2300: PetscErrorCode ierr;
2301: const PetscInt *r,*c,*rout,*cout;
2302: PetscInt i,j,n = A->rmap->n;
2303: PetscInt node_max,row,nsz,aii,i0,i1,nz;
2304: const PetscInt *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2305: PetscScalar *x,*tmp,*tmps,tmp0,tmp1;
2306: PetscScalar sum1,sum2,sum3,sum4,sum5;
2307: const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2308: const PetscScalar *b;
2311: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2312: node_max = a->inode.node_count;
2313: ns = a->inode.size; /* Node Size array */
2315: VecGetArrayRead(bb,&b);
2316: VecGetArray(xx,&x);
2317: tmp = a->solve_work;
2319: ISGetIndices(isrow,&rout); r = rout;
2320: ISGetIndices(iscol,&cout); c = cout;
2322: /* forward solve the lower triangular */
2323: tmps = tmp;
2324: aa = a_a;
2325: aj = a_j;
2326: ad = a->diag;
2328: for (i = 0,row = 0; i< node_max; ++i) {
2329: nsz = ns[i];
2330: aii = ai[row];
2331: v1 = aa + aii;
2332: vi = aj + aii;
2333: nz = ai[row+1]- ai[row];
2335: if (i < node_max-1) {
2336: /* Prefetch the indices for the next block */
2337: PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2338: /* Prefetch the data for the next block */
2339: PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2340: }
2342: switch (nsz) { /* Each loop in 'case' is unrolled */
2343: case 1:
2344: sum1 = b[r[row]];
2345: for (j=0; j<nz-1; j+=2) {
2346: i0 = vi[j];
2347: i1 = vi[j+1];
2348: tmp0 = tmps[i0];
2349: tmp1 = tmps[i1];
2350: sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2351: }
2352: if (j == nz-1) {
2353: tmp0 = tmps[vi[j]];
2354: sum1 -= v1[j]*tmp0;
2355: }
2356: tmp[row++]=sum1;
2357: break;
2358: case 2:
2359: sum1 = b[r[row]];
2360: sum2 = b[r[row+1]];
2361: v2 = aa + ai[row+1];
2363: for (j=0; j<nz-1; j+=2) {
2364: i0 = vi[j];
2365: i1 = vi[j+1];
2366: tmp0 = tmps[i0];
2367: tmp1 = tmps[i1];
2368: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2369: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2370: }
2371: if (j == nz-1) {
2372: tmp0 = tmps[vi[j]];
2373: sum1 -= v1[j] *tmp0;
2374: sum2 -= v2[j] *tmp0;
2375: }
2376: sum2 -= v2[nz] * sum1;
2377: tmp[row++]=sum1;
2378: tmp[row++]=sum2;
2379: break;
2380: case 3:
2381: sum1 = b[r[row]];
2382: sum2 = b[r[row+1]];
2383: sum3 = b[r[row+2]];
2384: v2 = aa + ai[row+1];
2385: v3 = aa + ai[row+2];
2387: for (j=0; j<nz-1; j+=2) {
2388: i0 = vi[j];
2389: i1 = vi[j+1];
2390: tmp0 = tmps[i0];
2391: tmp1 = tmps[i1];
2392: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2393: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2394: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2395: }
2396: if (j == nz-1) {
2397: tmp0 = tmps[vi[j]];
2398: sum1 -= v1[j] *tmp0;
2399: sum2 -= v2[j] *tmp0;
2400: sum3 -= v3[j] *tmp0;
2401: }
2402: sum2 -= v2[nz] * sum1;
2403: sum3 -= v3[nz] * sum1;
2404: sum3 -= v3[nz+1] * sum2;
2405: tmp[row++]=sum1;
2406: tmp[row++]=sum2;
2407: tmp[row++]=sum3;
2408: break;
2410: case 4:
2411: sum1 = b[r[row]];
2412: sum2 = b[r[row+1]];
2413: sum3 = b[r[row+2]];
2414: sum4 = b[r[row+3]];
2415: v2 = aa + ai[row+1];
2416: v3 = aa + ai[row+2];
2417: v4 = aa + ai[row+3];
2419: for (j=0; j<nz-1; j+=2) {
2420: i0 = vi[j];
2421: i1 = vi[j+1];
2422: tmp0 = tmps[i0];
2423: tmp1 = tmps[i1];
2424: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2425: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2426: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2427: sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2428: }
2429: if (j == nz-1) {
2430: tmp0 = tmps[vi[j]];
2431: sum1 -= v1[j] *tmp0;
2432: sum2 -= v2[j] *tmp0;
2433: sum3 -= v3[j] *tmp0;
2434: sum4 -= v4[j] *tmp0;
2435: }
2436: sum2 -= v2[nz] * sum1;
2437: sum3 -= v3[nz] * sum1;
2438: sum4 -= v4[nz] * sum1;
2439: sum3 -= v3[nz+1] * sum2;
2440: sum4 -= v4[nz+1] * sum2;
2441: sum4 -= v4[nz+2] * sum3;
2443: tmp[row++]=sum1;
2444: tmp[row++]=sum2;
2445: tmp[row++]=sum3;
2446: tmp[row++]=sum4;
2447: break;
2448: case 5:
2449: sum1 = b[r[row]];
2450: sum2 = b[r[row+1]];
2451: sum3 = b[r[row+2]];
2452: sum4 = b[r[row+3]];
2453: sum5 = b[r[row+4]];
2454: v2 = aa + ai[row+1];
2455: v3 = aa + ai[row+2];
2456: v4 = aa + ai[row+3];
2457: v5 = aa + ai[row+4];
2459: for (j=0; j<nz-1; j+=2) {
2460: i0 = vi[j];
2461: i1 = vi[j+1];
2462: tmp0 = tmps[i0];
2463: tmp1 = tmps[i1];
2464: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2465: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2466: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2467: sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2468: sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2469: }
2470: if (j == nz-1) {
2471: tmp0 = tmps[vi[j]];
2472: sum1 -= v1[j] *tmp0;
2473: sum2 -= v2[j] *tmp0;
2474: sum3 -= v3[j] *tmp0;
2475: sum4 -= v4[j] *tmp0;
2476: sum5 -= v5[j] *tmp0;
2477: }
2479: sum2 -= v2[nz] * sum1;
2480: sum3 -= v3[nz] * sum1;
2481: sum4 -= v4[nz] * sum1;
2482: sum5 -= v5[nz] * sum1;
2483: sum3 -= v3[nz+1] * sum2;
2484: sum4 -= v4[nz+1] * sum2;
2485: sum5 -= v5[nz+1] * sum2;
2486: sum4 -= v4[nz+2] * sum3;
2487: sum5 -= v5[nz+2] * sum3;
2488: sum5 -= v5[nz+3] * sum4;
2490: tmp[row++]=sum1;
2491: tmp[row++]=sum2;
2492: tmp[row++]=sum3;
2493: tmp[row++]=sum4;
2494: tmp[row++]=sum5;
2495: break;
2496: default:
2497: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2498: }
2499: }
2500: /* backward solve the upper triangular */
2501: for (i=node_max -1,row = n-1; i>=0; i--) {
2502: nsz = ns[i];
2503: aii = ad[row+1] + 1;
2504: v1 = aa + aii;
2505: vi = aj + aii;
2506: nz = ad[row]- ad[row+1] - 1;
2508: if (i > 0) {
2509: /* Prefetch the indices for the next block */
2510: PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2511: /* Prefetch the data for the next block */
2512: PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2513: }
2515: switch (nsz) { /* Each loop in 'case' is unrolled */
2516: case 1:
2517: sum1 = tmp[row];
2519: for (j=0; j<nz-1; j+=2) {
2520: i0 = vi[j];
2521: i1 = vi[j+1];
2522: tmp0 = tmps[i0];
2523: tmp1 = tmps[i1];
2524: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2525: }
2526: if (j == nz-1) {
2527: tmp0 = tmps[vi[j]];
2528: sum1 -= v1[j]*tmp0;
2529: }
2530: x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2531: break;
2532: case 2:
2533: sum1 = tmp[row];
2534: sum2 = tmp[row-1];
2535: v2 = aa + ad[row] + 1;
2536: for (j=0; j<nz-1; j+=2) {
2537: i0 = vi[j];
2538: i1 = vi[j+1];
2539: tmp0 = tmps[i0];
2540: tmp1 = tmps[i1];
2541: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2542: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2543: }
2544: if (j == nz-1) {
2545: tmp0 = tmps[vi[j]];
2546: sum1 -= v1[j]* tmp0;
2547: sum2 -= v2[j+1]* tmp0;
2548: }
2550: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2551: sum2 -= v2[0] * tmp0;
2552: x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2553: break;
2554: case 3:
2555: sum1 = tmp[row];
2556: sum2 = tmp[row -1];
2557: sum3 = tmp[row -2];
2558: v2 = aa + ad[row] + 1;
2559: v3 = aa + ad[row -1] + 1;
2560: for (j=0; j<nz-1; j+=2) {
2561: i0 = vi[j];
2562: i1 = vi[j+1];
2563: tmp0 = tmps[i0];
2564: tmp1 = tmps[i1];
2565: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2566: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2567: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2568: }
2569: if (j== nz-1) {
2570: tmp0 = tmps[vi[j]];
2571: sum1 -= v1[j] * tmp0;
2572: sum2 -= v2[j+1] * tmp0;
2573: sum3 -= v3[j+2] * tmp0;
2574: }
2575: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2576: sum2 -= v2[0]* tmp0;
2577: sum3 -= v3[1] * tmp0;
2578: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2579: sum3 -= v3[0]* tmp0;
2580: x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2582: break;
2583: case 4:
2584: sum1 = tmp[row];
2585: sum2 = tmp[row -1];
2586: sum3 = tmp[row -2];
2587: sum4 = tmp[row -3];
2588: v2 = aa + ad[row]+1;
2589: v3 = aa + ad[row -1]+1;
2590: v4 = aa + ad[row -2]+1;
2592: for (j=0; j<nz-1; j+=2) {
2593: i0 = vi[j];
2594: i1 = vi[j+1];
2595: tmp0 = tmps[i0];
2596: tmp1 = tmps[i1];
2597: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2598: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2599: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2600: sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2601: }
2602: if (j== nz-1) {
2603: tmp0 = tmps[vi[j]];
2604: sum1 -= v1[j] * tmp0;
2605: sum2 -= v2[j+1] * tmp0;
2606: sum3 -= v3[j+2] * tmp0;
2607: sum4 -= v4[j+3] * tmp0;
2608: }
2610: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2611: sum2 -= v2[0] * tmp0;
2612: sum3 -= v3[1] * tmp0;
2613: sum4 -= v4[2] * tmp0;
2614: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2615: sum3 -= v3[0] * tmp0;
2616: sum4 -= v4[1] * tmp0;
2617: tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2618: sum4 -= v4[0] * tmp0;
2619: x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2620: break;
2621: case 5:
2622: sum1 = tmp[row];
2623: sum2 = tmp[row -1];
2624: sum3 = tmp[row -2];
2625: sum4 = tmp[row -3];
2626: sum5 = tmp[row -4];
2627: v2 = aa + ad[row]+1;
2628: v3 = aa + ad[row -1]+1;
2629: v4 = aa + ad[row -2]+1;
2630: v5 = aa + ad[row -3]+1;
2631: for (j=0; j<nz-1; j+=2) {
2632: i0 = vi[j];
2633: i1 = vi[j+1];
2634: tmp0 = tmps[i0];
2635: tmp1 = tmps[i1];
2636: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2637: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2638: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2639: sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2640: sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2641: }
2642: if (j==nz-1) {
2643: tmp0 = tmps[vi[j]];
2644: sum1 -= v1[j] * tmp0;
2645: sum2 -= v2[j+1] * tmp0;
2646: sum3 -= v3[j+2] * tmp0;
2647: sum4 -= v4[j+3] * tmp0;
2648: sum5 -= v5[j+4] * tmp0;
2649: }
2651: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2652: sum2 -= v2[0] * tmp0;
2653: sum3 -= v3[1] * tmp0;
2654: sum4 -= v4[2] * tmp0;
2655: sum5 -= v5[3] * tmp0;
2656: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2657: sum3 -= v3[0] * tmp0;
2658: sum4 -= v4[1] * tmp0;
2659: sum5 -= v5[2] * tmp0;
2660: tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2661: sum4 -= v4[0] * tmp0;
2662: sum5 -= v5[1] * tmp0;
2663: tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2664: sum5 -= v5[0] * tmp0;
2665: x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2666: break;
2667: default:
2668: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2669: }
2670: }
2671: ISRestoreIndices(isrow,&rout);
2672: ISRestoreIndices(iscol,&cout);
2673: VecRestoreArrayRead(bb,&b);
2674: VecRestoreArray(xx,&x);
2675: PetscLogFlops(2.0*a->nz - A->cmap->n);
2676: return(0);
2677: }
2680: /*
2681: Makes a longer coloring[] array and calls the usual code with that
2682: */
2683: PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2684: {
2685: Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data;
2686: PetscErrorCode ierr;
2687: PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2688: PetscInt *colorused,i;
2689: ISColoringValue *newcolor;
2692: PetscMalloc1(n+1,&newcolor);
2693: /* loop over inodes, marking a color for each column*/
2694: row = 0;
2695: for (i=0; i<m; i++) {
2696: for (j=0; j<ns[i]; j++) {
2697: newcolor[row++] = coloring[i] + j*ncolors;
2698: }
2699: }
2701: /* eliminate unneeded colors */
2702: PetscCalloc1(5*ncolors,&colorused);
2703: for (i=0; i<n; i++) {
2704: colorused[newcolor[i]] = 1;
2705: }
2707: for (i=1; i<5*ncolors; i++) {
2708: colorused[i] += colorused[i-1];
2709: }
2710: ncolors = colorused[5*ncolors-1];
2711: for (i=0; i<n; i++) {
2712: newcolor[i] = colorused[newcolor[i]]-1;
2713: }
2714: PetscFree(colorused);
2715: ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);
2716: PetscFree(coloring);
2717: return(0);
2718: }
2720: #include <petsc/private/kernels/blockinvert.h>
2722: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2723: {
2724: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2725: PetscScalar sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2726: MatScalar *ibdiag,*bdiag,work[25],*t;
2727: PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2728: const MatScalar *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2729: const PetscScalar *xb, *b;
2730: PetscReal zeropivot = 100.*PETSC_MACHINE_EPSILON, shift = 0.0;
2731: PetscErrorCode ierr;
2732: PetscInt n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2733: PetscInt sz,k,ipvt[5];
2734: PetscBool allowzeropivot,zeropivotdetected;
2735: const PetscInt *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;
2738: allowzeropivot = PetscNot(A->erroriffailure);
2739: if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2740: if (fshift == -1.0) fshift = 0.0; /* negative fshift indicates do not error on zero diagonal; this code never errors on zero diagonal */
2741: if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2743: if (!a->inode.ibdiagvalid) {
2744: if (!a->inode.ibdiag) {
2745: /* calculate space needed for diagonal blocks */
2746: for (i=0; i<m; i++) {
2747: cnt += sizes[i]*sizes[i];
2748: }
2749: a->inode.bdiagsize = cnt;
2751: PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);
2752: }
2754: /* copy over the diagonal blocks and invert them */
2755: ibdiag = a->inode.ibdiag;
2756: bdiag = a->inode.bdiag;
2757: cnt = 0;
2758: for (i=0, row = 0; i<m; i++) {
2759: for (j=0; j<sizes[i]; j++) {
2760: for (k=0; k<sizes[i]; k++) {
2761: bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2762: }
2763: }
2764: PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));
2766: switch (sizes[i]) {
2767: case 1:
2768: /* Create matrix data structure */
2769: if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2770: if (allowzeropivot) {
2771: A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2772: A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2773: A->factorerror_zeropivot_row = row;
2774: PetscInfo1(A,"Zero pivot, row %D\n",row);
2775: } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2776: }
2777: ibdiag[cnt] = 1.0/ibdiag[cnt];
2778: break;
2779: case 2:
2780: PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2781: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2782: break;
2783: case 3:
2784: PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2785: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2786: break;
2787: case 4:
2788: PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2789: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2790: break;
2791: case 5:
2792: PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);
2793: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2794: break;
2795: default:
2796: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2797: }
2798: cnt += sizes[i]*sizes[i];
2799: row += sizes[i];
2800: }
2801: a->inode.ibdiagvalid = PETSC_TRUE;
2802: }
2803: ibdiag = a->inode.ibdiag;
2804: bdiag = a->inode.bdiag;
2805: t = a->inode.ssor_work;
2807: VecGetArray(xx,&x);
2808: VecGetArrayRead(bb,&b);
2809: /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2810: if (flag & SOR_ZERO_INITIAL_GUESS) {
2811: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2813: for (i=0, row=0; i<m; i++) {
2814: sz = diag[row] - ii[row];
2815: v1 = a->a + ii[row];
2816: idx = a->j + ii[row];
2818: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2819: switch (sizes[i]) {
2820: case 1:
2822: sum1 = b[row];
2823: for (n = 0; n<sz-1; n+=2) {
2824: i1 = idx[0];
2825: i2 = idx[1];
2826: idx += 2;
2827: tmp0 = x[i1];
2828: tmp1 = x[i2];
2829: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2830: }
2832: if (n == sz-1) {
2833: tmp0 = x[*idx];
2834: sum1 -= *v1 * tmp0;
2835: }
2836: t[row] = sum1;
2837: x[row++] = sum1*(*ibdiag++);
2838: break;
2839: case 2:
2840: v2 = a->a + ii[row+1];
2841: sum1 = b[row];
2842: sum2 = b[row+1];
2843: for (n = 0; n<sz-1; n+=2) {
2844: i1 = idx[0];
2845: i2 = idx[1];
2846: idx += 2;
2847: tmp0 = x[i1];
2848: tmp1 = x[i2];
2849: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2850: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2851: }
2853: if (n == sz-1) {
2854: tmp0 = x[*idx];
2855: sum1 -= v1[0] * tmp0;
2856: sum2 -= v2[0] * tmp0;
2857: }
2858: t[row] = sum1;
2859: t[row+1] = sum2;
2860: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2861: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2862: ibdiag += 4;
2863: break;
2864: case 3:
2865: v2 = a->a + ii[row+1];
2866: v3 = a->a + ii[row+2];
2867: sum1 = b[row];
2868: sum2 = b[row+1];
2869: sum3 = b[row+2];
2870: for (n = 0; n<sz-1; n+=2) {
2871: i1 = idx[0];
2872: i2 = idx[1];
2873: idx += 2;
2874: tmp0 = x[i1];
2875: tmp1 = x[i2];
2876: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2877: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2878: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2879: }
2881: if (n == sz-1) {
2882: tmp0 = x[*idx];
2883: sum1 -= v1[0] * tmp0;
2884: sum2 -= v2[0] * tmp0;
2885: sum3 -= v3[0] * tmp0;
2886: }
2887: t[row] = sum1;
2888: t[row+1] = sum2;
2889: t[row+2] = sum3;
2890: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2891: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2892: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2893: ibdiag += 9;
2894: break;
2895: case 4:
2896: v2 = a->a + ii[row+1];
2897: v3 = a->a + ii[row+2];
2898: v4 = a->a + ii[row+3];
2899: sum1 = b[row];
2900: sum2 = b[row+1];
2901: sum3 = b[row+2];
2902: sum4 = b[row+3];
2903: for (n = 0; n<sz-1; n+=2) {
2904: i1 = idx[0];
2905: i2 = idx[1];
2906: idx += 2;
2907: tmp0 = x[i1];
2908: tmp1 = x[i2];
2909: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2910: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2911: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2912: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2913: }
2915: if (n == sz-1) {
2916: tmp0 = x[*idx];
2917: sum1 -= v1[0] * tmp0;
2918: sum2 -= v2[0] * tmp0;
2919: sum3 -= v3[0] * tmp0;
2920: sum4 -= v4[0] * tmp0;
2921: }
2922: t[row] = sum1;
2923: t[row+1] = sum2;
2924: t[row+2] = sum3;
2925: t[row+3] = sum4;
2926: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2927: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2928: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2929: x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2930: ibdiag += 16;
2931: break;
2932: case 5:
2933: v2 = a->a + ii[row+1];
2934: v3 = a->a + ii[row+2];
2935: v4 = a->a + ii[row+3];
2936: v5 = a->a + ii[row+4];
2937: sum1 = b[row];
2938: sum2 = b[row+1];
2939: sum3 = b[row+2];
2940: sum4 = b[row+3];
2941: sum5 = b[row+4];
2942: for (n = 0; n<sz-1; n+=2) {
2943: i1 = idx[0];
2944: i2 = idx[1];
2945: idx += 2;
2946: tmp0 = x[i1];
2947: tmp1 = x[i2];
2948: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2949: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2950: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2951: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2952: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2953: }
2955: if (n == sz-1) {
2956: tmp0 = x[*idx];
2957: sum1 -= v1[0] * tmp0;
2958: sum2 -= v2[0] * tmp0;
2959: sum3 -= v3[0] * tmp0;
2960: sum4 -= v4[0] * tmp0;
2961: sum5 -= v5[0] * tmp0;
2962: }
2963: t[row] = sum1;
2964: t[row+1] = sum2;
2965: t[row+2] = sum3;
2966: t[row+3] = sum4;
2967: t[row+4] = sum5;
2968: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2969: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2970: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2971: x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2972: x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2973: ibdiag += 25;
2974: break;
2975: default:
2976: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2977: }
2978: }
2980: xb = t;
2981: PetscLogFlops(a->nz);
2982: } else xb = b;
2983: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
2985: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2986: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2987: ibdiag -= sizes[i]*sizes[i];
2988: sz = ii[row+1] - diag[row] - 1;
2989: v1 = a->a + diag[row] + 1;
2990: idx = a->j + diag[row] + 1;
2992: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2993: switch (sizes[i]) {
2994: case 1:
2996: sum1 = xb[row];
2997: for (n = 0; n<sz-1; n+=2) {
2998: i1 = idx[0];
2999: i2 = idx[1];
3000: idx += 2;
3001: tmp0 = x[i1];
3002: tmp1 = x[i2];
3003: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3004: }
3006: if (n == sz-1) {
3007: tmp0 = x[*idx];
3008: sum1 -= *v1*tmp0;
3009: }
3010: x[row--] = sum1*(*ibdiag);
3011: break;
3013: case 2:
3015: sum1 = xb[row];
3016: sum2 = xb[row-1];
3017: /* note that sum1 is associated with the second of the two rows */
3018: v2 = a->a + diag[row-1] + 2;
3019: for (n = 0; n<sz-1; n+=2) {
3020: i1 = idx[0];
3021: i2 = idx[1];
3022: idx += 2;
3023: tmp0 = x[i1];
3024: tmp1 = x[i2];
3025: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3026: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3027: }
3029: if (n == sz-1) {
3030: tmp0 = x[*idx];
3031: sum1 -= *v1*tmp0;
3032: sum2 -= *v2*tmp0;
3033: }
3034: x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3035: x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3036: break;
3037: case 3:
3039: sum1 = xb[row];
3040: sum2 = xb[row-1];
3041: sum3 = xb[row-2];
3042: v2 = a->a + diag[row-1] + 2;
3043: v3 = a->a + diag[row-2] + 3;
3044: for (n = 0; n<sz-1; n+=2) {
3045: i1 = idx[0];
3046: i2 = idx[1];
3047: idx += 2;
3048: tmp0 = x[i1];
3049: tmp1 = x[i2];
3050: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3051: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3052: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3053: }
3055: if (n == sz-1) {
3056: tmp0 = x[*idx];
3057: sum1 -= *v1*tmp0;
3058: sum2 -= *v2*tmp0;
3059: sum3 -= *v3*tmp0;
3060: }
3061: x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3062: x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3063: x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3064: break;
3065: case 4:
3067: sum1 = xb[row];
3068: sum2 = xb[row-1];
3069: sum3 = xb[row-2];
3070: sum4 = xb[row-3];
3071: v2 = a->a + diag[row-1] + 2;
3072: v3 = a->a + diag[row-2] + 3;
3073: v4 = a->a + diag[row-3] + 4;
3074: for (n = 0; n<sz-1; n+=2) {
3075: i1 = idx[0];
3076: i2 = idx[1];
3077: idx += 2;
3078: tmp0 = x[i1];
3079: tmp1 = x[i2];
3080: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3081: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3082: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3083: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3084: }
3086: if (n == sz-1) {
3087: tmp0 = x[*idx];
3088: sum1 -= *v1*tmp0;
3089: sum2 -= *v2*tmp0;
3090: sum3 -= *v3*tmp0;
3091: sum4 -= *v4*tmp0;
3092: }
3093: x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3094: x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3095: x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3096: x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3097: break;
3098: case 5:
3100: sum1 = xb[row];
3101: sum2 = xb[row-1];
3102: sum3 = xb[row-2];
3103: sum4 = xb[row-3];
3104: sum5 = xb[row-4];
3105: v2 = a->a + diag[row-1] + 2;
3106: v3 = a->a + diag[row-2] + 3;
3107: v4 = a->a + diag[row-3] + 4;
3108: v5 = a->a + diag[row-4] + 5;
3109: for (n = 0; n<sz-1; n+=2) {
3110: i1 = idx[0];
3111: i2 = idx[1];
3112: idx += 2;
3113: tmp0 = x[i1];
3114: tmp1 = x[i2];
3115: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3116: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3117: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3118: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3119: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3120: }
3122: if (n == sz-1) {
3123: tmp0 = x[*idx];
3124: sum1 -= *v1*tmp0;
3125: sum2 -= *v2*tmp0;
3126: sum3 -= *v3*tmp0;
3127: sum4 -= *v4*tmp0;
3128: sum5 -= *v5*tmp0;
3129: }
3130: x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3131: x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3132: x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3133: x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3134: x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3135: break;
3136: default:
3137: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3138: }
3139: }
3141: PetscLogFlops(a->nz);
3142: }
3143: its--;
3144: }
3145: while (its--) {
3147: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3148: for (i=0, row=0, ibdiag = a->inode.ibdiag;
3149: i<m;
3150: row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {
3152: sz = diag[row] - ii[row];
3153: v1 = a->a + ii[row];
3154: idx = a->j + ii[row];
3155: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3156: switch (sizes[i]) {
3157: case 1:
3158: sum1 = b[row];
3159: for (n = 0; n<sz-1; n+=2) {
3160: i1 = idx[0];
3161: i2 = idx[1];
3162: idx += 2;
3163: tmp0 = x[i1];
3164: tmp1 = x[i2];
3165: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3166: }
3167: if (n == sz-1) {
3168: tmp0 = x[*idx++];
3169: sum1 -= *v1 * tmp0;
3170: v1++;
3171: }
3172: t[row] = sum1;
3173: sz = ii[row+1] - diag[row] - 1;
3174: idx = a->j + diag[row] + 1;
3175: v1 += 1;
3176: for (n = 0; n<sz-1; n+=2) {
3177: i1 = idx[0];
3178: i2 = idx[1];
3179: idx += 2;
3180: tmp0 = x[i1];
3181: tmp1 = x[i2];
3182: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3183: }
3184: if (n == sz-1) {
3185: tmp0 = x[*idx++];
3186: sum1 -= *v1 * tmp0;
3187: }
3188: /* in MatSOR_SeqAIJ this line would be
3189: *
3190: * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3191: *
3192: * but omega == 1, so this becomes
3193: *
3194: * x[row] = sum1*(*ibdiag++);
3195: *
3196: */
3197: x[row] = sum1*(*ibdiag);
3198: break;
3199: case 2:
3200: v2 = a->a + ii[row+1];
3201: sum1 = b[row];
3202: sum2 = b[row+1];
3203: for (n = 0; n<sz-1; n+=2) {
3204: i1 = idx[0];
3205: i2 = idx[1];
3206: idx += 2;
3207: tmp0 = x[i1];
3208: tmp1 = x[i2];
3209: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3210: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3211: }
3212: if (n == sz-1) {
3213: tmp0 = x[*idx++];
3214: sum1 -= v1[0] * tmp0;
3215: sum2 -= v2[0] * tmp0;
3216: v1++; v2++;
3217: }
3218: t[row] = sum1;
3219: t[row+1] = sum2;
3220: sz = ii[row+1] - diag[row] - 2;
3221: idx = a->j + diag[row] + 2;
3222: v1 += 2;
3223: v2 += 2;
3224: for (n = 0; n<sz-1; n+=2) {
3225: i1 = idx[0];
3226: i2 = idx[1];
3227: idx += 2;
3228: tmp0 = x[i1];
3229: tmp1 = x[i2];
3230: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3231: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3232: }
3233: if (n == sz-1) {
3234: tmp0 = x[*idx];
3235: sum1 -= v1[0] * tmp0;
3236: sum2 -= v2[0] * tmp0;
3237: }
3238: x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3239: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3240: break;
3241: case 3:
3242: v2 = a->a + ii[row+1];
3243: v3 = a->a + ii[row+2];
3244: sum1 = b[row];
3245: sum2 = b[row+1];
3246: sum3 = b[row+2];
3247: for (n = 0; n<sz-1; n+=2) {
3248: i1 = idx[0];
3249: i2 = idx[1];
3250: idx += 2;
3251: tmp0 = x[i1];
3252: tmp1 = x[i2];
3253: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3254: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3255: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3256: }
3257: if (n == sz-1) {
3258: tmp0 = x[*idx++];
3259: sum1 -= v1[0] * tmp0;
3260: sum2 -= v2[0] * tmp0;
3261: sum3 -= v3[0] * tmp0;
3262: v1++; v2++; v3++;
3263: }
3264: t[row] = sum1;
3265: t[row+1] = sum2;
3266: t[row+2] = sum3;
3267: sz = ii[row+1] - diag[row] - 3;
3268: idx = a->j + diag[row] + 3;
3269: v1 += 3;
3270: v2 += 3;
3271: v3 += 3;
3272: for (n = 0; n<sz-1; n+=2) {
3273: i1 = idx[0];
3274: i2 = idx[1];
3275: idx += 2;
3276: tmp0 = x[i1];
3277: tmp1 = x[i2];
3278: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3279: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3280: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3281: }
3282: if (n == sz-1) {
3283: tmp0 = x[*idx];
3284: sum1 -= v1[0] * tmp0;
3285: sum2 -= v2[0] * tmp0;
3286: sum3 -= v3[0] * tmp0;
3287: }
3288: x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3289: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3290: x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3291: break;
3292: case 4:
3293: v2 = a->a + ii[row+1];
3294: v3 = a->a + ii[row+2];
3295: v4 = a->a + ii[row+3];
3296: sum1 = b[row];
3297: sum2 = b[row+1];
3298: sum3 = b[row+2];
3299: sum4 = b[row+3];
3300: for (n = 0; n<sz-1; n+=2) {
3301: i1 = idx[0];
3302: i2 = idx[1];
3303: idx += 2;
3304: tmp0 = x[i1];
3305: tmp1 = x[i2];
3306: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3307: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3308: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3309: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3310: }
3311: if (n == sz-1) {
3312: tmp0 = x[*idx++];
3313: sum1 -= v1[0] * tmp0;
3314: sum2 -= v2[0] * tmp0;
3315: sum3 -= v3[0] * tmp0;
3316: sum4 -= v4[0] * tmp0;
3317: v1++; v2++; v3++; v4++;
3318: }
3319: t[row] = sum1;
3320: t[row+1] = sum2;
3321: t[row+2] = sum3;
3322: t[row+3] = sum4;
3323: sz = ii[row+1] - diag[row] - 4;
3324: idx = a->j + diag[row] + 4;
3325: v1 += 4;
3326: v2 += 4;
3327: v3 += 4;
3328: v4 += 4;
3329: for (n = 0; n<sz-1; n+=2) {
3330: i1 = idx[0];
3331: i2 = idx[1];
3332: idx += 2;
3333: tmp0 = x[i1];
3334: tmp1 = x[i2];
3335: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3336: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3337: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3338: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3339: }
3340: if (n == sz-1) {
3341: tmp0 = x[*idx];
3342: sum1 -= v1[0] * tmp0;
3343: sum2 -= v2[0] * tmp0;
3344: sum3 -= v3[0] * tmp0;
3345: sum4 -= v4[0] * tmp0;
3346: }
3347: x[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3348: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3349: x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3350: x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3351: break;
3352: case 5:
3353: v2 = a->a + ii[row+1];
3354: v3 = a->a + ii[row+2];
3355: v4 = a->a + ii[row+3];
3356: v5 = a->a + ii[row+4];
3357: sum1 = b[row];
3358: sum2 = b[row+1];
3359: sum3 = b[row+2];
3360: sum4 = b[row+3];
3361: sum5 = b[row+4];
3362: for (n = 0; n<sz-1; n+=2) {
3363: i1 = idx[0];
3364: i2 = idx[1];
3365: idx += 2;
3366: tmp0 = x[i1];
3367: tmp1 = x[i2];
3368: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3369: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3370: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3371: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3372: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3373: }
3374: if (n == sz-1) {
3375: tmp0 = x[*idx++];
3376: sum1 -= v1[0] * tmp0;
3377: sum2 -= v2[0] * tmp0;
3378: sum3 -= v3[0] * tmp0;
3379: sum4 -= v4[0] * tmp0;
3380: sum5 -= v5[0] * tmp0;
3381: v1++; v2++; v3++; v4++; v5++;
3382: }
3383: t[row] = sum1;
3384: t[row+1] = sum2;
3385: t[row+2] = sum3;
3386: t[row+3] = sum4;
3387: t[row+4] = sum5;
3388: sz = ii[row+1] - diag[row] - 5;
3389: idx = a->j + diag[row] + 5;
3390: v1 += 5;
3391: v2 += 5;
3392: v3 += 5;
3393: v4 += 5;
3394: v5 += 5;
3395: for (n = 0; n<sz-1; n+=2) {
3396: i1 = idx[0];
3397: i2 = idx[1];
3398: idx += 2;
3399: tmp0 = x[i1];
3400: tmp1 = x[i2];
3401: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3402: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3403: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3404: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3405: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3406: }
3407: if (n == sz-1) {
3408: tmp0 = x[*idx];
3409: sum1 -= v1[0] * tmp0;
3410: sum2 -= v2[0] * tmp0;
3411: sum3 -= v3[0] * tmp0;
3412: sum4 -= v4[0] * tmp0;
3413: sum5 -= v5[0] * tmp0;
3414: }
3415: x[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3416: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3417: x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3418: x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3419: x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3420: break;
3421: default:
3422: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3423: }
3424: }
3425: xb = t;
3426: PetscLogFlops(2.0*a->nz); /* undercounts diag inverse */
3427: } else xb = b;
3429: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3431: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3432: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3433: ibdiag -= sizes[i]*sizes[i];
3435: /* set RHS */
3436: if (xb == b) {
3437: /* whole (old way) */
3438: sz = ii[row+1] - ii[row];
3439: idx = a->j + ii[row];
3440: switch (sizes[i]) {
3441: case 5:
3442: v5 = a->a + ii[row-4];
3443: case 4: /* fall through */
3444: v4 = a->a + ii[row-3];
3445: case 3:
3446: v3 = a->a + ii[row-2];
3447: case 2:
3448: v2 = a->a + ii[row-1];
3449: case 1:
3450: v1 = a->a + ii[row];
3451: break;
3452: default:
3453: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3454: }
3455: } else {
3456: /* upper, no diag */
3457: sz = ii[row+1] - diag[row] - 1;
3458: idx = a->j + diag[row] + 1;
3459: switch (sizes[i]) {
3460: case 5:
3461: v5 = a->a + diag[row-4] + 5;
3462: case 4: /* fall through */
3463: v4 = a->a + diag[row-3] + 4;
3464: case 3:
3465: v3 = a->a + diag[row-2] + 3;
3466: case 2:
3467: v2 = a->a + diag[row-1] + 2;
3468: case 1:
3469: v1 = a->a + diag[row] + 1;
3470: }
3471: }
3472: /* set sum */
3473: switch (sizes[i]) {
3474: case 5:
3475: sum5 = xb[row-4];
3476: case 4: /* fall through */
3477: sum4 = xb[row-3];
3478: case 3:
3479: sum3 = xb[row-2];
3480: case 2:
3481: sum2 = xb[row-1];
3482: case 1:
3483: /* note that sum1 is associated with the last row */
3484: sum1 = xb[row];
3485: }
3486: /* do sums */
3487: for (n = 0; n<sz-1; n+=2) {
3488: i1 = idx[0];
3489: i2 = idx[1];
3490: idx += 2;
3491: tmp0 = x[i1];
3492: tmp1 = x[i2];
3493: switch (sizes[i]) {
3494: case 5:
3495: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3496: case 4: /* fall through */
3497: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3498: case 3:
3499: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3500: case 2:
3501: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3502: case 1:
3503: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3504: }
3505: }
3506: /* ragged edge */
3507: if (n == sz-1) {
3508: tmp0 = x[*idx];
3509: switch (sizes[i]) {
3510: case 5:
3511: sum5 -= *v5*tmp0;
3512: case 4: /* fall through */
3513: sum4 -= *v4*tmp0;
3514: case 3:
3515: sum3 -= *v3*tmp0;
3516: case 2:
3517: sum2 -= *v2*tmp0;
3518: case 1:
3519: sum1 -= *v1*tmp0;
3520: }
3521: }
3522: /* update */
3523: if (xb == b) {
3524: /* whole (old way) w/ diag */
3525: switch (sizes[i]) {
3526: case 5:
3527: x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3528: x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3529: x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3530: x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3531: x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3532: break;
3533: case 4:
3534: x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3535: x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3536: x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3537: x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3538: break;
3539: case 3:
3540: x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3541: x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3542: x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3543: break;
3544: case 2:
3545: x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3546: x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3547: break;
3548: case 1:
3549: x[row--] += sum1*(*ibdiag);
3550: break;
3551: }
3552: } else {
3553: /* no diag so set = */
3554: switch (sizes[i]) {
3555: case 5:
3556: x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3557: x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3558: x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3559: x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3560: x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3561: break;
3562: case 4:
3563: x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3564: x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3565: x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3566: x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3567: break;
3568: case 3:
3569: x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3570: x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3571: x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3572: break;
3573: case 2:
3574: x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3575: x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3576: break;
3577: case 1:
3578: x[row--] = sum1*(*ibdiag);
3579: break;
3580: }
3581: }
3582: }
3583: if (xb == b) {
3584: PetscLogFlops(2.0*a->nz);
3585: } else {
3586: PetscLogFlops(a->nz); /* assumes 1/2 in upper, undercounts diag inverse */
3587: }
3588: }
3589: }
3590: if (flag & SOR_EISENSTAT) {
3591: /*
3592: Apply (U + D)^-1 where D is now the block diagonal
3593: */
3594: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3595: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3596: ibdiag -= sizes[i]*sizes[i];
3597: sz = ii[row+1] - diag[row] - 1;
3598: v1 = a->a + diag[row] + 1;
3599: idx = a->j + diag[row] + 1;
3600: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3601: switch (sizes[i]) {
3602: case 1:
3604: sum1 = b[row];
3605: for (n = 0; n<sz-1; n+=2) {
3606: i1 = idx[0];
3607: i2 = idx[1];
3608: idx += 2;
3609: tmp0 = x[i1];
3610: tmp1 = x[i2];
3611: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3612: }
3614: if (n == sz-1) {
3615: tmp0 = x[*idx];
3616: sum1 -= *v1*tmp0;
3617: }
3618: x[row] = sum1*(*ibdiag);row--;
3619: break;
3621: case 2:
3623: sum1 = b[row];
3624: sum2 = b[row-1];
3625: /* note that sum1 is associated with the second of the two rows */
3626: v2 = a->a + diag[row-1] + 2;
3627: for (n = 0; n<sz-1; n+=2) {
3628: i1 = idx[0];
3629: i2 = idx[1];
3630: idx += 2;
3631: tmp0 = x[i1];
3632: tmp1 = x[i2];
3633: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3634: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3635: }
3637: if (n == sz-1) {
3638: tmp0 = x[*idx];
3639: sum1 -= *v1*tmp0;
3640: sum2 -= *v2*tmp0;
3641: }
3642: x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
3643: x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3644: row -= 2;
3645: break;
3646: case 3:
3648: sum1 = b[row];
3649: sum2 = b[row-1];
3650: sum3 = b[row-2];
3651: v2 = a->a + diag[row-1] + 2;
3652: v3 = a->a + diag[row-2] + 3;
3653: for (n = 0; n<sz-1; n+=2) {
3654: i1 = idx[0];
3655: i2 = idx[1];
3656: idx += 2;
3657: tmp0 = x[i1];
3658: tmp1 = x[i2];
3659: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3660: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3661: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3662: }
3664: if (n == sz-1) {
3665: tmp0 = x[*idx];
3666: sum1 -= *v1*tmp0;
3667: sum2 -= *v2*tmp0;
3668: sum3 -= *v3*tmp0;
3669: }
3670: x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3671: x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3672: x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3673: row -= 3;
3674: break;
3675: case 4:
3677: sum1 = b[row];
3678: sum2 = b[row-1];
3679: sum3 = b[row-2];
3680: sum4 = b[row-3];
3681: v2 = a->a + diag[row-1] + 2;
3682: v3 = a->a + diag[row-2] + 3;
3683: v4 = a->a + diag[row-3] + 4;
3684: for (n = 0; n<sz-1; n+=2) {
3685: i1 = idx[0];
3686: i2 = idx[1];
3687: idx += 2;
3688: tmp0 = x[i1];
3689: tmp1 = x[i2];
3690: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3691: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3692: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3693: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3694: }
3696: if (n == sz-1) {
3697: tmp0 = x[*idx];
3698: sum1 -= *v1*tmp0;
3699: sum2 -= *v2*tmp0;
3700: sum3 -= *v3*tmp0;
3701: sum4 -= *v4*tmp0;
3702: }
3703: x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3704: x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3705: x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3706: x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3707: row -= 4;
3708: break;
3709: case 5:
3711: sum1 = b[row];
3712: sum2 = b[row-1];
3713: sum3 = b[row-2];
3714: sum4 = b[row-3];
3715: sum5 = b[row-4];
3716: v2 = a->a + diag[row-1] + 2;
3717: v3 = a->a + diag[row-2] + 3;
3718: v4 = a->a + diag[row-3] + 4;
3719: v5 = a->a + diag[row-4] + 5;
3720: for (n = 0; n<sz-1; n+=2) {
3721: i1 = idx[0];
3722: i2 = idx[1];
3723: idx += 2;
3724: tmp0 = x[i1];
3725: tmp1 = x[i2];
3726: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3727: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3728: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3729: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3730: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3731: }
3733: if (n == sz-1) {
3734: tmp0 = x[*idx];
3735: sum1 -= *v1*tmp0;
3736: sum2 -= *v2*tmp0;
3737: sum3 -= *v3*tmp0;
3738: sum4 -= *v4*tmp0;
3739: sum5 -= *v5*tmp0;
3740: }
3741: x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3742: x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3743: x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3744: x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3745: x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3746: row -= 5;
3747: break;
3748: default:
3749: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3750: }
3751: }
3752: PetscLogFlops(a->nz);
3754: /*
3755: t = b - D x where D is the block diagonal
3756: */
3757: cnt = 0;
3758: for (i=0, row=0; i<m; i++) {
3759: switch (sizes[i]) {
3760: case 1:
3761: t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3762: break;
3763: case 2:
3764: x1 = x[row]; x2 = x[row+1];
3765: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3766: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3767: t[row] = b[row] - tmp1;
3768: t[row+1] = b[row+1] - tmp2; row += 2;
3769: cnt += 4;
3770: break;
3771: case 3:
3772: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2];
3773: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3774: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3775: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3776: t[row] = b[row] - tmp1;
3777: t[row+1] = b[row+1] - tmp2;
3778: t[row+2] = b[row+2] - tmp3; row += 3;
3779: cnt += 9;
3780: break;
3781: case 4:
3782: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3783: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3784: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3785: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3786: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3787: t[row] = b[row] - tmp1;
3788: t[row+1] = b[row+1] - tmp2;
3789: t[row+2] = b[row+2] - tmp3;
3790: t[row+3] = b[row+3] - tmp4; row += 4;
3791: cnt += 16;
3792: break;
3793: case 5:
3794: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3795: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3796: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3797: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3798: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3799: tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3800: t[row] = b[row] - tmp1;
3801: t[row+1] = b[row+1] - tmp2;
3802: t[row+2] = b[row+2] - tmp3;
3803: t[row+3] = b[row+3] - tmp4;
3804: t[row+4] = b[row+4] - tmp5;row += 5;
3805: cnt += 25;
3806: break;
3807: default:
3808: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3809: }
3810: }
3811: PetscLogFlops(m);
3815: /*
3816: Apply (L + D)^-1 where D is the block diagonal
3817: */
3818: for (i=0, row=0; i<m; i++) {
3819: sz = diag[row] - ii[row];
3820: v1 = a->a + ii[row];
3821: idx = a->j + ii[row];
3822: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3823: switch (sizes[i]) {
3824: case 1:
3826: sum1 = t[row];
3827: for (n = 0; n<sz-1; n+=2) {
3828: i1 = idx[0];
3829: i2 = idx[1];
3830: idx += 2;
3831: tmp0 = t[i1];
3832: tmp1 = t[i2];
3833: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3834: }
3836: if (n == sz-1) {
3837: tmp0 = t[*idx];
3838: sum1 -= *v1 * tmp0;
3839: }
3840: x[row] += t[row] = sum1*(*ibdiag++); row++;
3841: break;
3842: case 2:
3843: v2 = a->a + ii[row+1];
3844: sum1 = t[row];
3845: sum2 = t[row+1];
3846: for (n = 0; n<sz-1; n+=2) {
3847: i1 = idx[0];
3848: i2 = idx[1];
3849: idx += 2;
3850: tmp0 = t[i1];
3851: tmp1 = t[i2];
3852: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3853: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3854: }
3856: if (n == sz-1) {
3857: tmp0 = t[*idx];
3858: sum1 -= v1[0] * tmp0;
3859: sum2 -= v2[0] * tmp0;
3860: }
3861: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3862: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3863: ibdiag += 4; row += 2;
3864: break;
3865: case 3:
3866: v2 = a->a + ii[row+1];
3867: v3 = a->a + ii[row+2];
3868: sum1 = t[row];
3869: sum2 = t[row+1];
3870: sum3 = t[row+2];
3871: for (n = 0; n<sz-1; n+=2) {
3872: i1 = idx[0];
3873: i2 = idx[1];
3874: idx += 2;
3875: tmp0 = t[i1];
3876: tmp1 = t[i2];
3877: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3878: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3879: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3880: }
3882: if (n == sz-1) {
3883: tmp0 = t[*idx];
3884: sum1 -= v1[0] * tmp0;
3885: sum2 -= v2[0] * tmp0;
3886: sum3 -= v3[0] * tmp0;
3887: }
3888: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3889: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3890: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3891: ibdiag += 9; row += 3;
3892: break;
3893: case 4:
3894: v2 = a->a + ii[row+1];
3895: v3 = a->a + ii[row+2];
3896: v4 = a->a + ii[row+3];
3897: sum1 = t[row];
3898: sum2 = t[row+1];
3899: sum3 = t[row+2];
3900: sum4 = t[row+3];
3901: for (n = 0; n<sz-1; n+=2) {
3902: i1 = idx[0];
3903: i2 = idx[1];
3904: idx += 2;
3905: tmp0 = t[i1];
3906: tmp1 = t[i2];
3907: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3908: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3909: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3910: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3911: }
3913: if (n == sz-1) {
3914: tmp0 = t[*idx];
3915: sum1 -= v1[0] * tmp0;
3916: sum2 -= v2[0] * tmp0;
3917: sum3 -= v3[0] * tmp0;
3918: sum4 -= v4[0] * tmp0;
3919: }
3920: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3921: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3922: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3923: x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3924: ibdiag += 16; row += 4;
3925: break;
3926: case 5:
3927: v2 = a->a + ii[row+1];
3928: v3 = a->a + ii[row+2];
3929: v4 = a->a + ii[row+3];
3930: v5 = a->a + ii[row+4];
3931: sum1 = t[row];
3932: sum2 = t[row+1];
3933: sum3 = t[row+2];
3934: sum4 = t[row+3];
3935: sum5 = t[row+4];
3936: for (n = 0; n<sz-1; n+=2) {
3937: i1 = idx[0];
3938: i2 = idx[1];
3939: idx += 2;
3940: tmp0 = t[i1];
3941: tmp1 = t[i2];
3942: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3943: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3944: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3945: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3946: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3947: }
3949: if (n == sz-1) {
3950: tmp0 = t[*idx];
3951: sum1 -= v1[0] * tmp0;
3952: sum2 -= v2[0] * tmp0;
3953: sum3 -= v3[0] * tmp0;
3954: sum4 -= v4[0] * tmp0;
3955: sum5 -= v5[0] * tmp0;
3956: }
3957: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3958: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3959: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3960: x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3961: x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3962: ibdiag += 25; row += 5;
3963: break;
3964: default:
3965: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3966: }
3967: }
3968: PetscLogFlops(a->nz);
3969: }
3970: VecRestoreArray(xx,&x);
3971: VecRestoreArrayRead(bb,&b);
3972: return(0);
3973: }
3975: PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3976: {
3977: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3978: PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3979: const MatScalar *bdiag = a->inode.bdiag;
3980: const PetscScalar *b;
3981: PetscErrorCode ierr;
3982: PetscInt m = a->inode.node_count,cnt = 0,i,row;
3983: const PetscInt *sizes = a->inode.size;
3986: VecGetArray(xx,&x);
3987: VecGetArrayRead(bb,&b);
3988: cnt = 0;
3989: for (i=0, row=0; i<m; i++) {
3990: switch (sizes[i]) {
3991: case 1:
3992: x[row] = b[row]*bdiag[cnt++];row++;
3993: break;
3994: case 2:
3995: x1 = b[row]; x2 = b[row+1];
3996: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3997: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3998: x[row++] = tmp1;
3999: x[row++] = tmp2;
4000: cnt += 4;
4001: break;
4002: case 3:
4003: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2];
4004: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
4005: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
4006: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
4007: x[row++] = tmp1;
4008: x[row++] = tmp2;
4009: x[row++] = tmp3;
4010: cnt += 9;
4011: break;
4012: case 4:
4013: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
4014: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
4015: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
4016: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
4017: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
4018: x[row++] = tmp1;
4019: x[row++] = tmp2;
4020: x[row++] = tmp3;
4021: x[row++] = tmp4;
4022: cnt += 16;
4023: break;
4024: case 5:
4025: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
4026: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
4027: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
4028: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
4029: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4030: tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4031: x[row++] = tmp1;
4032: x[row++] = tmp2;
4033: x[row++] = tmp3;
4034: x[row++] = tmp4;
4035: x[row++] = tmp5;
4036: cnt += 25;
4037: break;
4038: default:
4039: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
4040: }
4041: }
4042: PetscLogFlops(2*cnt);
4043: VecRestoreArray(xx,&x);
4044: VecRestoreArrayRead(bb,&b);
4045: return(0);
4046: }
4048: /*
4049: samestructure indicates that the matrix has not changed its nonzero structure so we
4050: do not need to recompute the inodes
4051: */
4052: PetscErrorCode MatSeqAIJCheckInode(Mat A)
4053: {
4054: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4056: PetscInt i,j,m,nzx,nzy,*ns,node_count,blk_size;
4057: PetscBool flag;
4058: const PetscInt *idx,*idy,*ii;
4061: if (!a->inode.use) return(0);
4062: if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) return(0);
4064: m = A->rmap->n;
4065: if (a->inode.size) ns = a->inode.size;
4066: else {
4067: PetscMalloc1(m+1,&ns);
4068: }
4070: i = 0;
4071: node_count = 0;
4072: idx = a->j;
4073: ii = a->i;
4074: while (i < m) { /* For each row */
4075: nzx = ii[i+1] - ii[i]; /* Number of nonzeros */
4076: /* Limits the number of elements in a node to 'a->inode.limit' */
4077: for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4078: nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */
4079: if (nzy != nzx) break;
4080: idy += nzx; /* Same nonzero pattern */
4081: PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);
4082: if (!flag) break;
4083: }
4084: ns[node_count++] = blk_size;
4085: idx += blk_size*nzx;
4086: i = j;
4087: }
4088: /* If not enough inodes found,, do not use inode version of the routines */
4089: if (!m || node_count > .8*m) {
4090: PetscFree(ns);
4092: a->inode.node_count = 0;
4093: a->inode.size = NULL;
4094: a->inode.use = PETSC_FALSE;
4095: A->ops->mult = MatMult_SeqAIJ;
4096: A->ops->sor = MatSOR_SeqAIJ;
4097: A->ops->multadd = MatMultAdd_SeqAIJ;
4098: A->ops->getrowij = MatGetRowIJ_SeqAIJ;
4099: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ;
4100: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ;
4101: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ;
4102: A->ops->coloringpatch = 0;
4103: A->ops->multdiagonalblock = 0;
4105: PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4106: } else {
4107: if (!A->factortype) {
4108: A->ops->mult = MatMult_SeqAIJ_Inode;
4109: A->ops->sor = MatSOR_SeqAIJ_Inode;
4110: A->ops->multadd = MatMultAdd_SeqAIJ_Inode;
4111: A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4112: if (A->rmap->n == A->cmap->n) {
4113: A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4114: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4115: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4116: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4117: A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4118: }
4119: } else {
4120: A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4121: }
4122: a->inode.node_count = node_count;
4123: a->inode.size = ns;
4124: PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4125: }
4126: a->inode.checked = PETSC_TRUE;
4127: a->inode.mat_nonzerostate = A->nonzerostate;
4128: return(0);
4129: }
4131: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4132: {
4133: Mat B =*C;
4134: Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4136: PetscInt m=A->rmap->n;
4139: c->inode.use = a->inode.use;
4140: c->inode.limit = a->inode.limit;
4141: c->inode.max_limit = a->inode.max_limit;
4142: if (a->inode.size) {
4143: PetscMalloc1(m+1,&c->inode.size);
4144: c->inode.node_count = a->inode.node_count;
4145: PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));
4146: /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4147: if (!B->factortype) {
4148: B->ops->mult = MatMult_SeqAIJ_Inode;
4149: B->ops->sor = MatSOR_SeqAIJ_Inode;
4150: B->ops->multadd = MatMultAdd_SeqAIJ_Inode;
4151: B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4152: B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4153: B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4154: B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4155: B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4156: B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4157: } else {
4158: B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4159: }
4160: } else {
4161: c->inode.size = 0;
4162: c->inode.node_count = 0;
4163: }
4164: c->inode.ibdiagvalid = PETSC_FALSE;
4165: c->inode.ibdiag = 0;
4166: c->inode.bdiag = 0;
4167: return(0);
4168: }
4170: PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4171: {
4172: PetscInt k;
4173: const PetscInt *vi;
4176: vi = aj + ai[row];
4177: for (k=0; k<nzl; k++) cols[k] = vi[k];
4178: vi = aj + adiag[row];
4179: cols[nzl] = vi[0];
4180: vi = aj + adiag[row+1]+1;
4181: for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4182: return(0);
4183: }
4184: /*
4185: MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4186: Modified from MatSeqAIJCheckInode().
4188: Input Parameters:
4189: . Mat A - ILU or LU matrix factor
4191: */
4192: PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4193: {
4194: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4196: PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4197: PetscInt *cols1,*cols2,*ns;
4198: const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4199: PetscBool flag;
4202: if (!a->inode.use) return(0);
4203: if (a->inode.checked) return(0);
4205: m = A->rmap->n;
4206: if (a->inode.size) ns = a->inode.size;
4207: else {
4208: PetscMalloc1(m+1,&ns);
4209: }
4211: i = 0;
4212: node_count = 0;
4213: PetscMalloc2(m,&cols1,m,&cols2);
4214: while (i < m) { /* For each row */
4215: nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */
4216: nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4217: nzx = nzl1 + nzu1 + 1;
4218: MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4220: /* Limits the number of elements in a node to 'a->inode.limit' */
4221: for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4222: nzl2 = ai[j+1] - ai[j];
4223: nzu2 = adiag[j] - adiag[j+1] - 1;
4224: nzy = nzl2 + nzu2 + 1;
4225: if (nzy != nzx) break;
4226: MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
4227: PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);
4228: if (!flag) break;
4229: }
4230: ns[node_count++] = blk_size;
4231: i = j;
4232: }
4233: PetscFree2(cols1,cols2);
4234: /* If not enough inodes found,, do not use inode version of the routines */
4235: if (!m || node_count > .8*m) {
4236: PetscFree(ns);
4238: a->inode.node_count = 0;
4239: a->inode.size = NULL;
4240: a->inode.use = PETSC_FALSE;
4242: PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4243: } else {
4244: A->ops->mult = 0;
4245: A->ops->sor = 0;
4246: A->ops->multadd = 0;
4247: A->ops->getrowij = 0;
4248: A->ops->restorerowij = 0;
4249: A->ops->getcolumnij = 0;
4250: A->ops->restorecolumnij = 0;
4251: A->ops->coloringpatch = 0;
4252: A->ops->multdiagonalblock = 0;
4253: a->inode.node_count = node_count;
4254: a->inode.size = ns;
4256: PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4257: }
4258: a->inode.checked = PETSC_TRUE;
4259: return(0);
4260: }
4262: PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4263: {
4264: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4267: a->inode.ibdiagvalid = PETSC_FALSE;
4268: return(0);
4269: }
4271: /*
4272: This is really ugly. if inodes are used this replaces the
4273: permutations with ones that correspond to rows/cols of the matrix
4274: rather then inode blocks
4275: */
4276: PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4277: {
4281: PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));
4282: return(0);
4283: }
4285: PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4286: {
4287: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4289: PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4290: const PetscInt *ridx,*cidx;
4291: PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx;
4292: PetscInt nslim_col,*ns_col;
4293: IS ris = *rperm,cis = *cperm;
4296: if (!a->inode.size) return(0); /* no inodes so return */
4297: if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */
4299: Mat_CreateColInode(A,&nslim_col,&ns_col);
4300: PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);
4301: PetscMalloc2(m,&permr,n,&permc);
4303: ISGetIndices(ris,&ridx);
4304: ISGetIndices(cis,&cidx);
4306: /* Form the inode structure for the rows of permuted matric using inv perm*/
4307: for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4309: /* Construct the permutations for rows*/
4310: for (i=0,row = 0; i<nslim_row; ++i) {
4311: indx = ridx[i];
4312: start_val = tns[indx];
4313: end_val = tns[indx + 1];
4314: for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4315: }
4317: /* Form the inode structure for the columns of permuted matrix using inv perm*/
4318: for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4320: /* Construct permutations for columns */
4321: for (i=0,col=0; i<nslim_col; ++i) {
4322: indx = cidx[i];
4323: start_val = tns[indx];
4324: end_val = tns[indx + 1];
4325: for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4326: }
4328: ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);
4329: ISSetPermutation(*rperm);
4330: ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);
4331: ISSetPermutation(*cperm);
4333: ISRestoreIndices(ris,&ridx);
4334: ISRestoreIndices(cis,&cidx);
4336: PetscFree(ns_col);
4337: PetscFree2(permr,permc);
4338: ISDestroy(&cis);
4339: ISDestroy(&ris);
4340: PetscFree(tns);
4341: return(0);
4342: }
4344: /*@C
4345: MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4347: Not Collective
4349: Input Parameter:
4350: . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4352: Output Parameter:
4353: + node_count - no of inodes present in the matrix.
4354: . sizes - an array of size node_count,with sizes of each inode.
4355: - limit - the max size used to generate the inodes.
4357: Level: advanced
4359: Notes: This routine returns some internal storage information
4360: of the matrix, it is intended to be used by advanced users.
4361: It should be called after the matrix is assembled.
4362: The contents of the sizes[] array should not be changed.
4363: NULL may be passed for information not requested.
4365: .keywords: matrix, seqaij, get, inode
4367: .seealso: MatGetInfo()
4368: @*/
4369: PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4370: {
4371: PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4374: if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4375: PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);
4376: if (f) {
4377: (*f)(A,node_count,sizes,limit);
4378: }
4379: return(0);
4380: }
4382: PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4383: {
4384: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4387: if (node_count) *node_count = a->inode.node_count;
4388: if (sizes) *sizes = a->inode.size;
4389: if (limit) *limit = a->inode.limit;
4390: return(0);
4391: }