Actual source code: inode.c
petsc-3.11.4 2019-09-28
2: /*
3: This file provides high performance routines for the Inode format (compressed sparse row)
4: by taking advantage of rows with identical nonzero structure (I-nodes).
5: */
6: #include <../src/mat/impls/aij/seq/aij.h>
8: static PetscErrorCode MatCreateColInode_Private(Mat A,PetscInt *size,PetscInt **ns)
9: {
10: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
12: PetscInt i,count,m,n,min_mn,*ns_row,*ns_col;
15: n = A->cmap->n;
16: m = A->rmap->n;
17: ns_row = a->inode.size;
19: min_mn = (m < n) ? m : n;
20: if (!ns) {
21: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
22: for (; count+1 < n; count++,i++) ;
23: if (count < n) {
24: i++;
25: }
26: *size = i;
27: return(0);
28: }
29: PetscMalloc1(n+1,&ns_col);
31: /* Use the same row structure wherever feasible. */
32: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
33: ns_col[i] = ns_row[i];
34: }
36: /* if m < n; pad up the remainder with inode_limit */
37: for (; count+1 < n; count++,i++) {
38: ns_col[i] = 1;
39: }
40: /* The last node is the odd ball. padd it up with the remaining rows; */
41: if (count < n) {
42: ns_col[i] = n - count;
43: i++;
44: } else if (count > n) {
45: /* Adjust for the over estimation */
46: ns_col[i-1] += n - count;
47: }
48: *size = i;
49: *ns = ns_col;
50: return(0);
51: }
54: /*
55: This builds symmetric version of nonzero structure,
56: */
57: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
58: {
59: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
61: PetscInt *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
62: PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
63: const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;
66: nslim_row = a->inode.node_count;
67: m = A->rmap->n;
68: n = A->cmap->n;
69: if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
71: /* Use the row_inode as column_inode */
72: nslim_col = nslim_row;
73: ns_col = ns_row;
75: /* allocate space for reformated inode structure */
76: PetscMalloc1(nslim_col+1,&tns);
77: PetscMalloc1(n+1,&tvc);
78: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
80: for (i1=0,col=0; i1<nslim_col; ++i1) {
81: nsz = ns_col[i1];
82: for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
83: }
84: /* allocate space for row pointers */
85: PetscMalloc1(nslim_row+1,&ia);
86: *iia = ia;
87: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
88: PetscMalloc1(nslim_row+1,&work);
90: /* determine the number of columns in each row */
91: ia[0] = oshift;
92: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
94: j = aj + ai[row] + ishift;
95: jmax = aj + ai[row+1] + ishift;
96: if (j==jmax) continue; /* empty row */
97: col = *j++ + ishift;
98: i2 = tvc[col];
99: while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
100: ia[i1+1]++;
101: ia[i2+1]++;
102: i2++; /* Start col of next node */
103: while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
104: i2 = tvc[col];
105: }
106: if (i2 == i1) ia[i2+1]++; /* now the diagonal element */
107: }
109: /* shift ia[i] to point to next row */
110: for (i1=1; i1<nslim_row+1; i1++) {
111: row = ia[i1-1];
112: ia[i1] += row;
113: work[i1-1] = row - oshift;
114: }
116: /* allocate space for column pointers */
117: nz = ia[nslim_row] + (!ishift);
118: PetscMalloc1(nz,&ja);
119: *jja = ja;
121: /* loop over lower triangular part putting into ja */
122: for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
123: j = aj + ai[row] + ishift;
124: jmax = aj + ai[row+1] + ishift;
125: if (j==jmax) continue; /* empty row */
126: col = *j++ + ishift;
127: i2 = tvc[col];
128: while (i2<i1 && j<jmax) {
129: ja[work[i2]++] = i1 + oshift;
130: ja[work[i1]++] = i2 + oshift;
131: ++i2;
132: while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
133: i2 = tvc[col];
134: }
135: if (i2 == i1) ja[work[i1]++] = i2 + oshift;
137: }
138: PetscFree(work);
139: PetscFree(tns);
140: PetscFree(tvc);
141: return(0);
142: }
144: /*
145: This builds nonsymmetric version of nonzero structure,
146: */
147: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
148: {
149: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
151: PetscInt *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
152: PetscInt *tns,*tvc,nsz,i1,i2;
153: const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;
156: nslim_row = a->inode.node_count;
157: n = A->cmap->n;
159: /* Create The column_inode for this matrix */
160: MatCreateColInode_Private(A,&nslim_col,&ns_col);
162: /* allocate space for reformated column_inode structure */
163: PetscMalloc1(nslim_col +1,&tns);
164: PetscMalloc1(n + 1,&tvc);
165: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
167: for (i1=0,col=0; i1<nslim_col; ++i1) {
168: nsz = ns_col[i1];
169: for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
170: }
171: /* allocate space for row pointers */
172: PetscMalloc1(nslim_row+1,&ia);
173: *iia = ia;
174: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
175: PetscMalloc1(nslim_row+1,&work);
177: /* determine the number of columns in each row */
178: ia[0] = oshift;
179: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
180: j = aj + ai[row] + ishift;
181: nz = ai[row+1] - ai[row];
182: if (!nz) continue; /* empty row */
183: col = *j++ + ishift;
184: i2 = tvc[col];
185: while (nz-- > 0) { /* off-diagonal elemets */
186: ia[i1+1]++;
187: i2++; /* Start col of next node */
188: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
189: if (nz > 0) i2 = tvc[col];
190: }
191: }
193: /* shift ia[i] to point to next row */
194: for (i1=1; i1<nslim_row+1; i1++) {
195: row = ia[i1-1];
196: ia[i1] += row;
197: work[i1-1] = row - oshift;
198: }
200: /* allocate space for column pointers */
201: nz = ia[nslim_row] + (!ishift);
202: PetscMalloc1(nz,&ja);
203: *jja = ja;
205: /* loop over matrix putting into ja */
206: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
207: j = aj + ai[row] + ishift;
208: nz = ai[row+1] - ai[row];
209: if (!nz) continue; /* empty row */
210: col = *j++ + ishift;
211: i2 = tvc[col];
212: while (nz-- > 0) {
213: ja[work[i1]++] = i2 + oshift;
214: ++i2;
215: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
216: if (nz > 0) i2 = tvc[col];
217: }
218: }
219: PetscFree(ns_col);
220: PetscFree(work);
221: PetscFree(tns);
222: PetscFree(tvc);
223: return(0);
224: }
226: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
227: {
228: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
232: *n = a->inode.node_count;
233: if (!ia) return(0);
234: if (!blockcompressed) {
235: MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
236: } else if (symmetric) {
237: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
238: } else {
239: MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
240: }
241: return(0);
242: }
244: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
245: {
249: if (!ia) return(0);
251: if (!blockcompressed) {
252: MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
253: } else {
254: PetscFree(*ia);
255: PetscFree(*ja);
256: }
257: return(0);
258: }
260: /* ----------------------------------------------------------- */
262: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
263: {
264: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
266: PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
267: PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
270: nslim_row = a->inode.node_count;
271: n = A->cmap->n;
273: /* Create The column_inode for this matrix */
274: MatCreateColInode_Private(A,&nslim_col,&ns_col);
276: /* allocate space for reformated column_inode structure */
277: PetscMalloc1(nslim_col + 1,&tns);
278: PetscMalloc1(n + 1,&tvc);
279: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
281: for (i1=0,col=0; i1<nslim_col; ++i1) {
282: nsz = ns_col[i1];
283: for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
284: }
285: /* allocate space for column pointers */
286: PetscMalloc1(nslim_col+1,&ia);
287: *iia = ia;
288: PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));
289: PetscMalloc1(nslim_col+1,&work);
291: /* determine the number of columns in each row */
292: ia[0] = oshift;
293: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
294: j = aj + ai[row] + ishift;
295: col = *j++ + ishift;
296: i2 = tvc[col];
297: nz = ai[row+1] - ai[row];
298: while (nz-- > 0) { /* off-diagonal elemets */
299: /* ia[i1+1]++; */
300: ia[i2+1]++;
301: i2++;
302: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
303: if (nz > 0) i2 = tvc[col];
304: }
305: }
307: /* shift ia[i] to point to next col */
308: for (i1=1; i1<nslim_col+1; i1++) {
309: col = ia[i1-1];
310: ia[i1] += col;
311: work[i1-1] = col - oshift;
312: }
314: /* allocate space for column pointers */
315: nz = ia[nslim_col] + (!ishift);
316: PetscMalloc1(nz,&ja);
317: *jja = ja;
319: /* loop over matrix putting into ja */
320: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
321: j = aj + ai[row] + ishift;
322: col = *j++ + ishift;
323: i2 = tvc[col];
324: nz = ai[row+1] - ai[row];
325: while (nz-- > 0) {
326: /* ja[work[i1]++] = i2 + oshift; */
327: ja[work[i2]++] = i1 + oshift;
328: i2++;
329: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
330: if (nz > 0) i2 = tvc[col];
331: }
332: }
333: PetscFree(ns_col);
334: PetscFree(work);
335: PetscFree(tns);
336: PetscFree(tvc);
337: return(0);
338: }
340: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
341: {
345: MatCreateColInode_Private(A,n,NULL);
346: if (!ia) return(0);
348: if (!blockcompressed) {
349: MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
350: } else if (symmetric) {
351: /* Since the indices are symmetric it does'nt matter */
352: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
353: } else {
354: MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
355: }
356: return(0);
357: }
359: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
360: {
364: if (!ia) return(0);
365: if (!blockcompressed) {
366: MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
367: } else {
368: PetscFree(*ia);
369: PetscFree(*ja);
370: }
371: return(0);
372: }
374: /* ----------------------------------------------------------- */
376: static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
377: {
378: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
379: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
380: PetscScalar *y;
381: const PetscScalar *x;
382: const MatScalar *v1,*v2,*v3,*v4,*v5;
383: PetscErrorCode ierr;
384: PetscInt i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
385: const PetscInt *idx,*ns,*ii;
387: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
388: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
389: #endif
392: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
393: node_max = a->inode.node_count;
394: ns = a->inode.size; /* Node Size array */
395: VecGetArrayRead(xx,&x);
396: VecGetArray(yy,&y);
397: idx = a->j;
398: v1 = a->a;
399: ii = a->i;
401: for (i = 0,row = 0; i< node_max; ++i) {
402: nsz = ns[i];
403: n = ii[1] - ii[0];
404: nonzerorow += (n>0)*nsz;
405: ii += nsz;
406: PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */
407: PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */
408: sz = n; /* No of non zeros in this row */
409: /* Switch on the size of Node */
410: switch (nsz) { /* Each loop in 'case' is unrolled */
411: case 1:
412: sum1 = 0.;
414: for (n = 0; n< sz-1; n+=2) {
415: i1 = idx[0]; /* The instructions are ordered to */
416: i2 = idx[1]; /* make the compiler's job easy */
417: idx += 2;
418: tmp0 = x[i1];
419: tmp1 = x[i2];
420: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
421: }
423: if (n == sz-1) { /* Take care of the last nonzero */
424: tmp0 = x[*idx++];
425: sum1 += *v1++ *tmp0;
426: }
427: y[row++]=sum1;
428: break;
429: case 2:
430: sum1 = 0.;
431: sum2 = 0.;
432: v2 = v1 + n;
434: for (n = 0; n< sz-1; n+=2) {
435: i1 = idx[0];
436: i2 = idx[1];
437: idx += 2;
438: tmp0 = x[i1];
439: tmp1 = x[i2];
440: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
441: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
442: }
443: if (n == sz-1) {
444: tmp0 = x[*idx++];
445: sum1 += *v1++ * tmp0;
446: sum2 += *v2++ * tmp0;
447: }
448: y[row++]=sum1;
449: y[row++]=sum2;
450: v1 =v2; /* Since the next block to be processed starts there*/
451: idx +=sz;
452: break;
453: case 3:
454: sum1 = 0.;
455: sum2 = 0.;
456: sum3 = 0.;
457: v2 = v1 + n;
458: v3 = v2 + n;
460: for (n = 0; n< sz-1; n+=2) {
461: i1 = idx[0];
462: i2 = idx[1];
463: idx += 2;
464: tmp0 = x[i1];
465: tmp1 = x[i2];
466: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
467: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
468: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
469: }
470: if (n == sz-1) {
471: tmp0 = x[*idx++];
472: sum1 += *v1++ * tmp0;
473: sum2 += *v2++ * tmp0;
474: sum3 += *v3++ * tmp0;
475: }
476: y[row++]=sum1;
477: y[row++]=sum2;
478: y[row++]=sum3;
479: v1 =v3; /* Since the next block to be processed starts there*/
480: idx +=2*sz;
481: break;
482: case 4:
483: sum1 = 0.;
484: sum2 = 0.;
485: sum3 = 0.;
486: sum4 = 0.;
487: v2 = v1 + n;
488: v3 = v2 + n;
489: v4 = v3 + n;
491: for (n = 0; n< sz-1; n+=2) {
492: i1 = idx[0];
493: i2 = idx[1];
494: idx += 2;
495: tmp0 = x[i1];
496: tmp1 = x[i2];
497: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
498: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
499: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
500: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
501: }
502: if (n == sz-1) {
503: tmp0 = x[*idx++];
504: sum1 += *v1++ * tmp0;
505: sum2 += *v2++ * tmp0;
506: sum3 += *v3++ * tmp0;
507: sum4 += *v4++ * tmp0;
508: }
509: y[row++]=sum1;
510: y[row++]=sum2;
511: y[row++]=sum3;
512: y[row++]=sum4;
513: v1 =v4; /* Since the next block to be processed starts there*/
514: idx +=3*sz;
515: break;
516: case 5:
517: sum1 = 0.;
518: sum2 = 0.;
519: sum3 = 0.;
520: sum4 = 0.;
521: sum5 = 0.;
522: v2 = v1 + n;
523: v3 = v2 + n;
524: v4 = v3 + n;
525: v5 = v4 + n;
527: for (n = 0; n<sz-1; n+=2) {
528: i1 = idx[0];
529: i2 = idx[1];
530: idx += 2;
531: tmp0 = x[i1];
532: tmp1 = x[i2];
533: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
534: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
535: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
536: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
537: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
538: }
539: if (n == sz-1) {
540: tmp0 = x[*idx++];
541: sum1 += *v1++ * tmp0;
542: sum2 += *v2++ * tmp0;
543: sum3 += *v3++ * tmp0;
544: sum4 += *v4++ * tmp0;
545: sum5 += *v5++ * tmp0;
546: }
547: y[row++]=sum1;
548: y[row++]=sum2;
549: y[row++]=sum3;
550: y[row++]=sum4;
551: y[row++]=sum5;
552: v1 =v5; /* Since the next block to be processed starts there */
553: idx +=4*sz;
554: break;
555: default:
556: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
557: }
558: }
559: VecRestoreArrayRead(xx,&x);
560: VecRestoreArray(yy,&y);
561: PetscLogFlops(2.0*a->nz - nonzerorow);
562: return(0);
563: }
564: /* ----------------------------------------------------------- */
565: /* Almost same code as the MatMult_SeqAIJ_Inode() */
566: static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
567: {
568: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
569: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
570: const MatScalar *v1,*v2,*v3,*v4,*v5;
571: const PetscScalar *x;
572: PetscScalar *y,*z,*zt;
573: PetscErrorCode ierr;
574: PetscInt i1,i2,n,i,row,node_max,nsz,sz;
575: const PetscInt *idx,*ns,*ii;
578: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
579: node_max = a->inode.node_count;
580: ns = a->inode.size; /* Node Size array */
582: VecGetArrayRead(xx,&x);
583: VecGetArrayPair(zz,yy,&z,&y);
584: zt = z;
586: idx = a->j;
587: v1 = a->a;
588: ii = a->i;
590: for (i = 0,row = 0; i< node_max; ++i) {
591: nsz = ns[i];
592: n = ii[1] - ii[0];
593: ii += nsz;
594: sz = n; /* No of non zeros in this row */
595: /* Switch on the size of Node */
596: switch (nsz) { /* Each loop in 'case' is unrolled */
597: case 1:
598: sum1 = *zt++;
600: for (n = 0; n< sz-1; n+=2) {
601: i1 = idx[0]; /* The instructions are ordered to */
602: i2 = idx[1]; /* make the compiler's job easy */
603: idx += 2;
604: tmp0 = x[i1];
605: tmp1 = x[i2];
606: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
607: }
609: if (n == sz-1) { /* Take care of the last nonzero */
610: tmp0 = x[*idx++];
611: sum1 += *v1++ * tmp0;
612: }
613: y[row++]=sum1;
614: break;
615: case 2:
616: sum1 = *zt++;
617: sum2 = *zt++;
618: v2 = v1 + n;
620: for (n = 0; n< sz-1; n+=2) {
621: i1 = idx[0];
622: i2 = idx[1];
623: idx += 2;
624: tmp0 = x[i1];
625: tmp1 = x[i2];
626: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
627: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
628: }
629: if (n == sz-1) {
630: tmp0 = x[*idx++];
631: sum1 += *v1++ * tmp0;
632: sum2 += *v2++ * tmp0;
633: }
634: y[row++]=sum1;
635: y[row++]=sum2;
636: v1 =v2; /* Since the next block to be processed starts there*/
637: idx +=sz;
638: break;
639: case 3:
640: sum1 = *zt++;
641: sum2 = *zt++;
642: sum3 = *zt++;
643: v2 = v1 + n;
644: v3 = v2 + n;
646: for (n = 0; n< sz-1; n+=2) {
647: i1 = idx[0];
648: i2 = idx[1];
649: idx += 2;
650: tmp0 = x[i1];
651: tmp1 = x[i2];
652: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
653: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
654: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
655: }
656: if (n == sz-1) {
657: tmp0 = x[*idx++];
658: sum1 += *v1++ * tmp0;
659: sum2 += *v2++ * tmp0;
660: sum3 += *v3++ * tmp0;
661: }
662: y[row++]=sum1;
663: y[row++]=sum2;
664: y[row++]=sum3;
665: v1 =v3; /* Since the next block to be processed starts there*/
666: idx +=2*sz;
667: break;
668: case 4:
669: sum1 = *zt++;
670: sum2 = *zt++;
671: sum3 = *zt++;
672: sum4 = *zt++;
673: v2 = v1 + n;
674: v3 = v2 + n;
675: v4 = v3 + n;
677: for (n = 0; n< sz-1; n+=2) {
678: i1 = idx[0];
679: i2 = idx[1];
680: idx += 2;
681: tmp0 = x[i1];
682: tmp1 = x[i2];
683: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
684: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
685: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
686: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
687: }
688: if (n == sz-1) {
689: tmp0 = x[*idx++];
690: sum1 += *v1++ * tmp0;
691: sum2 += *v2++ * tmp0;
692: sum3 += *v3++ * tmp0;
693: sum4 += *v4++ * tmp0;
694: }
695: y[row++]=sum1;
696: y[row++]=sum2;
697: y[row++]=sum3;
698: y[row++]=sum4;
699: v1 =v4; /* Since the next block to be processed starts there*/
700: idx +=3*sz;
701: break;
702: case 5:
703: sum1 = *zt++;
704: sum2 = *zt++;
705: sum3 = *zt++;
706: sum4 = *zt++;
707: sum5 = *zt++;
708: v2 = v1 + n;
709: v3 = v2 + n;
710: v4 = v3 + n;
711: v5 = v4 + n;
713: for (n = 0; n<sz-1; n+=2) {
714: i1 = idx[0];
715: i2 = idx[1];
716: idx += 2;
717: tmp0 = x[i1];
718: tmp1 = x[i2];
719: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
720: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
721: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
722: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
723: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
724: }
725: if (n == sz-1) {
726: tmp0 = x[*idx++];
727: sum1 += *v1++ * tmp0;
728: sum2 += *v2++ * tmp0;
729: sum3 += *v3++ * tmp0;
730: sum4 += *v4++ * tmp0;
731: sum5 += *v5++ * tmp0;
732: }
733: y[row++]=sum1;
734: y[row++]=sum2;
735: y[row++]=sum3;
736: y[row++]=sum4;
737: y[row++]=sum5;
738: v1 =v5; /* Since the next block to be processed starts there */
739: idx +=4*sz;
740: break;
741: default:
742: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
743: }
744: }
745: VecRestoreArrayRead(xx,&x);
746: VecRestoreArrayPair(zz,yy,&z,&y);
747: PetscLogFlops(2.0*a->nz);
748: return(0);
749: }
751: /* ----------------------------------------------------------- */
752: PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
753: {
754: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
755: IS iscol = a->col,isrow = a->row;
756: PetscErrorCode ierr;
757: const PetscInt *r,*c,*rout,*cout;
758: PetscInt i,j,n = A->rmap->n,nz;
759: PetscInt node_max,*ns,row,nsz,aii,i0,i1;
760: const PetscInt *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
761: PetscScalar *x,*tmp,*tmps,tmp0,tmp1;
762: PetscScalar sum1,sum2,sum3,sum4,sum5;
763: const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
764: const PetscScalar *b;
767: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
768: node_max = a->inode.node_count;
769: ns = a->inode.size; /* Node Size array */
771: VecGetArrayRead(bb,&b);
772: VecGetArray(xx,&x);
773: tmp = a->solve_work;
775: ISGetIndices(isrow,&rout); r = rout;
776: ISGetIndices(iscol,&cout); c = cout + (n-1);
778: /* forward solve the lower triangular */
779: tmps = tmp;
780: aa = a_a;
781: aj = a_j;
782: ad = a->diag;
784: for (i = 0,row = 0; i< node_max; ++i) {
785: nsz = ns[i];
786: aii = ai[row];
787: v1 = aa + aii;
788: vi = aj + aii;
789: nz = ad[row]- aii;
790: if (i < node_max-1) {
791: /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
792: * but our indexing to determine it's size could. */
793: PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
794: /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
795: PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
796: /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
797: }
799: switch (nsz) { /* Each loop in 'case' is unrolled */
800: case 1:
801: sum1 = b[*r++];
802: for (j=0; j<nz-1; j+=2) {
803: i0 = vi[0];
804: i1 = vi[1];
805: vi +=2;
806: tmp0 = tmps[i0];
807: tmp1 = tmps[i1];
808: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
809: }
810: if (j == nz-1) {
811: tmp0 = tmps[*vi++];
812: sum1 -= *v1++ *tmp0;
813: }
814: tmp[row++]=sum1;
815: break;
816: case 2:
817: sum1 = b[*r++];
818: sum2 = b[*r++];
819: v2 = aa + ai[row+1];
821: for (j=0; j<nz-1; j+=2) {
822: i0 = vi[0];
823: i1 = vi[1];
824: vi +=2;
825: tmp0 = tmps[i0];
826: tmp1 = tmps[i1];
827: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
828: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
829: }
830: if (j == nz-1) {
831: tmp0 = tmps[*vi++];
832: sum1 -= *v1++ *tmp0;
833: sum2 -= *v2++ *tmp0;
834: }
835: sum2 -= *v2++ *sum1;
836: tmp[row++]=sum1;
837: tmp[row++]=sum2;
838: break;
839: case 3:
840: sum1 = b[*r++];
841: sum2 = b[*r++];
842: sum3 = b[*r++];
843: v2 = aa + ai[row+1];
844: v3 = aa + ai[row+2];
846: for (j=0; j<nz-1; j+=2) {
847: i0 = vi[0];
848: i1 = vi[1];
849: vi +=2;
850: tmp0 = tmps[i0];
851: tmp1 = tmps[i1];
852: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
853: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
854: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
855: }
856: if (j == nz-1) {
857: tmp0 = tmps[*vi++];
858: sum1 -= *v1++ *tmp0;
859: sum2 -= *v2++ *tmp0;
860: sum3 -= *v3++ *tmp0;
861: }
862: sum2 -= *v2++ * sum1;
863: sum3 -= *v3++ * sum1;
864: sum3 -= *v3++ * sum2;
866: tmp[row++]=sum1;
867: tmp[row++]=sum2;
868: tmp[row++]=sum3;
869: break;
871: case 4:
872: sum1 = b[*r++];
873: sum2 = b[*r++];
874: sum3 = b[*r++];
875: sum4 = b[*r++];
876: v2 = aa + ai[row+1];
877: v3 = aa + ai[row+2];
878: v4 = aa + ai[row+3];
880: for (j=0; j<nz-1; j+=2) {
881: i0 = vi[0];
882: i1 = vi[1];
883: vi +=2;
884: tmp0 = tmps[i0];
885: tmp1 = tmps[i1];
886: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
887: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
888: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
889: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
890: }
891: if (j == nz-1) {
892: tmp0 = tmps[*vi++];
893: sum1 -= *v1++ *tmp0;
894: sum2 -= *v2++ *tmp0;
895: sum3 -= *v3++ *tmp0;
896: sum4 -= *v4++ *tmp0;
897: }
898: sum2 -= *v2++ * sum1;
899: sum3 -= *v3++ * sum1;
900: sum4 -= *v4++ * sum1;
901: sum3 -= *v3++ * sum2;
902: sum4 -= *v4++ * sum2;
903: sum4 -= *v4++ * sum3;
905: tmp[row++]=sum1;
906: tmp[row++]=sum2;
907: tmp[row++]=sum3;
908: tmp[row++]=sum4;
909: break;
910: case 5:
911: sum1 = b[*r++];
912: sum2 = b[*r++];
913: sum3 = b[*r++];
914: sum4 = b[*r++];
915: sum5 = b[*r++];
916: v2 = aa + ai[row+1];
917: v3 = aa + ai[row+2];
918: v4 = aa + ai[row+3];
919: v5 = aa + ai[row+4];
921: for (j=0; j<nz-1; j+=2) {
922: i0 = vi[0];
923: i1 = vi[1];
924: vi +=2;
925: tmp0 = tmps[i0];
926: tmp1 = tmps[i1];
927: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
928: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
929: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
930: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
931: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
932: }
933: if (j == nz-1) {
934: tmp0 = tmps[*vi++];
935: sum1 -= *v1++ *tmp0;
936: sum2 -= *v2++ *tmp0;
937: sum3 -= *v3++ *tmp0;
938: sum4 -= *v4++ *tmp0;
939: sum5 -= *v5++ *tmp0;
940: }
942: sum2 -= *v2++ * sum1;
943: sum3 -= *v3++ * sum1;
944: sum4 -= *v4++ * sum1;
945: sum5 -= *v5++ * sum1;
946: sum3 -= *v3++ * sum2;
947: sum4 -= *v4++ * sum2;
948: sum5 -= *v5++ * sum2;
949: sum4 -= *v4++ * sum3;
950: sum5 -= *v5++ * sum3;
951: sum5 -= *v5++ * sum4;
953: tmp[row++]=sum1;
954: tmp[row++]=sum2;
955: tmp[row++]=sum3;
956: tmp[row++]=sum4;
957: tmp[row++]=sum5;
958: break;
959: default:
960: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
961: }
962: }
963: /* backward solve the upper triangular */
964: for (i=node_max -1,row = n-1; i>=0; i--) {
965: nsz = ns[i];
966: aii = ai[row+1] -1;
967: v1 = aa + aii;
968: vi = aj + aii;
969: nz = aii- ad[row];
970: switch (nsz) { /* Each loop in 'case' is unrolled */
971: case 1:
972: sum1 = tmp[row];
974: for (j=nz; j>1; j-=2) {
975: vi -=2;
976: i0 = vi[2];
977: i1 = vi[1];
978: tmp0 = tmps[i0];
979: tmp1 = tmps[i1];
980: v1 -= 2;
981: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
982: }
983: if (j==1) {
984: tmp0 = tmps[*vi--];
985: sum1 -= *v1-- * tmp0;
986: }
987: x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
988: break;
989: case 2:
990: sum1 = tmp[row];
991: sum2 = tmp[row -1];
992: v2 = aa + ai[row]-1;
993: for (j=nz; j>1; j-=2) {
994: vi -=2;
995: i0 = vi[2];
996: i1 = vi[1];
997: tmp0 = tmps[i0];
998: tmp1 = tmps[i1];
999: v1 -= 2;
1000: v2 -= 2;
1001: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1002: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1003: }
1004: if (j==1) {
1005: tmp0 = tmps[*vi--];
1006: sum1 -= *v1-- * tmp0;
1007: sum2 -= *v2-- * tmp0;
1008: }
1010: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1011: sum2 -= *v2-- * tmp0;
1012: x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1013: break;
1014: case 3:
1015: sum1 = tmp[row];
1016: sum2 = tmp[row -1];
1017: sum3 = tmp[row -2];
1018: v2 = aa + ai[row]-1;
1019: v3 = aa + ai[row -1]-1;
1020: for (j=nz; j>1; j-=2) {
1021: vi -=2;
1022: i0 = vi[2];
1023: i1 = vi[1];
1024: tmp0 = tmps[i0];
1025: tmp1 = tmps[i1];
1026: v1 -= 2;
1027: v2 -= 2;
1028: v3 -= 2;
1029: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1030: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1031: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1032: }
1033: if (j==1) {
1034: tmp0 = tmps[*vi--];
1035: sum1 -= *v1-- * tmp0;
1036: sum2 -= *v2-- * tmp0;
1037: sum3 -= *v3-- * tmp0;
1038: }
1039: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1040: sum2 -= *v2-- * tmp0;
1041: sum3 -= *v3-- * tmp0;
1042: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1043: sum3 -= *v3-- * tmp0;
1044: x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1046: break;
1047: case 4:
1048: sum1 = tmp[row];
1049: sum2 = tmp[row -1];
1050: sum3 = tmp[row -2];
1051: sum4 = tmp[row -3];
1052: v2 = aa + ai[row]-1;
1053: v3 = aa + ai[row -1]-1;
1054: v4 = aa + ai[row -2]-1;
1056: for (j=nz; j>1; j-=2) {
1057: vi -=2;
1058: i0 = vi[2];
1059: i1 = vi[1];
1060: tmp0 = tmps[i0];
1061: tmp1 = tmps[i1];
1062: v1 -= 2;
1063: v2 -= 2;
1064: v3 -= 2;
1065: v4 -= 2;
1066: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1067: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1068: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1069: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1070: }
1071: if (j==1) {
1072: tmp0 = tmps[*vi--];
1073: sum1 -= *v1-- * tmp0;
1074: sum2 -= *v2-- * tmp0;
1075: sum3 -= *v3-- * tmp0;
1076: sum4 -= *v4-- * tmp0;
1077: }
1079: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1080: sum2 -= *v2-- * tmp0;
1081: sum3 -= *v3-- * tmp0;
1082: sum4 -= *v4-- * tmp0;
1083: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1084: sum3 -= *v3-- * tmp0;
1085: sum4 -= *v4-- * tmp0;
1086: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1087: sum4 -= *v4-- * tmp0;
1088: x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1089: break;
1090: case 5:
1091: sum1 = tmp[row];
1092: sum2 = tmp[row -1];
1093: sum3 = tmp[row -2];
1094: sum4 = tmp[row -3];
1095: sum5 = tmp[row -4];
1096: v2 = aa + ai[row]-1;
1097: v3 = aa + ai[row -1]-1;
1098: v4 = aa + ai[row -2]-1;
1099: v5 = aa + ai[row -3]-1;
1100: for (j=nz; j>1; j-=2) {
1101: vi -= 2;
1102: i0 = vi[2];
1103: i1 = vi[1];
1104: tmp0 = tmps[i0];
1105: tmp1 = tmps[i1];
1106: v1 -= 2;
1107: v2 -= 2;
1108: v3 -= 2;
1109: v4 -= 2;
1110: v5 -= 2;
1111: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1112: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1113: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1114: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1115: sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1116: }
1117: if (j==1) {
1118: tmp0 = tmps[*vi--];
1119: sum1 -= *v1-- * tmp0;
1120: sum2 -= *v2-- * tmp0;
1121: sum3 -= *v3-- * tmp0;
1122: sum4 -= *v4-- * tmp0;
1123: sum5 -= *v5-- * tmp0;
1124: }
1126: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1127: sum2 -= *v2-- * tmp0;
1128: sum3 -= *v3-- * tmp0;
1129: sum4 -= *v4-- * tmp0;
1130: sum5 -= *v5-- * tmp0;
1131: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1132: sum3 -= *v3-- * tmp0;
1133: sum4 -= *v4-- * tmp0;
1134: sum5 -= *v5-- * tmp0;
1135: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1136: sum4 -= *v4-- * tmp0;
1137: sum5 -= *v5-- * tmp0;
1138: tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1139: sum5 -= *v5-- * tmp0;
1140: x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1141: break;
1142: default:
1143: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1144: }
1145: }
1146: ISRestoreIndices(isrow,&rout);
1147: ISRestoreIndices(iscol,&cout);
1148: VecRestoreArrayRead(bb,&b);
1149: VecRestoreArray(xx,&x);
1150: PetscLogFlops(2.0*a->nz - A->cmap->n);
1151: return(0);
1152: }
1154: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1155: {
1156: Mat C =B;
1157: Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1158: IS isrow = b->row,isicol = b->icol;
1159: PetscErrorCode ierr;
1160: const PetscInt *r,*ic,*ics;
1161: const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1162: PetscInt i,j,k,nz,nzL,row,*pj;
1163: const PetscInt *ajtmp,*bjtmp;
1164: MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1165: const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1166: FactorShiftCtx sctx;
1167: const PetscInt *ddiag;
1168: PetscReal rs;
1169: MatScalar d;
1170: PetscInt inod,nodesz,node_max,col;
1171: const PetscInt *ns;
1172: PetscInt *tmp_vec1,*tmp_vec2,*nsmap;
1175: /* MatPivotSetUp(): initialize shift context sctx */
1176: PetscMemzero(&sctx,sizeof(FactorShiftCtx));
1178: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1179: ddiag = a->diag;
1180: sctx.shift_top = info->zeropivot;
1181: for (i=0; i<n; i++) {
1182: /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1183: d = (aa)[ddiag[i]];
1184: rs = -PetscAbsScalar(d) - PetscRealPart(d);
1185: v = aa+ai[i];
1186: nz = ai[i+1] - ai[i];
1187: for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1188: if (rs>sctx.shift_top) sctx.shift_top = rs;
1189: }
1190: sctx.shift_top *= 1.1;
1191: sctx.nshift_max = 5;
1192: sctx.shift_lo = 0.;
1193: sctx.shift_hi = 1.;
1194: }
1196: ISGetIndices(isrow,&r);
1197: ISGetIndices(isicol,&ic);
1199: PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);
1200: ics = ic;
1202: node_max = a->inode.node_count;
1203: ns = a->inode.size;
1204: if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1206: /* If max inode size > 4, split it into two inodes.*/
1207: /* also map the inode sizes according to the ordering */
1208: PetscMalloc1(n+1,&tmp_vec1);
1209: for (i=0,j=0; i<node_max; ++i,++j) {
1210: if (ns[i] > 4) {
1211: tmp_vec1[j] = 4;
1212: ++j;
1213: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1214: } else {
1215: tmp_vec1[j] = ns[i];
1216: }
1217: }
1218: /* Use the correct node_max */
1219: node_max = j;
1221: /* Now reorder the inode info based on mat re-ordering info */
1222: /* First create a row -> inode_size_array_index map */
1223: PetscMalloc1(n+1,&nsmap);
1224: PetscMalloc1(node_max+1,&tmp_vec2);
1225: for (i=0,row=0; i<node_max; i++) {
1226: nodesz = tmp_vec1[i];
1227: for (j=0; j<nodesz; j++,row++) {
1228: nsmap[row] = i;
1229: }
1230: }
1231: /* Using nsmap, create a reordered ns structure */
1232: for (i=0,j=0; i< node_max; i++) {
1233: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1234: tmp_vec2[i] = nodesz;
1235: j += nodesz;
1236: }
1237: PetscFree(nsmap);
1238: PetscFree(tmp_vec1);
1240: /* Now use the correct ns */
1241: ns = tmp_vec2;
1243: do {
1244: sctx.newshift = PETSC_FALSE;
1245: /* Now loop over each block-row, and do the factorization */
1246: for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1247: nodesz = ns[inod];
1249: switch (nodesz) {
1250: case 1:
1251: /*----------*/
1252: /* zero rtmp1 */
1253: /* L part */
1254: nz = bi[i+1] - bi[i];
1255: bjtmp = bj + bi[i];
1256: for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1258: /* U part */
1259: nz = bdiag[i]-bdiag[i+1];
1260: bjtmp = bj + bdiag[i+1]+1;
1261: for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1263: /* load in initial (unfactored row) */
1264: nz = ai[r[i]+1] - ai[r[i]];
1265: ajtmp = aj + ai[r[i]];
1266: v = aa + ai[r[i]];
1267: for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1269: /* ZeropivotApply() */
1270: rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1272: /* elimination */
1273: bjtmp = bj + bi[i];
1274: row = *bjtmp++;
1275: nzL = bi[i+1] - bi[i];
1276: for (k=0; k < nzL; k++) {
1277: pc = rtmp1 + row;
1278: if (*pc != 0.0) {
1279: pv = b->a + bdiag[row];
1280: mul1 = *pc * (*pv);
1281: *pc = mul1;
1282: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1283: pv = b->a + bdiag[row+1]+1;
1284: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1285: for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1286: PetscLogFlops(1+2*nz);
1287: }
1288: row = *bjtmp++;
1289: }
1291: /* finished row so stick it into b->a */
1292: rs = 0.0;
1293: /* L part */
1294: pv = b->a + bi[i];
1295: pj = b->j + bi[i];
1296: nz = bi[i+1] - bi[i];
1297: for (j=0; j<nz; j++) {
1298: pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1299: }
1301: /* U part */
1302: pv = b->a + bdiag[i+1]+1;
1303: pj = b->j + bdiag[i+1]+1;
1304: nz = bdiag[i] - bdiag[i+1]-1;
1305: for (j=0; j<nz; j++) {
1306: pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1307: }
1309: /* Check zero pivot */
1310: sctx.rs = rs;
1311: sctx.pv = rtmp1[i];
1312: MatPivotCheck(B,A,info,&sctx,i);
1313: if (sctx.newshift) break;
1315: /* Mark diagonal and invert diagonal for simplier triangular solves */
1316: pv = b->a + bdiag[i];
1317: *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1318: break;
1320: case 2:
1321: /*----------*/
1322: /* zero rtmp1 and rtmp2 */
1323: /* L part */
1324: nz = bi[i+1] - bi[i];
1325: bjtmp = bj + bi[i];
1326: for (j=0; j<nz; j++) {
1327: col = bjtmp[j];
1328: rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1329: }
1331: /* U part */
1332: nz = bdiag[i]-bdiag[i+1];
1333: bjtmp = bj + bdiag[i+1]+1;
1334: for (j=0; j<nz; j++) {
1335: col = bjtmp[j];
1336: rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1337: }
1339: /* load in initial (unfactored row) */
1340: nz = ai[r[i]+1] - ai[r[i]];
1341: ajtmp = aj + ai[r[i]];
1342: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1343: for (j=0; j<nz; j++) {
1344: col = ics[ajtmp[j]];
1345: rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1346: }
1347: /* ZeropivotApply(): shift the diagonal of the matrix */
1348: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1350: /* elimination */
1351: bjtmp = bj + bi[i];
1352: row = *bjtmp++; /* pivot row */
1353: nzL = bi[i+1] - bi[i];
1354: for (k=0; k < nzL; k++) {
1355: pc1 = rtmp1 + row;
1356: pc2 = rtmp2 + row;
1357: if (*pc1 != 0.0 || *pc2 != 0.0) {
1358: pv = b->a + bdiag[row];
1359: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1360: *pc1 = mul1; *pc2 = mul2;
1362: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1363: pv = b->a + bdiag[row+1]+1;
1364: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1365: for (j=0; j<nz; j++) {
1366: col = pj[j];
1367: rtmp1[col] -= mul1 * pv[j];
1368: rtmp2[col] -= mul2 * pv[j];
1369: }
1370: PetscLogFlops(2+4*nz);
1371: }
1372: row = *bjtmp++;
1373: }
1375: /* finished row i; check zero pivot, then stick row i into b->a */
1376: rs = 0.0;
1377: /* L part */
1378: pc1 = b->a + bi[i];
1379: pj = b->j + bi[i];
1380: nz = bi[i+1] - bi[i];
1381: for (j=0; j<nz; j++) {
1382: col = pj[j];
1383: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1384: }
1385: /* U part */
1386: pc1 = b->a + bdiag[i+1]+1;
1387: pj = b->j + bdiag[i+1]+1;
1388: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1389: for (j=0; j<nz; j++) {
1390: col = pj[j];
1391: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1392: }
1394: sctx.rs = rs;
1395: sctx.pv = rtmp1[i];
1396: MatPivotCheck(B,A,info,&sctx,i);
1397: if (sctx.newshift) break;
1398: pc1 = b->a + bdiag[i]; /* Mark diagonal */
1399: *pc1 = 1.0/sctx.pv;
1401: /* Now take care of diagonal 2x2 block. */
1402: pc2 = rtmp2 + i;
1403: if (*pc2 != 0.0) {
1404: mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1405: *pc2 = mul1; /* insert L entry */
1406: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1407: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1408: for (j=0; j<nz; j++) {
1409: col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1410: }
1411: PetscLogFlops(1+2*nz);
1412: }
1414: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1415: rs = 0.0;
1416: /* L part */
1417: pc2 = b->a + bi[i+1];
1418: pj = b->j + bi[i+1];
1419: nz = bi[i+2] - bi[i+1];
1420: for (j=0; j<nz; j++) {
1421: col = pj[j];
1422: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1423: }
1424: /* U part */
1425: pc2 = b->a + bdiag[i+2]+1;
1426: pj = b->j + bdiag[i+2]+1;
1427: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1428: for (j=0; j<nz; j++) {
1429: col = pj[j];
1430: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1431: }
1433: sctx.rs = rs;
1434: sctx.pv = rtmp2[i+1];
1435: MatPivotCheck(B,A,info,&sctx,i+1);
1436: if (sctx.newshift) break;
1437: pc2 = b->a + bdiag[i+1];
1438: *pc2 = 1.0/sctx.pv;
1439: break;
1441: case 3:
1442: /*----------*/
1443: /* zero rtmp */
1444: /* L part */
1445: nz = bi[i+1] - bi[i];
1446: bjtmp = bj + bi[i];
1447: for (j=0; j<nz; j++) {
1448: col = bjtmp[j];
1449: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1450: }
1452: /* U part */
1453: nz = bdiag[i]-bdiag[i+1];
1454: bjtmp = bj + bdiag[i+1]+1;
1455: for (j=0; j<nz; j++) {
1456: col = bjtmp[j];
1457: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1458: }
1460: /* load in initial (unfactored row) */
1461: nz = ai[r[i]+1] - ai[r[i]];
1462: ajtmp = aj + ai[r[i]];
1463: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1464: for (j=0; j<nz; j++) {
1465: col = ics[ajtmp[j]];
1466: rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1467: }
1468: /* ZeropivotApply(): shift the diagonal of the matrix */
1469: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1471: /* elimination */
1472: bjtmp = bj + bi[i];
1473: row = *bjtmp++; /* pivot row */
1474: nzL = bi[i+1] - bi[i];
1475: for (k=0; k < nzL; k++) {
1476: pc1 = rtmp1 + row;
1477: pc2 = rtmp2 + row;
1478: pc3 = rtmp3 + row;
1479: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1480: pv = b->a + bdiag[row];
1481: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1482: *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1484: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1485: pv = b->a + bdiag[row+1]+1;
1486: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1487: for (j=0; j<nz; j++) {
1488: col = pj[j];
1489: rtmp1[col] -= mul1 * pv[j];
1490: rtmp2[col] -= mul2 * pv[j];
1491: rtmp3[col] -= mul3 * pv[j];
1492: }
1493: PetscLogFlops(3+6*nz);
1494: }
1495: row = *bjtmp++;
1496: }
1498: /* finished row i; check zero pivot, then stick row i into b->a */
1499: rs = 0.0;
1500: /* L part */
1501: pc1 = b->a + bi[i];
1502: pj = b->j + bi[i];
1503: nz = bi[i+1] - bi[i];
1504: for (j=0; j<nz; j++) {
1505: col = pj[j];
1506: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1507: }
1508: /* U part */
1509: pc1 = b->a + bdiag[i+1]+1;
1510: pj = b->j + bdiag[i+1]+1;
1511: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1512: for (j=0; j<nz; j++) {
1513: col = pj[j];
1514: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1515: }
1517: sctx.rs = rs;
1518: sctx.pv = rtmp1[i];
1519: MatPivotCheck(B,A,info,&sctx,i);
1520: if (sctx.newshift) break;
1521: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1522: *pc1 = 1.0/sctx.pv;
1524: /* Now take care of 1st column of diagonal 3x3 block. */
1525: pc2 = rtmp2 + i;
1526: pc3 = rtmp3 + i;
1527: if (*pc2 != 0.0 || *pc3 != 0.0) {
1528: mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1529: mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1530: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1531: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1532: for (j=0; j<nz; j++) {
1533: col = pj[j];
1534: rtmp2[col] -= mul2 * rtmp1[col];
1535: rtmp3[col] -= mul3 * rtmp1[col];
1536: }
1537: PetscLogFlops(2+4*nz);
1538: }
1540: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1541: rs = 0.0;
1542: /* L part */
1543: pc2 = b->a + bi[i+1];
1544: pj = b->j + bi[i+1];
1545: nz = bi[i+2] - bi[i+1];
1546: for (j=0; j<nz; j++) {
1547: col = pj[j];
1548: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1549: }
1550: /* U part */
1551: pc2 = b->a + bdiag[i+2]+1;
1552: pj = b->j + bdiag[i+2]+1;
1553: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1554: for (j=0; j<nz; j++) {
1555: col = pj[j];
1556: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1557: }
1559: sctx.rs = rs;
1560: sctx.pv = rtmp2[i+1];
1561: MatPivotCheck(B,A,info,&sctx,i+1);
1562: if (sctx.newshift) break;
1563: pc2 = b->a + bdiag[i+1];
1564: *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1566: /* Now take care of 2nd column of diagonal 3x3 block. */
1567: pc3 = rtmp3 + i+1;
1568: if (*pc3 != 0.0) {
1569: mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1570: pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */
1571: nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1572: for (j=0; j<nz; j++) {
1573: col = pj[j];
1574: rtmp3[col] -= mul3 * rtmp2[col];
1575: }
1576: PetscLogFlops(1+2*nz);
1577: }
1579: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1580: rs = 0.0;
1581: /* L part */
1582: pc3 = b->a + bi[i+2];
1583: pj = b->j + bi[i+2];
1584: nz = bi[i+3] - bi[i+2];
1585: for (j=0; j<nz; j++) {
1586: col = pj[j];
1587: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1588: }
1589: /* U part */
1590: pc3 = b->a + bdiag[i+3]+1;
1591: pj = b->j + bdiag[i+3]+1;
1592: nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1593: for (j=0; j<nz; j++) {
1594: col = pj[j];
1595: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1596: }
1598: sctx.rs = rs;
1599: sctx.pv = rtmp3[i+2];
1600: MatPivotCheck(B,A,info,&sctx,i+2);
1601: if (sctx.newshift) break;
1602: pc3 = b->a + bdiag[i+2];
1603: *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1604: break;
1605: case 4:
1606: /*----------*/
1607: /* zero rtmp */
1608: /* L part */
1609: nz = bi[i+1] - bi[i];
1610: bjtmp = bj + bi[i];
1611: for (j=0; j<nz; j++) {
1612: col = bjtmp[j];
1613: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1614: }
1616: /* U part */
1617: nz = bdiag[i]-bdiag[i+1];
1618: bjtmp = bj + bdiag[i+1]+1;
1619: for (j=0; j<nz; j++) {
1620: col = bjtmp[j];
1621: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1622: }
1624: /* load in initial (unfactored row) */
1625: nz = ai[r[i]+1] - ai[r[i]];
1626: ajtmp = aj + ai[r[i]];
1627: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1628: for (j=0; j<nz; j++) {
1629: col = ics[ajtmp[j]];
1630: rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1631: }
1632: /* ZeropivotApply(): shift the diagonal of the matrix */
1633: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1635: /* elimination */
1636: bjtmp = bj + bi[i];
1637: row = *bjtmp++; /* pivot row */
1638: nzL = bi[i+1] - bi[i];
1639: for (k=0; k < nzL; k++) {
1640: pc1 = rtmp1 + row;
1641: pc2 = rtmp2 + row;
1642: pc3 = rtmp3 + row;
1643: pc4 = rtmp4 + row;
1644: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1645: pv = b->a + bdiag[row];
1646: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1647: *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1649: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1650: pv = b->a + bdiag[row+1]+1;
1651: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1652: for (j=0; j<nz; j++) {
1653: col = pj[j];
1654: rtmp1[col] -= mul1 * pv[j];
1655: rtmp2[col] -= mul2 * pv[j];
1656: rtmp3[col] -= mul3 * pv[j];
1657: rtmp4[col] -= mul4 * pv[j];
1658: }
1659: PetscLogFlops(4+8*nz);
1660: }
1661: row = *bjtmp++;
1662: }
1664: /* finished row i; check zero pivot, then stick row i into b->a */
1665: rs = 0.0;
1666: /* L part */
1667: pc1 = b->a + bi[i];
1668: pj = b->j + bi[i];
1669: nz = bi[i+1] - bi[i];
1670: for (j=0; j<nz; j++) {
1671: col = pj[j];
1672: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1673: }
1674: /* U part */
1675: pc1 = b->a + bdiag[i+1]+1;
1676: pj = b->j + bdiag[i+1]+1;
1677: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1678: for (j=0; j<nz; j++) {
1679: col = pj[j];
1680: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1681: }
1683: sctx.rs = rs;
1684: sctx.pv = rtmp1[i];
1685: MatPivotCheck(B,A,info,&sctx,i);
1686: if (sctx.newshift) break;
1687: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1688: *pc1 = 1.0/sctx.pv;
1690: /* Now take care of 1st column of diagonal 4x4 block. */
1691: pc2 = rtmp2 + i;
1692: pc3 = rtmp3 + i;
1693: pc4 = rtmp4 + i;
1694: if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1695: mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1696: mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1697: mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1698: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1699: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1700: for (j=0; j<nz; j++) {
1701: col = pj[j];
1702: rtmp2[col] -= mul2 * rtmp1[col];
1703: rtmp3[col] -= mul3 * rtmp1[col];
1704: rtmp4[col] -= mul4 * rtmp1[col];
1705: }
1706: PetscLogFlops(3+6*nz);
1707: }
1709: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1710: rs = 0.0;
1711: /* L part */
1712: pc2 = b->a + bi[i+1];
1713: pj = b->j + bi[i+1];
1714: nz = bi[i+2] - bi[i+1];
1715: for (j=0; j<nz; j++) {
1716: col = pj[j];
1717: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1718: }
1719: /* U part */
1720: pc2 = b->a + bdiag[i+2]+1;
1721: pj = b->j + bdiag[i+2]+1;
1722: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1723: for (j=0; j<nz; j++) {
1724: col = pj[j];
1725: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1726: }
1728: sctx.rs = rs;
1729: sctx.pv = rtmp2[i+1];
1730: MatPivotCheck(B,A,info,&sctx,i+1);
1731: if (sctx.newshift) break;
1732: pc2 = b->a + bdiag[i+1];
1733: *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1735: /* Now take care of 2nd column of diagonal 4x4 block. */
1736: pc3 = rtmp3 + i+1;
1737: pc4 = rtmp4 + i+1;
1738: if (*pc3 != 0.0 || *pc4 != 0.0) {
1739: mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1740: mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1741: pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */
1742: nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1743: for (j=0; j<nz; j++) {
1744: col = pj[j];
1745: rtmp3[col] -= mul3 * rtmp2[col];
1746: rtmp4[col] -= mul4 * rtmp2[col];
1747: }
1748: PetscLogFlops(4*nz);
1749: }
1751: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1752: rs = 0.0;
1753: /* L part */
1754: pc3 = b->a + bi[i+2];
1755: pj = b->j + bi[i+2];
1756: nz = bi[i+3] - bi[i+2];
1757: for (j=0; j<nz; j++) {
1758: col = pj[j];
1759: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1760: }
1761: /* U part */
1762: pc3 = b->a + bdiag[i+3]+1;
1763: pj = b->j + bdiag[i+3]+1;
1764: nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1765: for (j=0; j<nz; j++) {
1766: col = pj[j];
1767: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1768: }
1770: sctx.rs = rs;
1771: sctx.pv = rtmp3[i+2];
1772: MatPivotCheck(B,A,info,&sctx,i+2);
1773: if (sctx.newshift) break;
1774: pc3 = b->a + bdiag[i+2];
1775: *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1777: /* Now take care of 3rd column of diagonal 4x4 block. */
1778: pc4 = rtmp4 + i+2;
1779: if (*pc4 != 0.0) {
1780: mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1781: pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */
1782: nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1783: for (j=0; j<nz; j++) {
1784: col = pj[j];
1785: rtmp4[col] -= mul4 * rtmp3[col];
1786: }
1787: PetscLogFlops(1+2*nz);
1788: }
1790: /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1791: rs = 0.0;
1792: /* L part */
1793: pc4 = b->a + bi[i+3];
1794: pj = b->j + bi[i+3];
1795: nz = bi[i+4] - bi[i+3];
1796: for (j=0; j<nz; j++) {
1797: col = pj[j];
1798: pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1799: }
1800: /* U part */
1801: pc4 = b->a + bdiag[i+4]+1;
1802: pj = b->j + bdiag[i+4]+1;
1803: nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1804: for (j=0; j<nz; j++) {
1805: col = pj[j];
1806: pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1807: }
1809: sctx.rs = rs;
1810: sctx.pv = rtmp4[i+3];
1811: MatPivotCheck(B,A,info,&sctx,i+3);
1812: if (sctx.newshift) break;
1813: pc4 = b->a + bdiag[i+3];
1814: *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1815: break;
1817: default:
1818: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1819: }
1820: if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1821: i += nodesz; /* Update the row */
1822: }
1824: /* MatPivotRefine() */
1825: if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1826: /*
1827: * if no shift in this attempt & shifting & started shifting & can refine,
1828: * then try lower shift
1829: */
1830: sctx.shift_hi = sctx.shift_fraction;
1831: sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1832: sctx.shift_amount = sctx.shift_fraction * sctx.shift_top;
1833: sctx.newshift = PETSC_TRUE;
1834: sctx.nshift++;
1835: }
1836: } while (sctx.newshift);
1838: PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);
1839: PetscFree(tmp_vec2);
1840: ISRestoreIndices(isicol,&ic);
1841: ISRestoreIndices(isrow,&r);
1843: if (b->inode.size) {
1844: C->ops->solve = MatSolve_SeqAIJ_Inode;
1845: } else {
1846: C->ops->solve = MatSolve_SeqAIJ;
1847: }
1848: C->ops->solveadd = MatSolveAdd_SeqAIJ;
1849: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ;
1850: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1851: C->ops->matsolve = MatMatSolve_SeqAIJ;
1852: C->assembled = PETSC_TRUE;
1853: C->preallocated = PETSC_TRUE;
1855: PetscLogFlops(C->cmap->n);
1857: /* MatShiftView(A,info,&sctx) */
1858: if (sctx.nshift) {
1859: if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1860: PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
1861: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1862: PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
1863: } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1864: PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);
1865: }
1866: }
1867: return(0);
1868: }
1870: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1871: {
1872: Mat C = B;
1873: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1874: IS iscol = b->col,isrow = b->row,isicol = b->icol;
1875: PetscErrorCode ierr;
1876: const PetscInt *r,*ic,*c,*ics;
1877: PetscInt n = A->rmap->n,*bi = b->i;
1878: PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1879: PetscInt i,j,idx,*bd = b->diag,node_max,nodesz;
1880: PetscInt *ai = a->i,*aj = a->j;
1881: PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1882: PetscScalar mul1,mul2,mul3,tmp;
1883: MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1884: const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1885: PetscReal rs=0.0;
1886: FactorShiftCtx sctx;
1889: sctx.shift_top = 0;
1890: sctx.nshift_max = 0;
1891: sctx.shift_lo = 0;
1892: sctx.shift_hi = 0;
1893: sctx.shift_fraction = 0;
1895: /* if both shift schemes are chosen by user, only use info->shiftpd */
1896: if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1897: sctx.shift_top = 0;
1898: for (i=0; i<n; i++) {
1899: /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1900: rs = 0.0;
1901: ajtmp = aj + ai[i];
1902: rtmp1 = aa + ai[i];
1903: nz = ai[i+1] - ai[i];
1904: for (j=0; j<nz; j++) {
1905: if (*ajtmp != i) {
1906: rs += PetscAbsScalar(*rtmp1++);
1907: } else {
1908: rs -= PetscRealPart(*rtmp1++);
1909: }
1910: ajtmp++;
1911: }
1912: if (rs>sctx.shift_top) sctx.shift_top = rs;
1913: }
1914: if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1915: sctx.shift_top *= 1.1;
1916: sctx.nshift_max = 5;
1917: sctx.shift_lo = 0.;
1918: sctx.shift_hi = 1.;
1919: }
1920: sctx.shift_amount = 0;
1921: sctx.nshift = 0;
1923: ISGetIndices(isrow,&r);
1924: ISGetIndices(iscol,&c);
1925: ISGetIndices(isicol,&ic);
1926: PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);
1927: ics = ic;
1929: node_max = a->inode.node_count;
1930: ns = a->inode.size;
1931: if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1933: /* If max inode size > 3, split it into two inodes.*/
1934: /* also map the inode sizes according to the ordering */
1935: PetscMalloc1(n+1,&tmp_vec1);
1936: for (i=0,j=0; i<node_max; ++i,++j) {
1937: if (ns[i]>3) {
1938: tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */
1939: ++j;
1940: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1941: } else {
1942: tmp_vec1[j] = ns[i];
1943: }
1944: }
1945: /* Use the correct node_max */
1946: node_max = j;
1948: /* Now reorder the inode info based on mat re-ordering info */
1949: /* First create a row -> inode_size_array_index map */
1950: PetscMalloc1(n+1,&nsmap);
1951: PetscMalloc1(node_max+1,&tmp_vec2);
1952: for (i=0,row=0; i<node_max; i++) {
1953: nodesz = tmp_vec1[i];
1954: for (j=0; j<nodesz; j++,row++) {
1955: nsmap[row] = i;
1956: }
1957: }
1958: /* Using nsmap, create a reordered ns structure */
1959: for (i=0,j=0; i< node_max; i++) {
1960: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1961: tmp_vec2[i] = nodesz;
1962: j += nodesz;
1963: }
1964: PetscFree(nsmap);
1965: PetscFree(tmp_vec1);
1966: /* Now use the correct ns */
1967: ns = tmp_vec2;
1969: do {
1970: sctx.newshift = PETSC_FALSE;
1971: /* Now loop over each block-row, and do the factorization */
1972: for (i=0,row=0; i<node_max; i++) {
1973: nodesz = ns[i];
1974: nz = bi[row+1] - bi[row];
1975: bjtmp = bj + bi[row];
1977: switch (nodesz) {
1978: case 1:
1979: for (j=0; j<nz; j++) {
1980: idx = bjtmp[j];
1981: rtmp11[idx] = 0.0;
1982: }
1984: /* load in initial (unfactored row) */
1985: idx = r[row];
1986: nz_tmp = ai[idx+1] - ai[idx];
1987: ajtmp = aj + ai[idx];
1988: v1 = aa + ai[idx];
1990: for (j=0; j<nz_tmp; j++) {
1991: idx = ics[ajtmp[j]];
1992: rtmp11[idx] = v1[j];
1993: }
1994: rtmp11[ics[r[row]]] += sctx.shift_amount;
1996: prow = *bjtmp++;
1997: while (prow < row) {
1998: pc1 = rtmp11 + prow;
1999: if (*pc1 != 0.0) {
2000: pv = ba + bd[prow];
2001: pj = nbj + bd[prow];
2002: mul1 = *pc1 * *pv++;
2003: *pc1 = mul1;
2004: nz_tmp = bi[prow+1] - bd[prow] - 1;
2005: PetscLogFlops(1+2*nz_tmp);
2006: for (j=0; j<nz_tmp; j++) {
2007: tmp = pv[j];
2008: idx = pj[j];
2009: rtmp11[idx] -= mul1 * tmp;
2010: }
2011: }
2012: prow = *bjtmp++;
2013: }
2014: pj = bj + bi[row];
2015: pc1 = ba + bi[row];
2017: sctx.pv = rtmp11[row];
2018: rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2019: rs = 0.0;
2020: for (j=0; j<nz; j++) {
2021: idx = pj[j];
2022: pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2023: if (idx != row) rs += PetscAbsScalar(pc1[j]);
2024: }
2025: sctx.rs = rs;
2026: MatPivotCheck(B,A,info,&sctx,row);
2027: if (sctx.newshift) goto endofwhile;
2028: break;
2030: case 2:
2031: for (j=0; j<nz; j++) {
2032: idx = bjtmp[j];
2033: rtmp11[idx] = 0.0;
2034: rtmp22[idx] = 0.0;
2035: }
2037: /* load in initial (unfactored row) */
2038: idx = r[row];
2039: nz_tmp = ai[idx+1] - ai[idx];
2040: ajtmp = aj + ai[idx];
2041: v1 = aa + ai[idx];
2042: v2 = aa + ai[idx+1];
2043: for (j=0; j<nz_tmp; j++) {
2044: idx = ics[ajtmp[j]];
2045: rtmp11[idx] = v1[j];
2046: rtmp22[idx] = v2[j];
2047: }
2048: rtmp11[ics[r[row]]] += sctx.shift_amount;
2049: rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2051: prow = *bjtmp++;
2052: while (prow < row) {
2053: pc1 = rtmp11 + prow;
2054: pc2 = rtmp22 + prow;
2055: if (*pc1 != 0.0 || *pc2 != 0.0) {
2056: pv = ba + bd[prow];
2057: pj = nbj + bd[prow];
2058: mul1 = *pc1 * *pv;
2059: mul2 = *pc2 * *pv;
2060: ++pv;
2061: *pc1 = mul1;
2062: *pc2 = mul2;
2064: nz_tmp = bi[prow+1] - bd[prow] - 1;
2065: for (j=0; j<nz_tmp; j++) {
2066: tmp = pv[j];
2067: idx = pj[j];
2068: rtmp11[idx] -= mul1 * tmp;
2069: rtmp22[idx] -= mul2 * tmp;
2070: }
2071: PetscLogFlops(2+4*nz_tmp);
2072: }
2073: prow = *bjtmp++;
2074: }
2076: /* Now take care of diagonal 2x2 block. Note: prow = row here */
2077: pc1 = rtmp11 + prow;
2078: pc2 = rtmp22 + prow;
2080: sctx.pv = *pc1;
2081: pj = bj + bi[prow];
2082: rs = 0.0;
2083: for (j=0; j<nz; j++) {
2084: idx = pj[j];
2085: if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2086: }
2087: sctx.rs = rs;
2088: MatPivotCheck(B,A,info,&sctx,row);
2089: if (sctx.newshift) goto endofwhile;
2091: if (*pc2 != 0.0) {
2092: pj = nbj + bd[prow];
2093: mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2094: *pc2 = mul2;
2095: nz_tmp = bi[prow+1] - bd[prow] - 1;
2096: for (j=0; j<nz_tmp; j++) {
2097: idx = pj[j];
2098: tmp = rtmp11[idx];
2099: rtmp22[idx] -= mul2 * tmp;
2100: }
2101: PetscLogFlops(1+2*nz_tmp);
2102: }
2104: pj = bj + bi[row];
2105: pc1 = ba + bi[row];
2106: pc2 = ba + bi[row+1];
2108: sctx.pv = rtmp22[row+1];
2109: rs = 0.0;
2110: rtmp11[row] = 1.0/rtmp11[row];
2111: rtmp22[row+1] = 1.0/rtmp22[row+1];
2112: /* copy row entries from dense representation to sparse */
2113: for (j=0; j<nz; j++) {
2114: idx = pj[j];
2115: pc1[j] = rtmp11[idx];
2116: pc2[j] = rtmp22[idx];
2117: if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2118: }
2119: sctx.rs = rs;
2120: MatPivotCheck(B,A,info,&sctx,row+1);
2121: if (sctx.newshift) goto endofwhile;
2122: break;
2124: case 3:
2125: for (j=0; j<nz; j++) {
2126: idx = bjtmp[j];
2127: rtmp11[idx] = 0.0;
2128: rtmp22[idx] = 0.0;
2129: rtmp33[idx] = 0.0;
2130: }
2131: /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2132: idx = r[row];
2133: nz_tmp = ai[idx+1] - ai[idx];
2134: ajtmp = aj + ai[idx];
2135: v1 = aa + ai[idx];
2136: v2 = aa + ai[idx+1];
2137: v3 = aa + ai[idx+2];
2138: for (j=0; j<nz_tmp; j++) {
2139: idx = ics[ajtmp[j]];
2140: rtmp11[idx] = v1[j];
2141: rtmp22[idx] = v2[j];
2142: rtmp33[idx] = v3[j];
2143: }
2144: rtmp11[ics[r[row]]] += sctx.shift_amount;
2145: rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2146: rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2148: /* loop over all pivot row blocks above this row block */
2149: prow = *bjtmp++;
2150: while (prow < row) {
2151: pc1 = rtmp11 + prow;
2152: pc2 = rtmp22 + prow;
2153: pc3 = rtmp33 + prow;
2154: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2155: pv = ba + bd[prow];
2156: pj = nbj + bd[prow];
2157: mul1 = *pc1 * *pv;
2158: mul2 = *pc2 * *pv;
2159: mul3 = *pc3 * *pv;
2160: ++pv;
2161: *pc1 = mul1;
2162: *pc2 = mul2;
2163: *pc3 = mul3;
2165: nz_tmp = bi[prow+1] - bd[prow] - 1;
2166: /* update this row based on pivot row */
2167: for (j=0; j<nz_tmp; j++) {
2168: tmp = pv[j];
2169: idx = pj[j];
2170: rtmp11[idx] -= mul1 * tmp;
2171: rtmp22[idx] -= mul2 * tmp;
2172: rtmp33[idx] -= mul3 * tmp;
2173: }
2174: PetscLogFlops(3+6*nz_tmp);
2175: }
2176: prow = *bjtmp++;
2177: }
2179: /* Now take care of diagonal 3x3 block in this set of rows */
2180: /* note: prow = row here */
2181: pc1 = rtmp11 + prow;
2182: pc2 = rtmp22 + prow;
2183: pc3 = rtmp33 + prow;
2185: sctx.pv = *pc1;
2186: pj = bj + bi[prow];
2187: rs = 0.0;
2188: for (j=0; j<nz; j++) {
2189: idx = pj[j];
2190: if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2191: }
2192: sctx.rs = rs;
2193: MatPivotCheck(B,A,info,&sctx,row);
2194: if (sctx.newshift) goto endofwhile;
2196: if (*pc2 != 0.0 || *pc3 != 0.0) {
2197: mul2 = (*pc2)/(*pc1);
2198: mul3 = (*pc3)/(*pc1);
2199: *pc2 = mul2;
2200: *pc3 = mul3;
2201: nz_tmp = bi[prow+1] - bd[prow] - 1;
2202: pj = nbj + bd[prow];
2203: for (j=0; j<nz_tmp; j++) {
2204: idx = pj[j];
2205: tmp = rtmp11[idx];
2206: rtmp22[idx] -= mul2 * tmp;
2207: rtmp33[idx] -= mul3 * tmp;
2208: }
2209: PetscLogFlops(2+4*nz_tmp);
2210: }
2211: ++prow;
2213: pc2 = rtmp22 + prow;
2214: pc3 = rtmp33 + prow;
2215: sctx.pv = *pc2;
2216: pj = bj + bi[prow];
2217: rs = 0.0;
2218: for (j=0; j<nz; j++) {
2219: idx = pj[j];
2220: if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2221: }
2222: sctx.rs = rs;
2223: MatPivotCheck(B,A,info,&sctx,row+1);
2224: if (sctx.newshift) goto endofwhile;
2226: if (*pc3 != 0.0) {
2227: mul3 = (*pc3)/(*pc2);
2228: *pc3 = mul3;
2229: pj = nbj + bd[prow];
2230: nz_tmp = bi[prow+1] - bd[prow] - 1;
2231: for (j=0; j<nz_tmp; j++) {
2232: idx = pj[j];
2233: tmp = rtmp22[idx];
2234: rtmp33[idx] -= mul3 * tmp;
2235: }
2236: PetscLogFlops(1+2*nz_tmp);
2237: }
2239: pj = bj + bi[row];
2240: pc1 = ba + bi[row];
2241: pc2 = ba + bi[row+1];
2242: pc3 = ba + bi[row+2];
2244: sctx.pv = rtmp33[row+2];
2245: rs = 0.0;
2246: rtmp11[row] = 1.0/rtmp11[row];
2247: rtmp22[row+1] = 1.0/rtmp22[row+1];
2248: rtmp33[row+2] = 1.0/rtmp33[row+2];
2249: /* copy row entries from dense representation to sparse */
2250: for (j=0; j<nz; j++) {
2251: idx = pj[j];
2252: pc1[j] = rtmp11[idx];
2253: pc2[j] = rtmp22[idx];
2254: pc3[j] = rtmp33[idx];
2255: if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2256: }
2258: sctx.rs = rs;
2259: MatPivotCheck(B,A,info,&sctx,row+2);
2260: if (sctx.newshift) goto endofwhile;
2261: break;
2263: default:
2264: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2265: }
2266: row += nodesz; /* Update the row */
2267: }
2268: endofwhile:;
2269: } while (sctx.newshift);
2270: PetscFree3(rtmp11,rtmp22,rtmp33);
2271: PetscFree(tmp_vec2);
2272: ISRestoreIndices(isicol,&ic);
2273: ISRestoreIndices(isrow,&r);
2274: ISRestoreIndices(iscol,&c);
2276: (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2277: /* do not set solve add, since MatSolve_Inode + Add is faster */
2278: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace;
2279: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2280: C->assembled = PETSC_TRUE;
2281: C->preallocated = PETSC_TRUE;
2282: if (sctx.nshift) {
2283: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2284: PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
2285: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2286: PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
2287: }
2288: }
2289: PetscLogFlops(C->cmap->n);
2290: MatSeqAIJCheckInode(C);
2291: return(0);
2292: }
2295: /* ----------------------------------------------------------- */
2296: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2297: {
2298: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2299: IS iscol = a->col,isrow = a->row;
2300: PetscErrorCode ierr;
2301: const PetscInt *r,*c,*rout,*cout;
2302: PetscInt i,j,n = A->rmap->n;
2303: PetscInt node_max,row,nsz,aii,i0,i1,nz;
2304: const PetscInt *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2305: PetscScalar *x,*tmp,*tmps,tmp0,tmp1;
2306: PetscScalar sum1,sum2,sum3,sum4,sum5;
2307: const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2308: const PetscScalar *b;
2311: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2312: node_max = a->inode.node_count;
2313: ns = a->inode.size; /* Node Size array */
2315: VecGetArrayRead(bb,&b);
2316: VecGetArray(xx,&x);
2317: tmp = a->solve_work;
2319: ISGetIndices(isrow,&rout); r = rout;
2320: ISGetIndices(iscol,&cout); c = cout;
2322: /* forward solve the lower triangular */
2323: tmps = tmp;
2324: aa = a_a;
2325: aj = a_j;
2326: ad = a->diag;
2328: for (i = 0,row = 0; i< node_max; ++i) {
2329: nsz = ns[i];
2330: aii = ai[row];
2331: v1 = aa + aii;
2332: vi = aj + aii;
2333: nz = ai[row+1]- ai[row];
2335: if (i < node_max-1) {
2336: /* Prefetch the indices for the next block */
2337: PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2338: /* Prefetch the data for the next block */
2339: PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2340: }
2342: switch (nsz) { /* Each loop in 'case' is unrolled */
2343: case 1:
2344: sum1 = b[r[row]];
2345: for (j=0; j<nz-1; j+=2) {
2346: i0 = vi[j];
2347: i1 = vi[j+1];
2348: tmp0 = tmps[i0];
2349: tmp1 = tmps[i1];
2350: sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2351: }
2352: if (j == nz-1) {
2353: tmp0 = tmps[vi[j]];
2354: sum1 -= v1[j]*tmp0;
2355: }
2356: tmp[row++]=sum1;
2357: break;
2358: case 2:
2359: sum1 = b[r[row]];
2360: sum2 = b[r[row+1]];
2361: v2 = aa + ai[row+1];
2363: for (j=0; j<nz-1; j+=2) {
2364: i0 = vi[j];
2365: i1 = vi[j+1];
2366: tmp0 = tmps[i0];
2367: tmp1 = tmps[i1];
2368: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2369: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2370: }
2371: if (j == nz-1) {
2372: tmp0 = tmps[vi[j]];
2373: sum1 -= v1[j] *tmp0;
2374: sum2 -= v2[j] *tmp0;
2375: }
2376: sum2 -= v2[nz] * sum1;
2377: tmp[row++]=sum1;
2378: tmp[row++]=sum2;
2379: break;
2380: case 3:
2381: sum1 = b[r[row]];
2382: sum2 = b[r[row+1]];
2383: sum3 = b[r[row+2]];
2384: v2 = aa + ai[row+1];
2385: v3 = aa + ai[row+2];
2387: for (j=0; j<nz-1; j+=2) {
2388: i0 = vi[j];
2389: i1 = vi[j+1];
2390: tmp0 = tmps[i0];
2391: tmp1 = tmps[i1];
2392: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2393: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2394: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2395: }
2396: if (j == nz-1) {
2397: tmp0 = tmps[vi[j]];
2398: sum1 -= v1[j] *tmp0;
2399: sum2 -= v2[j] *tmp0;
2400: sum3 -= v3[j] *tmp0;
2401: }
2402: sum2 -= v2[nz] * sum1;
2403: sum3 -= v3[nz] * sum1;
2404: sum3 -= v3[nz+1] * sum2;
2405: tmp[row++]=sum1;
2406: tmp[row++]=sum2;
2407: tmp[row++]=sum3;
2408: break;
2410: case 4:
2411: sum1 = b[r[row]];
2412: sum2 = b[r[row+1]];
2413: sum3 = b[r[row+2]];
2414: sum4 = b[r[row+3]];
2415: v2 = aa + ai[row+1];
2416: v3 = aa + ai[row+2];
2417: v4 = aa + ai[row+3];
2419: for (j=0; j<nz-1; j+=2) {
2420: i0 = vi[j];
2421: i1 = vi[j+1];
2422: tmp0 = tmps[i0];
2423: tmp1 = tmps[i1];
2424: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2425: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2426: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2427: sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2428: }
2429: if (j == nz-1) {
2430: tmp0 = tmps[vi[j]];
2431: sum1 -= v1[j] *tmp0;
2432: sum2 -= v2[j] *tmp0;
2433: sum3 -= v3[j] *tmp0;
2434: sum4 -= v4[j] *tmp0;
2435: }
2436: sum2 -= v2[nz] * sum1;
2437: sum3 -= v3[nz] * sum1;
2438: sum4 -= v4[nz] * sum1;
2439: sum3 -= v3[nz+1] * sum2;
2440: sum4 -= v4[nz+1] * sum2;
2441: sum4 -= v4[nz+2] * sum3;
2443: tmp[row++]=sum1;
2444: tmp[row++]=sum2;
2445: tmp[row++]=sum3;
2446: tmp[row++]=sum4;
2447: break;
2448: case 5:
2449: sum1 = b[r[row]];
2450: sum2 = b[r[row+1]];
2451: sum3 = b[r[row+2]];
2452: sum4 = b[r[row+3]];
2453: sum5 = b[r[row+4]];
2454: v2 = aa + ai[row+1];
2455: v3 = aa + ai[row+2];
2456: v4 = aa + ai[row+3];
2457: v5 = aa + ai[row+4];
2459: for (j=0; j<nz-1; j+=2) {
2460: i0 = vi[j];
2461: i1 = vi[j+1];
2462: tmp0 = tmps[i0];
2463: tmp1 = tmps[i1];
2464: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2465: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2466: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2467: sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2468: sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2469: }
2470: if (j == nz-1) {
2471: tmp0 = tmps[vi[j]];
2472: sum1 -= v1[j] *tmp0;
2473: sum2 -= v2[j] *tmp0;
2474: sum3 -= v3[j] *tmp0;
2475: sum4 -= v4[j] *tmp0;
2476: sum5 -= v5[j] *tmp0;
2477: }
2479: sum2 -= v2[nz] * sum1;
2480: sum3 -= v3[nz] * sum1;
2481: sum4 -= v4[nz] * sum1;
2482: sum5 -= v5[nz] * sum1;
2483: sum3 -= v3[nz+1] * sum2;
2484: sum4 -= v4[nz+1] * sum2;
2485: sum5 -= v5[nz+1] * sum2;
2486: sum4 -= v4[nz+2] * sum3;
2487: sum5 -= v5[nz+2] * sum3;
2488: sum5 -= v5[nz+3] * sum4;
2490: tmp[row++]=sum1;
2491: tmp[row++]=sum2;
2492: tmp[row++]=sum3;
2493: tmp[row++]=sum4;
2494: tmp[row++]=sum5;
2495: break;
2496: default:
2497: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2498: }
2499: }
2500: /* backward solve the upper triangular */
2501: for (i=node_max -1,row = n-1; i>=0; i--) {
2502: nsz = ns[i];
2503: aii = ad[row+1] + 1;
2504: v1 = aa + aii;
2505: vi = aj + aii;
2506: nz = ad[row]- ad[row+1] - 1;
2508: if (i > 0) {
2509: /* Prefetch the indices for the next block */
2510: PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2511: /* Prefetch the data for the next block */
2512: PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2513: }
2515: switch (nsz) { /* Each loop in 'case' is unrolled */
2516: case 1:
2517: sum1 = tmp[row];
2519: for (j=0; j<nz-1; j+=2) {
2520: i0 = vi[j];
2521: i1 = vi[j+1];
2522: tmp0 = tmps[i0];
2523: tmp1 = tmps[i1];
2524: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2525: }
2526: if (j == nz-1) {
2527: tmp0 = tmps[vi[j]];
2528: sum1 -= v1[j]*tmp0;
2529: }
2530: x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2531: break;
2532: case 2:
2533: sum1 = tmp[row];
2534: sum2 = tmp[row-1];
2535: v2 = aa + ad[row] + 1;
2536: for (j=0; j<nz-1; j+=2) {
2537: i0 = vi[j];
2538: i1 = vi[j+1];
2539: tmp0 = tmps[i0];
2540: tmp1 = tmps[i1];
2541: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2542: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2543: }
2544: if (j == nz-1) {
2545: tmp0 = tmps[vi[j]];
2546: sum1 -= v1[j]* tmp0;
2547: sum2 -= v2[j+1]* tmp0;
2548: }
2550: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2551: sum2 -= v2[0] * tmp0;
2552: x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2553: break;
2554: case 3:
2555: sum1 = tmp[row];
2556: sum2 = tmp[row -1];
2557: sum3 = tmp[row -2];
2558: v2 = aa + ad[row] + 1;
2559: v3 = aa + ad[row -1] + 1;
2560: for (j=0; j<nz-1; j+=2) {
2561: i0 = vi[j];
2562: i1 = vi[j+1];
2563: tmp0 = tmps[i0];
2564: tmp1 = tmps[i1];
2565: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2566: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2567: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2568: }
2569: if (j== nz-1) {
2570: tmp0 = tmps[vi[j]];
2571: sum1 -= v1[j] * tmp0;
2572: sum2 -= v2[j+1] * tmp0;
2573: sum3 -= v3[j+2] * tmp0;
2574: }
2575: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2576: sum2 -= v2[0]* tmp0;
2577: sum3 -= v3[1] * tmp0;
2578: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2579: sum3 -= v3[0]* tmp0;
2580: x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2582: break;
2583: case 4:
2584: sum1 = tmp[row];
2585: sum2 = tmp[row -1];
2586: sum3 = tmp[row -2];
2587: sum4 = tmp[row -3];
2588: v2 = aa + ad[row]+1;
2589: v3 = aa + ad[row -1]+1;
2590: v4 = aa + ad[row -2]+1;
2592: for (j=0; j<nz-1; j+=2) {
2593: i0 = vi[j];
2594: i1 = vi[j+1];
2595: tmp0 = tmps[i0];
2596: tmp1 = tmps[i1];
2597: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2598: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2599: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2600: sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2601: }
2602: if (j== nz-1) {
2603: tmp0 = tmps[vi[j]];
2604: sum1 -= v1[j] * tmp0;
2605: sum2 -= v2[j+1] * tmp0;
2606: sum3 -= v3[j+2] * tmp0;
2607: sum4 -= v4[j+3] * tmp0;
2608: }
2610: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2611: sum2 -= v2[0] * tmp0;
2612: sum3 -= v3[1] * tmp0;
2613: sum4 -= v4[2] * tmp0;
2614: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2615: sum3 -= v3[0] * tmp0;
2616: sum4 -= v4[1] * tmp0;
2617: tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2618: sum4 -= v4[0] * tmp0;
2619: x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2620: break;
2621: case 5:
2622: sum1 = tmp[row];
2623: sum2 = tmp[row -1];
2624: sum3 = tmp[row -2];
2625: sum4 = tmp[row -3];
2626: sum5 = tmp[row -4];
2627: v2 = aa + ad[row]+1;
2628: v3 = aa + ad[row -1]+1;
2629: v4 = aa + ad[row -2]+1;
2630: v5 = aa + ad[row -3]+1;
2631: for (j=0; j<nz-1; j+=2) {
2632: i0 = vi[j];
2633: i1 = vi[j+1];
2634: tmp0 = tmps[i0];
2635: tmp1 = tmps[i1];
2636: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2637: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2638: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2639: sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2640: sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2641: }
2642: if (j==nz-1) {
2643: tmp0 = tmps[vi[j]];
2644: sum1 -= v1[j] * tmp0;
2645: sum2 -= v2[j+1] * tmp0;
2646: sum3 -= v3[j+2] * tmp0;
2647: sum4 -= v4[j+3] * tmp0;
2648: sum5 -= v5[j+4] * tmp0;
2649: }
2651: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2652: sum2 -= v2[0] * tmp0;
2653: sum3 -= v3[1] * tmp0;
2654: sum4 -= v4[2] * tmp0;
2655: sum5 -= v5[3] * tmp0;
2656: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2657: sum3 -= v3[0] * tmp0;
2658: sum4 -= v4[1] * tmp0;
2659: sum5 -= v5[2] * tmp0;
2660: tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2661: sum4 -= v4[0] * tmp0;
2662: sum5 -= v5[1] * tmp0;
2663: tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2664: sum5 -= v5[0] * tmp0;
2665: x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2666: break;
2667: default:
2668: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2669: }
2670: }
2671: ISRestoreIndices(isrow,&rout);
2672: ISRestoreIndices(iscol,&cout);
2673: VecRestoreArrayRead(bb,&b);
2674: VecRestoreArray(xx,&x);
2675: PetscLogFlops(2.0*a->nz - A->cmap->n);
2676: return(0);
2677: }
2680: /*
2681: Makes a longer coloring[] array and calls the usual code with that
2682: */
2683: PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2684: {
2685: Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data;
2686: PetscErrorCode ierr;
2687: PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2688: PetscInt *colorused,i;
2689: ISColoringValue *newcolor;
2692: PetscMalloc1(n+1,&newcolor);
2693: /* loop over inodes, marking a color for each column*/
2694: row = 0;
2695: for (i=0; i<m; i++) {
2696: for (j=0; j<ns[i]; j++) {
2697: newcolor[row++] = coloring[i] + j*ncolors;
2698: }
2699: }
2701: /* eliminate unneeded colors */
2702: PetscCalloc1(5*ncolors,&colorused);
2703: for (i=0; i<n; i++) {
2704: colorused[newcolor[i]] = 1;
2705: }
2707: for (i=1; i<5*ncolors; i++) {
2708: colorused[i] += colorused[i-1];
2709: }
2710: ncolors = colorused[5*ncolors-1];
2711: for (i=0; i<n; i++) {
2712: newcolor[i] = colorused[newcolor[i]]-1;
2713: }
2714: PetscFree(colorused);
2715: ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);
2716: PetscFree(coloring);
2717: return(0);
2718: }
2720: #include <petsc/private/kernels/blockinvert.h>
2722: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2723: {
2724: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2725: PetscScalar sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2726: MatScalar *ibdiag,*bdiag,work[25],*t;
2727: PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2728: const MatScalar *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2729: const PetscScalar *xb, *b;
2730: PetscReal zeropivot = 100.*PETSC_MACHINE_EPSILON, shift = 0.0;
2731: PetscErrorCode ierr;
2732: PetscInt n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2733: PetscInt sz,k,ipvt[5];
2734: PetscBool allowzeropivot,zeropivotdetected;
2735: const PetscInt *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;
2738: allowzeropivot = PetscNot(A->erroriffailure);
2739: if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2740: if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2742: if (!a->inode.ibdiagvalid) {
2743: if (!a->inode.ibdiag) {
2744: /* calculate space needed for diagonal blocks */
2745: for (i=0; i<m; i++) {
2746: cnt += sizes[i]*sizes[i];
2747: }
2748: a->inode.bdiagsize = cnt;
2750: PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);
2751: }
2753: /* copy over the diagonal blocks and invert them */
2754: ibdiag = a->inode.ibdiag;
2755: bdiag = a->inode.bdiag;
2756: cnt = 0;
2757: for (i=0, row = 0; i<m; i++) {
2758: for (j=0; j<sizes[i]; j++) {
2759: for (k=0; k<sizes[i]; k++) {
2760: bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2761: }
2762: }
2763: PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));
2765: switch (sizes[i]) {
2766: case 1:
2767: /* Create matrix data structure */
2768: if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2769: if (allowzeropivot) {
2770: A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2771: A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2772: A->factorerror_zeropivot_row = row;
2773: PetscInfo1(A,"Zero pivot, row %D\n",row);
2774: } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2775: }
2776: ibdiag[cnt] = 1.0/ibdiag[cnt];
2777: break;
2778: case 2:
2779: PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2780: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2781: break;
2782: case 3:
2783: PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2784: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2785: break;
2786: case 4:
2787: PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2788: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2789: break;
2790: case 5:
2791: PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);
2792: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2793: break;
2794: default:
2795: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2796: }
2797: cnt += sizes[i]*sizes[i];
2798: row += sizes[i];
2799: }
2800: a->inode.ibdiagvalid = PETSC_TRUE;
2801: }
2802: ibdiag = a->inode.ibdiag;
2803: bdiag = a->inode.bdiag;
2804: t = a->inode.ssor_work;
2806: VecGetArray(xx,&x);
2807: VecGetArrayRead(bb,&b);
2808: /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2809: if (flag & SOR_ZERO_INITIAL_GUESS) {
2810: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2812: for (i=0, row=0; i<m; i++) {
2813: sz = diag[row] - ii[row];
2814: v1 = a->a + ii[row];
2815: idx = a->j + ii[row];
2817: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2818: switch (sizes[i]) {
2819: case 1:
2821: sum1 = b[row];
2822: for (n = 0; n<sz-1; n+=2) {
2823: i1 = idx[0];
2824: i2 = idx[1];
2825: idx += 2;
2826: tmp0 = x[i1];
2827: tmp1 = x[i2];
2828: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2829: }
2831: if (n == sz-1) {
2832: tmp0 = x[*idx];
2833: sum1 -= *v1 * tmp0;
2834: }
2835: t[row] = sum1;
2836: x[row++] = sum1*(*ibdiag++);
2837: break;
2838: case 2:
2839: v2 = a->a + ii[row+1];
2840: sum1 = b[row];
2841: sum2 = b[row+1];
2842: for (n = 0; n<sz-1; n+=2) {
2843: i1 = idx[0];
2844: i2 = idx[1];
2845: idx += 2;
2846: tmp0 = x[i1];
2847: tmp1 = x[i2];
2848: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2849: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2850: }
2852: if (n == sz-1) {
2853: tmp0 = x[*idx];
2854: sum1 -= v1[0] * tmp0;
2855: sum2 -= v2[0] * tmp0;
2856: }
2857: t[row] = sum1;
2858: t[row+1] = sum2;
2859: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2860: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2861: ibdiag += 4;
2862: break;
2863: case 3:
2864: v2 = a->a + ii[row+1];
2865: v3 = a->a + ii[row+2];
2866: sum1 = b[row];
2867: sum2 = b[row+1];
2868: sum3 = b[row+2];
2869: for (n = 0; n<sz-1; n+=2) {
2870: i1 = idx[0];
2871: i2 = idx[1];
2872: idx += 2;
2873: tmp0 = x[i1];
2874: tmp1 = x[i2];
2875: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2876: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2877: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2878: }
2880: if (n == sz-1) {
2881: tmp0 = x[*idx];
2882: sum1 -= v1[0] * tmp0;
2883: sum2 -= v2[0] * tmp0;
2884: sum3 -= v3[0] * tmp0;
2885: }
2886: t[row] = sum1;
2887: t[row+1] = sum2;
2888: t[row+2] = sum3;
2889: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2890: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2891: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2892: ibdiag += 9;
2893: break;
2894: case 4:
2895: v2 = a->a + ii[row+1];
2896: v3 = a->a + ii[row+2];
2897: v4 = a->a + ii[row+3];
2898: sum1 = b[row];
2899: sum2 = b[row+1];
2900: sum3 = b[row+2];
2901: sum4 = b[row+3];
2902: for (n = 0; n<sz-1; n+=2) {
2903: i1 = idx[0];
2904: i2 = idx[1];
2905: idx += 2;
2906: tmp0 = x[i1];
2907: tmp1 = x[i2];
2908: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2909: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2910: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2911: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2912: }
2914: if (n == sz-1) {
2915: tmp0 = x[*idx];
2916: sum1 -= v1[0] * tmp0;
2917: sum2 -= v2[0] * tmp0;
2918: sum3 -= v3[0] * tmp0;
2919: sum4 -= v4[0] * tmp0;
2920: }
2921: t[row] = sum1;
2922: t[row+1] = sum2;
2923: t[row+2] = sum3;
2924: t[row+3] = sum4;
2925: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2926: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2927: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2928: x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2929: ibdiag += 16;
2930: break;
2931: case 5:
2932: v2 = a->a + ii[row+1];
2933: v3 = a->a + ii[row+2];
2934: v4 = a->a + ii[row+3];
2935: v5 = a->a + ii[row+4];
2936: sum1 = b[row];
2937: sum2 = b[row+1];
2938: sum3 = b[row+2];
2939: sum4 = b[row+3];
2940: sum5 = b[row+4];
2941: for (n = 0; n<sz-1; n+=2) {
2942: i1 = idx[0];
2943: i2 = idx[1];
2944: idx += 2;
2945: tmp0 = x[i1];
2946: tmp1 = x[i2];
2947: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2948: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2949: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2950: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2951: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2952: }
2954: if (n == sz-1) {
2955: tmp0 = x[*idx];
2956: sum1 -= v1[0] * tmp0;
2957: sum2 -= v2[0] * tmp0;
2958: sum3 -= v3[0] * tmp0;
2959: sum4 -= v4[0] * tmp0;
2960: sum5 -= v5[0] * tmp0;
2961: }
2962: t[row] = sum1;
2963: t[row+1] = sum2;
2964: t[row+2] = sum3;
2965: t[row+3] = sum4;
2966: t[row+4] = sum5;
2967: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2968: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2969: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2970: x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2971: x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2972: ibdiag += 25;
2973: break;
2974: default:
2975: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2976: }
2977: }
2979: xb = t;
2980: PetscLogFlops(a->nz);
2981: } else xb = b;
2982: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
2984: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2985: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2986: ibdiag -= sizes[i]*sizes[i];
2987: sz = ii[row+1] - diag[row] - 1;
2988: v1 = a->a + diag[row] + 1;
2989: idx = a->j + diag[row] + 1;
2991: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2992: switch (sizes[i]) {
2993: case 1:
2995: sum1 = xb[row];
2996: for (n = 0; n<sz-1; n+=2) {
2997: i1 = idx[0];
2998: i2 = idx[1];
2999: idx += 2;
3000: tmp0 = x[i1];
3001: tmp1 = x[i2];
3002: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3003: }
3005: if (n == sz-1) {
3006: tmp0 = x[*idx];
3007: sum1 -= *v1*tmp0;
3008: }
3009: x[row--] = sum1*(*ibdiag);
3010: break;
3012: case 2:
3014: sum1 = xb[row];
3015: sum2 = xb[row-1];
3016: /* note that sum1 is associated with the second of the two rows */
3017: v2 = a->a + diag[row-1] + 2;
3018: for (n = 0; n<sz-1; n+=2) {
3019: i1 = idx[0];
3020: i2 = idx[1];
3021: idx += 2;
3022: tmp0 = x[i1];
3023: tmp1 = x[i2];
3024: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3025: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3026: }
3028: if (n == sz-1) {
3029: tmp0 = x[*idx];
3030: sum1 -= *v1*tmp0;
3031: sum2 -= *v2*tmp0;
3032: }
3033: x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3034: x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3035: break;
3036: case 3:
3038: sum1 = xb[row];
3039: sum2 = xb[row-1];
3040: sum3 = xb[row-2];
3041: v2 = a->a + diag[row-1] + 2;
3042: v3 = a->a + diag[row-2] + 3;
3043: for (n = 0; n<sz-1; n+=2) {
3044: i1 = idx[0];
3045: i2 = idx[1];
3046: idx += 2;
3047: tmp0 = x[i1];
3048: tmp1 = x[i2];
3049: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3050: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3051: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3052: }
3054: if (n == sz-1) {
3055: tmp0 = x[*idx];
3056: sum1 -= *v1*tmp0;
3057: sum2 -= *v2*tmp0;
3058: sum3 -= *v3*tmp0;
3059: }
3060: x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3061: x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3062: x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3063: break;
3064: case 4:
3066: sum1 = xb[row];
3067: sum2 = xb[row-1];
3068: sum3 = xb[row-2];
3069: sum4 = xb[row-3];
3070: v2 = a->a + diag[row-1] + 2;
3071: v3 = a->a + diag[row-2] + 3;
3072: v4 = a->a + diag[row-3] + 4;
3073: for (n = 0; n<sz-1; n+=2) {
3074: i1 = idx[0];
3075: i2 = idx[1];
3076: idx += 2;
3077: tmp0 = x[i1];
3078: tmp1 = x[i2];
3079: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3080: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3081: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3082: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3083: }
3085: if (n == sz-1) {
3086: tmp0 = x[*idx];
3087: sum1 -= *v1*tmp0;
3088: sum2 -= *v2*tmp0;
3089: sum3 -= *v3*tmp0;
3090: sum4 -= *v4*tmp0;
3091: }
3092: x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3093: x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3094: x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3095: x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3096: break;
3097: case 5:
3099: sum1 = xb[row];
3100: sum2 = xb[row-1];
3101: sum3 = xb[row-2];
3102: sum4 = xb[row-3];
3103: sum5 = xb[row-4];
3104: v2 = a->a + diag[row-1] + 2;
3105: v3 = a->a + diag[row-2] + 3;
3106: v4 = a->a + diag[row-3] + 4;
3107: v5 = a->a + diag[row-4] + 5;
3108: for (n = 0; n<sz-1; n+=2) {
3109: i1 = idx[0];
3110: i2 = idx[1];
3111: idx += 2;
3112: tmp0 = x[i1];
3113: tmp1 = x[i2];
3114: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3115: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3116: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3117: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3118: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3119: }
3121: if (n == sz-1) {
3122: tmp0 = x[*idx];
3123: sum1 -= *v1*tmp0;
3124: sum2 -= *v2*tmp0;
3125: sum3 -= *v3*tmp0;
3126: sum4 -= *v4*tmp0;
3127: sum5 -= *v5*tmp0;
3128: }
3129: x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3130: x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3131: x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3132: x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3133: x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3134: break;
3135: default:
3136: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3137: }
3138: }
3140: PetscLogFlops(a->nz);
3141: }
3142: its--;
3143: }
3144: while (its--) {
3146: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3147: for (i=0, row=0, ibdiag = a->inode.ibdiag;
3148: i<m;
3149: row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {
3151: sz = diag[row] - ii[row];
3152: v1 = a->a + ii[row];
3153: idx = a->j + ii[row];
3154: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3155: switch (sizes[i]) {
3156: case 1:
3157: sum1 = b[row];
3158: for (n = 0; n<sz-1; n+=2) {
3159: i1 = idx[0];
3160: i2 = idx[1];
3161: idx += 2;
3162: tmp0 = x[i1];
3163: tmp1 = x[i2];
3164: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3165: }
3166: if (n == sz-1) {
3167: tmp0 = x[*idx++];
3168: sum1 -= *v1 * tmp0;
3169: v1++;
3170: }
3171: t[row] = sum1;
3172: sz = ii[row+1] - diag[row] - 1;
3173: idx = a->j + diag[row] + 1;
3174: v1 += 1;
3175: for (n = 0; n<sz-1; n+=2) {
3176: i1 = idx[0];
3177: i2 = idx[1];
3178: idx += 2;
3179: tmp0 = x[i1];
3180: tmp1 = x[i2];
3181: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3182: }
3183: if (n == sz-1) {
3184: tmp0 = x[*idx++];
3185: sum1 -= *v1 * tmp0;
3186: }
3187: /* in MatSOR_SeqAIJ this line would be
3188: *
3189: * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3190: *
3191: * but omega == 1, so this becomes
3192: *
3193: * x[row] = sum1*(*ibdiag++);
3194: *
3195: */
3196: x[row] = sum1*(*ibdiag);
3197: break;
3198: case 2:
3199: v2 = a->a + ii[row+1];
3200: sum1 = b[row];
3201: sum2 = b[row+1];
3202: for (n = 0; n<sz-1; n+=2) {
3203: i1 = idx[0];
3204: i2 = idx[1];
3205: idx += 2;
3206: tmp0 = x[i1];
3207: tmp1 = x[i2];
3208: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3209: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3210: }
3211: if (n == sz-1) {
3212: tmp0 = x[*idx++];
3213: sum1 -= v1[0] * tmp0;
3214: sum2 -= v2[0] * tmp0;
3215: v1++; v2++;
3216: }
3217: t[row] = sum1;
3218: t[row+1] = sum2;
3219: sz = ii[row+1] - diag[row] - 2;
3220: idx = a->j + diag[row] + 2;
3221: v1 += 2;
3222: v2 += 2;
3223: for (n = 0; n<sz-1; n+=2) {
3224: i1 = idx[0];
3225: i2 = idx[1];
3226: idx += 2;
3227: tmp0 = x[i1];
3228: tmp1 = x[i2];
3229: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3230: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3231: }
3232: if (n == sz-1) {
3233: tmp0 = x[*idx];
3234: sum1 -= v1[0] * tmp0;
3235: sum2 -= v2[0] * tmp0;
3236: }
3237: x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3238: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3239: break;
3240: case 3:
3241: v2 = a->a + ii[row+1];
3242: v3 = a->a + ii[row+2];
3243: sum1 = b[row];
3244: sum2 = b[row+1];
3245: sum3 = b[row+2];
3246: for (n = 0; n<sz-1; n+=2) {
3247: i1 = idx[0];
3248: i2 = idx[1];
3249: idx += 2;
3250: tmp0 = x[i1];
3251: tmp1 = x[i2];
3252: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3253: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3254: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3255: }
3256: if (n == sz-1) {
3257: tmp0 = x[*idx++];
3258: sum1 -= v1[0] * tmp0;
3259: sum2 -= v2[0] * tmp0;
3260: sum3 -= v3[0] * tmp0;
3261: v1++; v2++; v3++;
3262: }
3263: t[row] = sum1;
3264: t[row+1] = sum2;
3265: t[row+2] = sum3;
3266: sz = ii[row+1] - diag[row] - 3;
3267: idx = a->j + diag[row] + 3;
3268: v1 += 3;
3269: v2 += 3;
3270: v3 += 3;
3271: for (n = 0; n<sz-1; n+=2) {
3272: i1 = idx[0];
3273: i2 = idx[1];
3274: idx += 2;
3275: tmp0 = x[i1];
3276: tmp1 = x[i2];
3277: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3278: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3279: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3280: }
3281: if (n == sz-1) {
3282: tmp0 = x[*idx];
3283: sum1 -= v1[0] * tmp0;
3284: sum2 -= v2[0] * tmp0;
3285: sum3 -= v3[0] * tmp0;
3286: }
3287: x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3288: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3289: x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3290: break;
3291: case 4:
3292: v2 = a->a + ii[row+1];
3293: v3 = a->a + ii[row+2];
3294: v4 = a->a + ii[row+3];
3295: sum1 = b[row];
3296: sum2 = b[row+1];
3297: sum3 = b[row+2];
3298: sum4 = b[row+3];
3299: for (n = 0; n<sz-1; n+=2) {
3300: i1 = idx[0];
3301: i2 = idx[1];
3302: idx += 2;
3303: tmp0 = x[i1];
3304: tmp1 = x[i2];
3305: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3306: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3307: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3308: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3309: }
3310: if (n == sz-1) {
3311: tmp0 = x[*idx++];
3312: sum1 -= v1[0] * tmp0;
3313: sum2 -= v2[0] * tmp0;
3314: sum3 -= v3[0] * tmp0;
3315: sum4 -= v4[0] * tmp0;
3316: v1++; v2++; v3++; v4++;
3317: }
3318: t[row] = sum1;
3319: t[row+1] = sum2;
3320: t[row+2] = sum3;
3321: t[row+3] = sum4;
3322: sz = ii[row+1] - diag[row] - 4;
3323: idx = a->j + diag[row] + 4;
3324: v1 += 4;
3325: v2 += 4;
3326: v3 += 4;
3327: v4 += 4;
3328: for (n = 0; n<sz-1; n+=2) {
3329: i1 = idx[0];
3330: i2 = idx[1];
3331: idx += 2;
3332: tmp0 = x[i1];
3333: tmp1 = x[i2];
3334: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3335: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3336: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3337: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3338: }
3339: if (n == sz-1) {
3340: tmp0 = x[*idx];
3341: sum1 -= v1[0] * tmp0;
3342: sum2 -= v2[0] * tmp0;
3343: sum3 -= v3[0] * tmp0;
3344: sum4 -= v4[0] * tmp0;
3345: }
3346: x[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3347: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3348: x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3349: x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3350: break;
3351: case 5:
3352: v2 = a->a + ii[row+1];
3353: v3 = a->a + ii[row+2];
3354: v4 = a->a + ii[row+3];
3355: v5 = a->a + ii[row+4];
3356: sum1 = b[row];
3357: sum2 = b[row+1];
3358: sum3 = b[row+2];
3359: sum4 = b[row+3];
3360: sum5 = b[row+4];
3361: for (n = 0; n<sz-1; n+=2) {
3362: i1 = idx[0];
3363: i2 = idx[1];
3364: idx += 2;
3365: tmp0 = x[i1];
3366: tmp1 = x[i2];
3367: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3368: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3369: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3370: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3371: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3372: }
3373: if (n == sz-1) {
3374: tmp0 = x[*idx++];
3375: sum1 -= v1[0] * tmp0;
3376: sum2 -= v2[0] * tmp0;
3377: sum3 -= v3[0] * tmp0;
3378: sum4 -= v4[0] * tmp0;
3379: sum5 -= v5[0] * tmp0;
3380: v1++; v2++; v3++; v4++; v5++;
3381: }
3382: t[row] = sum1;
3383: t[row+1] = sum2;
3384: t[row+2] = sum3;
3385: t[row+3] = sum4;
3386: t[row+4] = sum5;
3387: sz = ii[row+1] - diag[row] - 5;
3388: idx = a->j + diag[row] + 5;
3389: v1 += 5;
3390: v2 += 5;
3391: v3 += 5;
3392: v4 += 5;
3393: v5 += 5;
3394: for (n = 0; n<sz-1; n+=2) {
3395: i1 = idx[0];
3396: i2 = idx[1];
3397: idx += 2;
3398: tmp0 = x[i1];
3399: tmp1 = x[i2];
3400: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3401: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3402: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3403: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3404: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3405: }
3406: if (n == sz-1) {
3407: tmp0 = x[*idx];
3408: sum1 -= v1[0] * tmp0;
3409: sum2 -= v2[0] * tmp0;
3410: sum3 -= v3[0] * tmp0;
3411: sum4 -= v4[0] * tmp0;
3412: sum5 -= v5[0] * tmp0;
3413: }
3414: x[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3415: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3416: x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3417: x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3418: x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3419: break;
3420: default:
3421: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3422: }
3423: }
3424: xb = t;
3425: PetscLogFlops(2.0*a->nz); /* undercounts diag inverse */
3426: } else xb = b;
3428: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3430: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3431: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3432: ibdiag -= sizes[i]*sizes[i];
3434: /* set RHS */
3435: if (xb == b) {
3436: /* whole (old way) */
3437: sz = ii[row+1] - ii[row];
3438: idx = a->j + ii[row];
3439: switch (sizes[i]) {
3440: case 5:
3441: v5 = a->a + ii[row-4];
3442: case 4: /* fall through */
3443: v4 = a->a + ii[row-3];
3444: case 3:
3445: v3 = a->a + ii[row-2];
3446: case 2:
3447: v2 = a->a + ii[row-1];
3448: case 1:
3449: v1 = a->a + ii[row];
3450: break;
3451: default:
3452: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3453: }
3454: } else {
3455: /* upper, no diag */
3456: sz = ii[row+1] - diag[row] - 1;
3457: idx = a->j + diag[row] + 1;
3458: switch (sizes[i]) {
3459: case 5:
3460: v5 = a->a + diag[row-4] + 5;
3461: case 4: /* fall through */
3462: v4 = a->a + diag[row-3] + 4;
3463: case 3:
3464: v3 = a->a + diag[row-2] + 3;
3465: case 2:
3466: v2 = a->a + diag[row-1] + 2;
3467: case 1:
3468: v1 = a->a + diag[row] + 1;
3469: }
3470: }
3471: /* set sum */
3472: switch (sizes[i]) {
3473: case 5:
3474: sum5 = xb[row-4];
3475: case 4: /* fall through */
3476: sum4 = xb[row-3];
3477: case 3:
3478: sum3 = xb[row-2];
3479: case 2:
3480: sum2 = xb[row-1];
3481: case 1:
3482: /* note that sum1 is associated with the last row */
3483: sum1 = xb[row];
3484: }
3485: /* do sums */
3486: for (n = 0; n<sz-1; n+=2) {
3487: i1 = idx[0];
3488: i2 = idx[1];
3489: idx += 2;
3490: tmp0 = x[i1];
3491: tmp1 = x[i2];
3492: switch (sizes[i]) {
3493: case 5:
3494: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3495: case 4: /* fall through */
3496: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3497: case 3:
3498: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3499: case 2:
3500: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3501: case 1:
3502: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3503: }
3504: }
3505: /* ragged edge */
3506: if (n == sz-1) {
3507: tmp0 = x[*idx];
3508: switch (sizes[i]) {
3509: case 5:
3510: sum5 -= *v5*tmp0;
3511: case 4: /* fall through */
3512: sum4 -= *v4*tmp0;
3513: case 3:
3514: sum3 -= *v3*tmp0;
3515: case 2:
3516: sum2 -= *v2*tmp0;
3517: case 1:
3518: sum1 -= *v1*tmp0;
3519: }
3520: }
3521: /* update */
3522: if (xb == b) {
3523: /* whole (old way) w/ diag */
3524: switch (sizes[i]) {
3525: case 5:
3526: x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3527: x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3528: x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3529: x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3530: x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3531: break;
3532: case 4:
3533: x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3534: x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3535: x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3536: x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3537: break;
3538: case 3:
3539: x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3540: x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3541: x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3542: break;
3543: case 2:
3544: x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3545: x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3546: break;
3547: case 1:
3548: x[row--] += sum1*(*ibdiag);
3549: break;
3550: }
3551: } else {
3552: /* no diag so set = */
3553: switch (sizes[i]) {
3554: case 5:
3555: x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3556: x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3557: x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3558: x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3559: x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3560: break;
3561: case 4:
3562: x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3563: x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3564: x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3565: x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3566: break;
3567: case 3:
3568: x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3569: x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3570: x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3571: break;
3572: case 2:
3573: x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3574: x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3575: break;
3576: case 1:
3577: x[row--] = sum1*(*ibdiag);
3578: break;
3579: }
3580: }
3581: }
3582: if (xb == b) {
3583: PetscLogFlops(2.0*a->nz);
3584: } else {
3585: PetscLogFlops(a->nz); /* assumes 1/2 in upper, undercounts diag inverse */
3586: }
3587: }
3588: }
3589: if (flag & SOR_EISENSTAT) {
3590: /*
3591: Apply (U + D)^-1 where D is now the block diagonal
3592: */
3593: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3594: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3595: ibdiag -= sizes[i]*sizes[i];
3596: sz = ii[row+1] - diag[row] - 1;
3597: v1 = a->a + diag[row] + 1;
3598: idx = a->j + diag[row] + 1;
3599: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3600: switch (sizes[i]) {
3601: case 1:
3603: sum1 = b[row];
3604: for (n = 0; n<sz-1; n+=2) {
3605: i1 = idx[0];
3606: i2 = idx[1];
3607: idx += 2;
3608: tmp0 = x[i1];
3609: tmp1 = x[i2];
3610: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3611: }
3613: if (n == sz-1) {
3614: tmp0 = x[*idx];
3615: sum1 -= *v1*tmp0;
3616: }
3617: x[row] = sum1*(*ibdiag);row--;
3618: break;
3620: case 2:
3622: sum1 = b[row];
3623: sum2 = b[row-1];
3624: /* note that sum1 is associated with the second of the two rows */
3625: v2 = a->a + diag[row-1] + 2;
3626: for (n = 0; n<sz-1; n+=2) {
3627: i1 = idx[0];
3628: i2 = idx[1];
3629: idx += 2;
3630: tmp0 = x[i1];
3631: tmp1 = x[i2];
3632: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3633: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3634: }
3636: if (n == sz-1) {
3637: tmp0 = x[*idx];
3638: sum1 -= *v1*tmp0;
3639: sum2 -= *v2*tmp0;
3640: }
3641: x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
3642: x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3643: row -= 2;
3644: break;
3645: case 3:
3647: sum1 = b[row];
3648: sum2 = b[row-1];
3649: sum3 = b[row-2];
3650: v2 = a->a + diag[row-1] + 2;
3651: v3 = a->a + diag[row-2] + 3;
3652: for (n = 0; n<sz-1; n+=2) {
3653: i1 = idx[0];
3654: i2 = idx[1];
3655: idx += 2;
3656: tmp0 = x[i1];
3657: tmp1 = x[i2];
3658: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3659: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3660: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3661: }
3663: if (n == sz-1) {
3664: tmp0 = x[*idx];
3665: sum1 -= *v1*tmp0;
3666: sum2 -= *v2*tmp0;
3667: sum3 -= *v3*tmp0;
3668: }
3669: x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3670: x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3671: x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3672: row -= 3;
3673: break;
3674: case 4:
3676: sum1 = b[row];
3677: sum2 = b[row-1];
3678: sum3 = b[row-2];
3679: sum4 = b[row-3];
3680: v2 = a->a + diag[row-1] + 2;
3681: v3 = a->a + diag[row-2] + 3;
3682: v4 = a->a + diag[row-3] + 4;
3683: for (n = 0; n<sz-1; n+=2) {
3684: i1 = idx[0];
3685: i2 = idx[1];
3686: idx += 2;
3687: tmp0 = x[i1];
3688: tmp1 = x[i2];
3689: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3690: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3691: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3692: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3693: }
3695: if (n == sz-1) {
3696: tmp0 = x[*idx];
3697: sum1 -= *v1*tmp0;
3698: sum2 -= *v2*tmp0;
3699: sum3 -= *v3*tmp0;
3700: sum4 -= *v4*tmp0;
3701: }
3702: x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3703: x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3704: x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3705: x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3706: row -= 4;
3707: break;
3708: case 5:
3710: sum1 = b[row];
3711: sum2 = b[row-1];
3712: sum3 = b[row-2];
3713: sum4 = b[row-3];
3714: sum5 = b[row-4];
3715: v2 = a->a + diag[row-1] + 2;
3716: v3 = a->a + diag[row-2] + 3;
3717: v4 = a->a + diag[row-3] + 4;
3718: v5 = a->a + diag[row-4] + 5;
3719: for (n = 0; n<sz-1; n+=2) {
3720: i1 = idx[0];
3721: i2 = idx[1];
3722: idx += 2;
3723: tmp0 = x[i1];
3724: tmp1 = x[i2];
3725: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3726: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3727: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3728: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3729: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3730: }
3732: if (n == sz-1) {
3733: tmp0 = x[*idx];
3734: sum1 -= *v1*tmp0;
3735: sum2 -= *v2*tmp0;
3736: sum3 -= *v3*tmp0;
3737: sum4 -= *v4*tmp0;
3738: sum5 -= *v5*tmp0;
3739: }
3740: x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3741: x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3742: x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3743: x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3744: x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3745: row -= 5;
3746: break;
3747: default:
3748: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3749: }
3750: }
3751: PetscLogFlops(a->nz);
3753: /*
3754: t = b - D x where D is the block diagonal
3755: */
3756: cnt = 0;
3757: for (i=0, row=0; i<m; i++) {
3758: switch (sizes[i]) {
3759: case 1:
3760: t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3761: break;
3762: case 2:
3763: x1 = x[row]; x2 = x[row+1];
3764: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3765: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3766: t[row] = b[row] - tmp1;
3767: t[row+1] = b[row+1] - tmp2; row += 2;
3768: cnt += 4;
3769: break;
3770: case 3:
3771: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2];
3772: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3773: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3774: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3775: t[row] = b[row] - tmp1;
3776: t[row+1] = b[row+1] - tmp2;
3777: t[row+2] = b[row+2] - tmp3; row += 3;
3778: cnt += 9;
3779: break;
3780: case 4:
3781: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3782: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3783: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3784: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3785: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3786: t[row] = b[row] - tmp1;
3787: t[row+1] = b[row+1] - tmp2;
3788: t[row+2] = b[row+2] - tmp3;
3789: t[row+3] = b[row+3] - tmp4; row += 4;
3790: cnt += 16;
3791: break;
3792: case 5:
3793: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3794: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3795: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3796: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3797: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3798: tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3799: t[row] = b[row] - tmp1;
3800: t[row+1] = b[row+1] - tmp2;
3801: t[row+2] = b[row+2] - tmp3;
3802: t[row+3] = b[row+3] - tmp4;
3803: t[row+4] = b[row+4] - tmp5;row += 5;
3804: cnt += 25;
3805: break;
3806: default:
3807: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3808: }
3809: }
3810: PetscLogFlops(m);
3814: /*
3815: Apply (L + D)^-1 where D is the block diagonal
3816: */
3817: for (i=0, row=0; i<m; i++) {
3818: sz = diag[row] - ii[row];
3819: v1 = a->a + ii[row];
3820: idx = a->j + ii[row];
3821: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3822: switch (sizes[i]) {
3823: case 1:
3825: sum1 = t[row];
3826: for (n = 0; n<sz-1; n+=2) {
3827: i1 = idx[0];
3828: i2 = idx[1];
3829: idx += 2;
3830: tmp0 = t[i1];
3831: tmp1 = t[i2];
3832: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3833: }
3835: if (n == sz-1) {
3836: tmp0 = t[*idx];
3837: sum1 -= *v1 * tmp0;
3838: }
3839: x[row] += t[row] = sum1*(*ibdiag++); row++;
3840: break;
3841: case 2:
3842: v2 = a->a + ii[row+1];
3843: sum1 = t[row];
3844: sum2 = t[row+1];
3845: for (n = 0; n<sz-1; n+=2) {
3846: i1 = idx[0];
3847: i2 = idx[1];
3848: idx += 2;
3849: tmp0 = t[i1];
3850: tmp1 = t[i2];
3851: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3852: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3853: }
3855: if (n == sz-1) {
3856: tmp0 = t[*idx];
3857: sum1 -= v1[0] * tmp0;
3858: sum2 -= v2[0] * tmp0;
3859: }
3860: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3861: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3862: ibdiag += 4; row += 2;
3863: break;
3864: case 3:
3865: v2 = a->a + ii[row+1];
3866: v3 = a->a + ii[row+2];
3867: sum1 = t[row];
3868: sum2 = t[row+1];
3869: sum3 = t[row+2];
3870: for (n = 0; n<sz-1; n+=2) {
3871: i1 = idx[0];
3872: i2 = idx[1];
3873: idx += 2;
3874: tmp0 = t[i1];
3875: tmp1 = t[i2];
3876: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3877: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3878: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3879: }
3881: if (n == sz-1) {
3882: tmp0 = t[*idx];
3883: sum1 -= v1[0] * tmp0;
3884: sum2 -= v2[0] * tmp0;
3885: sum3 -= v3[0] * tmp0;
3886: }
3887: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3888: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3889: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3890: ibdiag += 9; row += 3;
3891: break;
3892: case 4:
3893: v2 = a->a + ii[row+1];
3894: v3 = a->a + ii[row+2];
3895: v4 = a->a + ii[row+3];
3896: sum1 = t[row];
3897: sum2 = t[row+1];
3898: sum3 = t[row+2];
3899: sum4 = t[row+3];
3900: for (n = 0; n<sz-1; n+=2) {
3901: i1 = idx[0];
3902: i2 = idx[1];
3903: idx += 2;
3904: tmp0 = t[i1];
3905: tmp1 = t[i2];
3906: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3907: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3908: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3909: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3910: }
3912: if (n == sz-1) {
3913: tmp0 = t[*idx];
3914: sum1 -= v1[0] * tmp0;
3915: sum2 -= v2[0] * tmp0;
3916: sum3 -= v3[0] * tmp0;
3917: sum4 -= v4[0] * tmp0;
3918: }
3919: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3920: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3921: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3922: x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3923: ibdiag += 16; row += 4;
3924: break;
3925: case 5:
3926: v2 = a->a + ii[row+1];
3927: v3 = a->a + ii[row+2];
3928: v4 = a->a + ii[row+3];
3929: v5 = a->a + ii[row+4];
3930: sum1 = t[row];
3931: sum2 = t[row+1];
3932: sum3 = t[row+2];
3933: sum4 = t[row+3];
3934: sum5 = t[row+4];
3935: for (n = 0; n<sz-1; n+=2) {
3936: i1 = idx[0];
3937: i2 = idx[1];
3938: idx += 2;
3939: tmp0 = t[i1];
3940: tmp1 = t[i2];
3941: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3942: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3943: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3944: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3945: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3946: }
3948: if (n == sz-1) {
3949: tmp0 = t[*idx];
3950: sum1 -= v1[0] * tmp0;
3951: sum2 -= v2[0] * tmp0;
3952: sum3 -= v3[0] * tmp0;
3953: sum4 -= v4[0] * tmp0;
3954: sum5 -= v5[0] * tmp0;
3955: }
3956: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3957: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3958: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3959: x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3960: x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3961: ibdiag += 25; row += 5;
3962: break;
3963: default:
3964: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3965: }
3966: }
3967: PetscLogFlops(a->nz);
3968: }
3969: VecRestoreArray(xx,&x);
3970: VecRestoreArrayRead(bb,&b);
3971: return(0);
3972: }
3974: PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3975: {
3976: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3977: PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3978: const MatScalar *bdiag = a->inode.bdiag;
3979: const PetscScalar *b;
3980: PetscErrorCode ierr;
3981: PetscInt m = a->inode.node_count,cnt = 0,i,row;
3982: const PetscInt *sizes = a->inode.size;
3985: VecGetArray(xx,&x);
3986: VecGetArrayRead(bb,&b);
3987: cnt = 0;
3988: for (i=0, row=0; i<m; i++) {
3989: switch (sizes[i]) {
3990: case 1:
3991: x[row] = b[row]*bdiag[cnt++];row++;
3992: break;
3993: case 2:
3994: x1 = b[row]; x2 = b[row+1];
3995: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3996: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3997: x[row++] = tmp1;
3998: x[row++] = tmp2;
3999: cnt += 4;
4000: break;
4001: case 3:
4002: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2];
4003: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
4004: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
4005: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
4006: x[row++] = tmp1;
4007: x[row++] = tmp2;
4008: x[row++] = tmp3;
4009: cnt += 9;
4010: break;
4011: case 4:
4012: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
4013: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
4014: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
4015: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
4016: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
4017: x[row++] = tmp1;
4018: x[row++] = tmp2;
4019: x[row++] = tmp3;
4020: x[row++] = tmp4;
4021: cnt += 16;
4022: break;
4023: case 5:
4024: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
4025: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
4026: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
4027: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
4028: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4029: tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4030: x[row++] = tmp1;
4031: x[row++] = tmp2;
4032: x[row++] = tmp3;
4033: x[row++] = tmp4;
4034: x[row++] = tmp5;
4035: cnt += 25;
4036: break;
4037: default:
4038: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
4039: }
4040: }
4041: PetscLogFlops(2*cnt);
4042: VecRestoreArray(xx,&x);
4043: VecRestoreArrayRead(bb,&b);
4044: return(0);
4045: }
4047: /*
4048: samestructure indicates that the matrix has not changed its nonzero structure so we
4049: do not need to recompute the inodes
4050: */
4051: PetscErrorCode MatSeqAIJCheckInode(Mat A)
4052: {
4053: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4055: PetscInt i,j,m,nzx,nzy,*ns,node_count,blk_size;
4056: PetscBool flag;
4057: const PetscInt *idx,*idy,*ii;
4060: if (!a->inode.use) return(0);
4061: if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) return(0);
4063: m = A->rmap->n;
4064: if (a->inode.size) ns = a->inode.size;
4065: else {
4066: PetscMalloc1(m+1,&ns);
4067: }
4069: i = 0;
4070: node_count = 0;
4071: idx = a->j;
4072: ii = a->i;
4073: while (i < m) { /* For each row */
4074: nzx = ii[i+1] - ii[i]; /* Number of nonzeros */
4075: /* Limits the number of elements in a node to 'a->inode.limit' */
4076: for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4077: nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */
4078: if (nzy != nzx) break;
4079: idy += nzx; /* Same nonzero pattern */
4080: PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);
4081: if (!flag) break;
4082: }
4083: ns[node_count++] = blk_size;
4084: idx += blk_size*nzx;
4085: i = j;
4086: }
4087: /* If not enough inodes found,, do not use inode version of the routines */
4088: if (!m || node_count > .8*m) {
4089: PetscFree(ns);
4091: a->inode.node_count = 0;
4092: a->inode.size = NULL;
4093: a->inode.use = PETSC_FALSE;
4094: A->ops->mult = MatMult_SeqAIJ;
4095: A->ops->sor = MatSOR_SeqAIJ;
4096: A->ops->multadd = MatMultAdd_SeqAIJ;
4097: A->ops->getrowij = MatGetRowIJ_SeqAIJ;
4098: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ;
4099: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ;
4100: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ;
4101: A->ops->coloringpatch = 0;
4102: A->ops->multdiagonalblock = 0;
4104: PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4105: } else {
4106: if (!A->factortype) {
4107: A->ops->mult = MatMult_SeqAIJ_Inode;
4108: A->ops->sor = MatSOR_SeqAIJ_Inode;
4109: A->ops->multadd = MatMultAdd_SeqAIJ_Inode;
4110: A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4111: if (A->rmap->n == A->cmap->n) {
4112: A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4113: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4114: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4115: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4116: A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4117: }
4118: } else {
4119: A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4120: }
4121: a->inode.node_count = node_count;
4122: a->inode.size = ns;
4123: PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4124: }
4125: a->inode.checked = PETSC_TRUE;
4126: a->inode.mat_nonzerostate = A->nonzerostate;
4127: return(0);
4128: }
4130: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4131: {
4132: Mat B =*C;
4133: Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4135: PetscInt m=A->rmap->n;
4138: c->inode.use = a->inode.use;
4139: c->inode.limit = a->inode.limit;
4140: c->inode.max_limit = a->inode.max_limit;
4141: if (a->inode.size) {
4142: PetscMalloc1(m+1,&c->inode.size);
4143: c->inode.node_count = a->inode.node_count;
4144: PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));
4145: /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4146: if (!B->factortype) {
4147: B->ops->mult = MatMult_SeqAIJ_Inode;
4148: B->ops->sor = MatSOR_SeqAIJ_Inode;
4149: B->ops->multadd = MatMultAdd_SeqAIJ_Inode;
4150: B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4151: B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4152: B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4153: B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4154: B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4155: B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4156: } else {
4157: B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4158: }
4159: } else {
4160: c->inode.size = 0;
4161: c->inode.node_count = 0;
4162: }
4163: c->inode.ibdiagvalid = PETSC_FALSE;
4164: c->inode.ibdiag = 0;
4165: c->inode.bdiag = 0;
4166: return(0);
4167: }
4169: PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4170: {
4171: PetscInt k;
4172: const PetscInt *vi;
4175: vi = aj + ai[row];
4176: for (k=0; k<nzl; k++) cols[k] = vi[k];
4177: vi = aj + adiag[row];
4178: cols[nzl] = vi[0];
4179: vi = aj + adiag[row+1]+1;
4180: for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4181: return(0);
4182: }
4183: /*
4184: MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4185: Modified from MatSeqAIJCheckInode().
4187: Input Parameters:
4188: . Mat A - ILU or LU matrix factor
4190: */
4191: PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4192: {
4193: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4195: PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4196: PetscInt *cols1,*cols2,*ns;
4197: const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4198: PetscBool flag;
4201: if (!a->inode.use) return(0);
4202: if (a->inode.checked) return(0);
4204: m = A->rmap->n;
4205: if (a->inode.size) ns = a->inode.size;
4206: else {
4207: PetscMalloc1(m+1,&ns);
4208: }
4210: i = 0;
4211: node_count = 0;
4212: PetscMalloc2(m,&cols1,m,&cols2);
4213: while (i < m) { /* For each row */
4214: nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */
4215: nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4216: nzx = nzl1 + nzu1 + 1;
4217: MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4219: /* Limits the number of elements in a node to 'a->inode.limit' */
4220: for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4221: nzl2 = ai[j+1] - ai[j];
4222: nzu2 = adiag[j] - adiag[j+1] - 1;
4223: nzy = nzl2 + nzu2 + 1;
4224: if (nzy != nzx) break;
4225: MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
4226: PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);
4227: if (!flag) break;
4228: }
4229: ns[node_count++] = blk_size;
4230: i = j;
4231: }
4232: PetscFree2(cols1,cols2);
4233: /* If not enough inodes found,, do not use inode version of the routines */
4234: if (!m || node_count > .8*m) {
4235: PetscFree(ns);
4237: a->inode.node_count = 0;
4238: a->inode.size = NULL;
4239: a->inode.use = PETSC_FALSE;
4241: PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4242: } else {
4243: A->ops->mult = 0;
4244: A->ops->sor = 0;
4245: A->ops->multadd = 0;
4246: A->ops->getrowij = 0;
4247: A->ops->restorerowij = 0;
4248: A->ops->getcolumnij = 0;
4249: A->ops->restorecolumnij = 0;
4250: A->ops->coloringpatch = 0;
4251: A->ops->multdiagonalblock = 0;
4252: a->inode.node_count = node_count;
4253: a->inode.size = ns;
4255: PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4256: }
4257: a->inode.checked = PETSC_TRUE;
4258: return(0);
4259: }
4261: PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4262: {
4263: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4266: a->inode.ibdiagvalid = PETSC_FALSE;
4267: return(0);
4268: }
4270: /*
4271: This is really ugly. if inodes are used this replaces the
4272: permutations with ones that correspond to rows/cols of the matrix
4273: rather then inode blocks
4274: */
4275: PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4276: {
4280: PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));
4281: return(0);
4282: }
4284: PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4285: {
4286: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4288: PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4289: const PetscInt *ridx,*cidx;
4290: PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx;
4291: PetscInt nslim_col,*ns_col;
4292: IS ris = *rperm,cis = *cperm;
4295: if (!a->inode.size) return(0); /* no inodes so return */
4296: if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */
4298: MatCreateColInode_Private(A,&nslim_col,&ns_col);
4299: PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);
4300: PetscMalloc2(m,&permr,n,&permc);
4302: ISGetIndices(ris,&ridx);
4303: ISGetIndices(cis,&cidx);
4305: /* Form the inode structure for the rows of permuted matric using inv perm*/
4306: for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4308: /* Construct the permutations for rows*/
4309: for (i=0,row = 0; i<nslim_row; ++i) {
4310: indx = ridx[i];
4311: start_val = tns[indx];
4312: end_val = tns[indx + 1];
4313: for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4314: }
4316: /* Form the inode structure for the columns of permuted matrix using inv perm*/
4317: for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4319: /* Construct permutations for columns */
4320: for (i=0,col=0; i<nslim_col; ++i) {
4321: indx = cidx[i];
4322: start_val = tns[indx];
4323: end_val = tns[indx + 1];
4324: for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4325: }
4327: ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);
4328: ISSetPermutation(*rperm);
4329: ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);
4330: ISSetPermutation(*cperm);
4332: ISRestoreIndices(ris,&ridx);
4333: ISRestoreIndices(cis,&cidx);
4335: PetscFree(ns_col);
4336: PetscFree2(permr,permc);
4337: ISDestroy(&cis);
4338: ISDestroy(&ris);
4339: PetscFree(tns);
4340: return(0);
4341: }
4343: /*@C
4344: MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4346: Not Collective
4348: Input Parameter:
4349: . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4351: Output Parameter:
4352: + node_count - no of inodes present in the matrix.
4353: . sizes - an array of size node_count,with sizes of each inode.
4354: - limit - the max size used to generate the inodes.
4356: Level: advanced
4358: Notes:
4359: This routine returns some internal storage information
4360: of the matrix, it is intended to be used by advanced users.
4361: It should be called after the matrix is assembled.
4362: The contents of the sizes[] array should not be changed.
4363: NULL may be passed for information not requested.
4365: .keywords: matrix, seqaij, get, inode
4367: .seealso: MatGetInfo()
4368: @*/
4369: PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4370: {
4371: PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4374: if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4375: PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);
4376: if (f) {
4377: (*f)(A,node_count,sizes,limit);
4378: }
4379: return(0);
4380: }
4382: PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4383: {
4384: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4387: if (node_count) *node_count = a->inode.node_count;
4388: if (sizes) *sizes = a->inode.size;
4389: if (limit) *limit = a->inode.limit;
4390: return(0);
4391: }