Actual source code: inode.c
petsc-3.3-p7 2013-05-11
2: /*
3: This file provides high performance routines for the Inode format (compressed sparse row)
4: by taking advantage of rows with identical nonzero structure (I-nodes).
5: */
6: #include <../src/mat/impls/aij/seq/aij.h>
10: static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
11: {
12: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
14: PetscInt i,count,m,n,min_mn,*ns_row,*ns_col;
17: n = A->cmap->n;
18: m = A->rmap->n;
19: ns_row = a->inode.size;
20:
21: min_mn = (m < n) ? m : n;
22: if (!ns) {
23: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
24: for(; count+1 < n; count++,i++);
25: if (count < n) {
26: i++;
27: }
28: *size = i;
29: return(0);
30: }
31: PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);
32:
33: /* Use the same row structure wherever feasible. */
34: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
35: ns_col[i] = ns_row[i];
36: }
38: /* if m < n; pad up the remainder with inode_limit */
39: for(; count+1 < n; count++,i++) {
40: ns_col[i] = 1;
41: }
42: /* The last node is the odd ball. padd it up with the remaining rows; */
43: if (count < n) {
44: ns_col[i] = n - count;
45: i++;
46: } else if (count > n) {
47: /* Adjust for the over estimation */
48: ns_col[i-1] += n - count;
49: }
50: *size = i;
51: *ns = ns_col;
52: return(0);
53: }
56: /*
57: This builds symmetric version of nonzero structure,
58: */
61: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
62: {
63: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
65: PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
66: PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
69: nslim_row = a->inode.node_count;
70: m = A->rmap->n;
71: n = A->cmap->n;
72: if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
73:
74: /* Use the row_inode as column_inode */
75: nslim_col = nslim_row;
76: ns_col = ns_row;
78: /* allocate space for reformated inode structure */
79: PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);
80: PetscMalloc((n+1)*sizeof(PetscInt),&tvc);
81: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83: for (i1=0,col=0; i1<nslim_col; ++i1){
84: nsz = ns_col[i1];
85: for (i2=0; i2<nsz; ++i2,++col)
86: tvc[col] = i1;
87: }
88: /* allocate space for row pointers */
89: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
90: *iia = ia;
91: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
92: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);
94: /* determine the number of columns in each row */
95: ia[0] = oshift;
96: for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
98: j = aj + ai[row] + ishift;
99: jmax = aj + ai[row+1] + ishift;
100: i2 = 0;
101: col = *j++ + ishift;
102: i2 = tvc[col];
103: while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
104: ia[i1+1]++;
105: ia[i2+1]++;
106: i2++; /* Start col of next node */
107: while((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
108: i2 = tvc[col];
109: }
110: if(i2 == i1) ia[i2+1]++; /* now the diagonal element */
111: }
113: /* shift ia[i] to point to next row */
114: for (i1=1; i1<nslim_row+1; i1++) {
115: row = ia[i1-1];
116: ia[i1] += row;
117: work[i1-1] = row - oshift;
118: }
120: /* allocate space for column pointers */
121: nz = ia[nslim_row] + (!ishift);
122: PetscMalloc(nz*sizeof(PetscInt),&ja);
123: *jja = ja;
125: /* loop over lower triangular part putting into ja */
126: for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
127: j = aj + ai[row] + ishift;
128: jmax = aj + ai[row+1] + ishift;
129: i2 = 0; /* Col inode index */
130: col = *j++ + ishift;
131: i2 = tvc[col];
132: while (i2<i1 && j<jmax) {
133: ja[work[i2]++] = i1 + oshift;
134: ja[work[i1]++] = i2 + oshift;
135: ++i2;
136: while((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
137: i2 = tvc[col];
138: }
139: if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141: }
142: PetscFree(work);
143: PetscFree(tns);
144: PetscFree(tvc);
145: return(0);
146: }
148: /*
149: This builds nonsymmetric version of nonzero structure,
150: */
153: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
154: {
155: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
157: PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
158: PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
161: nslim_row = a->inode.node_count;
162: n = A->cmap->n;
164: /* Create The column_inode for this matrix */
165: Mat_CreateColInode(A,&nslim_col,&ns_col);
166:
167: /* allocate space for reformated column_inode structure */
168: PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);
169: PetscMalloc((n +1)*sizeof(PetscInt),&tvc);
170: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172: for (i1=0,col=0; i1<nslim_col; ++i1){
173: nsz = ns_col[i1];
174: for (i2=0; i2<nsz; ++i2,++col)
175: tvc[col] = i1;
176: }
177: /* allocate space for row pointers */
178: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
179: *iia = ia;
180: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
181: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);
183: /* determine the number of columns in each row */
184: ia[0] = oshift;
185: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
186: j = aj + ai[row] + ishift;
187: col = *j++ + ishift;
188: i2 = tvc[col];
189: nz = ai[row+1] - ai[row];
190: while (nz-- > 0) { /* off-diagonal elemets */
191: ia[i1+1]++;
192: i2++; /* Start col of next node */
193: while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
194: if (nz > 0) i2 = tvc[col];
195: }
196: }
198: /* shift ia[i] to point to next row */
199: for (i1=1; i1<nslim_row+1; i1++) {
200: row = ia[i1-1];
201: ia[i1] += row;
202: work[i1-1] = row - oshift;
203: }
205: /* allocate space for column pointers */
206: nz = ia[nslim_row] + (!ishift);
207: PetscMalloc(nz*sizeof(PetscInt),&ja);
208: *jja = ja;
210: /* loop over matrix putting into ja */
211: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
212: j = aj + ai[row] + ishift;
213: i2 = 0; /* Col inode index */
214: col = *j++ + ishift;
215: i2 = tvc[col];
216: nz = ai[row+1] - ai[row];
217: while (nz-- > 0) {
218: ja[work[i1]++] = i2 + oshift;
219: ++i2;
220: while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
221: if (nz > 0) i2 = tvc[col];
222: }
223: }
224: PetscFree(ns_col);
225: PetscFree(work);
226: PetscFree(tns);
227: PetscFree(tvc);
228: return(0);
229: }
233: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscBool *done)
234: {
235: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
239: *n = a->inode.node_count;
240: if (!ia) return(0);
241: if (!blockcompressed) {
242: MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
243: } else if (symmetric) {
244: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
245: } else {
246: MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
247: }
248: return(0);
249: }
253: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscBool *done)
254: {
258: if (!ia) return(0);
260: if (!blockcompressed) {
261: MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
262: } else {
263: PetscFree(*ia);
264: PetscFree(*ja);
265: }
267: return(0);
268: }
270: /* ----------------------------------------------------------- */
274: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
275: {
276: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
278: PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
279: PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
282: nslim_row = a->inode.node_count;
283: n = A->cmap->n;
285: /* Create The column_inode for this matrix */
286: Mat_CreateColInode(A,&nslim_col,&ns_col);
287:
288: /* allocate space for reformated column_inode structure */
289: PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);
290: PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);
291: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
293: for (i1=0,col=0; i1<nslim_col; ++i1){
294: nsz = ns_col[i1];
295: for (i2=0; i2<nsz; ++i2,++col)
296: tvc[col] = i1;
297: }
298: /* allocate space for column pointers */
299: PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);
300: *iia = ia;
301: PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));
302: PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);
304: /* determine the number of columns in each row */
305: ia[0] = oshift;
306: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
307: j = aj + ai[row] + ishift;
308: col = *j++ + ishift;
309: i2 = tvc[col];
310: nz = ai[row+1] - ai[row];
311: while (nz-- > 0) { /* off-diagonal elemets */
312: /* ia[i1+1]++; */
313: ia[i2+1]++;
314: i2++;
315: while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
316: if (nz > 0) i2 = tvc[col];
317: }
318: }
320: /* shift ia[i] to point to next col */
321: for (i1=1; i1<nslim_col+1; i1++) {
322: col = ia[i1-1];
323: ia[i1] += col;
324: work[i1-1] = col - oshift;
325: }
327: /* allocate space for column pointers */
328: nz = ia[nslim_col] + (!ishift);
329: PetscMalloc(nz*sizeof(PetscInt),&ja);
330: *jja = ja;
332: /* loop over matrix putting into ja */
333: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
334: j = aj + ai[row] + ishift;
335: i2 = 0; /* Col inode index */
336: col = *j++ + ishift;
337: i2 = tvc[col];
338: nz = ai[row+1] - ai[row];
339: while (nz-- > 0) {
340: /* ja[work[i1]++] = i2 + oshift; */
341: ja[work[i2]++] = i1 + oshift;
342: i2++;
343: while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
344: if (nz > 0) i2 = tvc[col];
345: }
346: }
347: PetscFree(ns_col);
348: PetscFree(work);
349: PetscFree(tns);
350: PetscFree(tvc);
351: return(0);
352: }
356: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscBool *done)
357: {
361: Mat_CreateColInode(A,n,PETSC_NULL);
362: if (!ia) return(0);
364: if (!blockcompressed) {
365: MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
366: } else if (symmetric) {
367: /* Since the indices are symmetric it does'nt matter */
368: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
369: } else {
370: MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
371: }
372: return(0);
373: }
377: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscBool *done)
378: {
382: if (!ia) return(0);
383: if (!blockcompressed) {
384: MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
385: } else {
386: PetscFree(*ia);
387: PetscFree(*ja);
388: }
389: return(0);
390: }
392: /* ----------------------------------------------------------- */
396: static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
397: {
398: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
399: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
400: PetscScalar *y;
401: const PetscScalar *x;
402: const MatScalar *v1,*v2,*v3,*v4,*v5;
403: PetscErrorCode ierr;
404: PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
405:
406: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
407: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
408: #endif
411: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
412: node_max = a->inode.node_count;
413: ns = a->inode.size; /* Node Size array */
414: VecGetArrayRead(xx,&x);
415: VecGetArray(yy,&y);
416: idx = a->j;
417: v1 = a->a;
418: ii = a->i;
420: for (i = 0,row = 0; i< node_max; ++i){
421: nsz = ns[i];
422: n = ii[1] - ii[0];
423: nonzerorow += (n>0)*nsz;
424: ii += nsz;
425: PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */
426: PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */
427: sz = n; /* No of non zeros in this row */
428: /* Switch on the size of Node */
429: switch (nsz){ /* Each loop in 'case' is unrolled */
430: case 1 :
431: sum1 = 0.;
432:
433: for(n = 0; n< sz-1; n+=2) {
434: i1 = idx[0]; /* The instructions are ordered to */
435: i2 = idx[1]; /* make the compiler's job easy */
436: idx += 2;
437: tmp0 = x[i1];
438: tmp1 = x[i2];
439: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
440: }
441:
442: if (n == sz-1){ /* Take care of the last nonzero */
443: tmp0 = x[*idx++];
444: sum1 += *v1++ * tmp0;
445: }
446: y[row++]=sum1;
447: break;
448: case 2:
449: sum1 = 0.;
450: sum2 = 0.;
451: v2 = v1 + n;
452:
453: for (n = 0; n< sz-1; n+=2) {
454: i1 = idx[0];
455: i2 = idx[1];
456: idx += 2;
457: tmp0 = x[i1];
458: tmp1 = x[i2];
459: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
460: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
461: }
462: if (n == sz-1){
463: tmp0 = x[*idx++];
464: sum1 += *v1++ * tmp0;
465: sum2 += *v2++ * tmp0;
466: }
467: y[row++]=sum1;
468: y[row++]=sum2;
469: v1 =v2; /* Since the next block to be processed starts there*/
470: idx +=sz;
471: break;
472: case 3:
473: sum1 = 0.;
474: sum2 = 0.;
475: sum3 = 0.;
476: v2 = v1 + n;
477: v3 = v2 + n;
478:
479: for (n = 0; n< sz-1; n+=2) {
480: i1 = idx[0];
481: i2 = idx[1];
482: idx += 2;
483: tmp0 = x[i1];
484: tmp1 = x[i2];
485: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
486: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
487: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
488: }
489: if (n == sz-1){
490: tmp0 = x[*idx++];
491: sum1 += *v1++ * tmp0;
492: sum2 += *v2++ * tmp0;
493: sum3 += *v3++ * tmp0;
494: }
495: y[row++]=sum1;
496: y[row++]=sum2;
497: y[row++]=sum3;
498: v1 =v3; /* Since the next block to be processed starts there*/
499: idx +=2*sz;
500: break;
501: case 4:
502: sum1 = 0.;
503: sum2 = 0.;
504: sum3 = 0.;
505: sum4 = 0.;
506: v2 = v1 + n;
507: v3 = v2 + n;
508: v4 = v3 + n;
509:
510: for (n = 0; n< sz-1; n+=2) {
511: i1 = idx[0];
512: i2 = idx[1];
513: idx += 2;
514: tmp0 = x[i1];
515: tmp1 = x[i2];
516: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
517: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
518: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
519: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
520: }
521: if (n == sz-1){
522: tmp0 = x[*idx++];
523: sum1 += *v1++ * tmp0;
524: sum2 += *v2++ * tmp0;
525: sum3 += *v3++ * tmp0;
526: sum4 += *v4++ * tmp0;
527: }
528: y[row++]=sum1;
529: y[row++]=sum2;
530: y[row++]=sum3;
531: y[row++]=sum4;
532: v1 =v4; /* Since the next block to be processed starts there*/
533: idx +=3*sz;
534: break;
535: case 5:
536: sum1 = 0.;
537: sum2 = 0.;
538: sum3 = 0.;
539: sum4 = 0.;
540: sum5 = 0.;
541: v2 = v1 + n;
542: v3 = v2 + n;
543: v4 = v3 + n;
544: v5 = v4 + n;
545:
546: for (n = 0; n<sz-1; n+=2) {
547: i1 = idx[0];
548: i2 = idx[1];
549: idx += 2;
550: tmp0 = x[i1];
551: tmp1 = x[i2];
552: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
553: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
554: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
555: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
556: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
557: }
558: if (n == sz-1){
559: tmp0 = x[*idx++];
560: sum1 += *v1++ * tmp0;
561: sum2 += *v2++ * tmp0;
562: sum3 += *v3++ * tmp0;
563: sum4 += *v4++ * tmp0;
564: sum5 += *v5++ * tmp0;
565: }
566: y[row++]=sum1;
567: y[row++]=sum2;
568: y[row++]=sum3;
569: y[row++]=sum4;
570: y[row++]=sum5;
571: v1 =v5; /* Since the next block to be processed starts there */
572: idx +=4*sz;
573: break;
574: default :
575: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
576: }
577: }
578: VecRestoreArrayRead(xx,&x);
579: VecRestoreArray(yy,&y);
580: PetscLogFlops(2.0*a->nz - nonzerorow);
581: return(0);
582: }
583: /* ----------------------------------------------------------- */
584: /* Almost same code as the MatMult_SeqAIJ_Inode() */
587: static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
588: {
589: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
590: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
591: MatScalar *v1,*v2,*v3,*v4,*v5;
592: PetscScalar *x,*y,*z,*zt;
594: PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
595:
597: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
598: node_max = a->inode.node_count;
599: ns = a->inode.size; /* Node Size array */
600: VecGetArray(xx,&x);
601: VecGetArray(yy,&y);
602: if (zz != yy) {
603: VecGetArray(zz,&z);
604: } else {
605: z = y;
606: }
607: zt = z;
609: idx = a->j;
610: v1 = a->a;
611: ii = a->i;
613: for (i = 0,row = 0; i< node_max; ++i){
614: nsz = ns[i];
615: n = ii[1] - ii[0];
616: ii += nsz;
617: sz = n; /* No of non zeros in this row */
618: /* Switch on the size of Node */
619: switch (nsz){ /* Each loop in 'case' is unrolled */
620: case 1 :
621: sum1 = *zt++;
622:
623: for(n = 0; n< sz-1; n+=2) {
624: i1 = idx[0]; /* The instructions are ordered to */
625: i2 = idx[1]; /* make the compiler's job easy */
626: idx += 2;
627: tmp0 = x[i1];
628: tmp1 = x[i2];
629: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
630: }
631:
632: if(n == sz-1){ /* Take care of the last nonzero */
633: tmp0 = x[*idx++];
634: sum1 += *v1++ * tmp0;
635: }
636: y[row++]=sum1;
637: break;
638: case 2:
639: sum1 = *zt++;
640: sum2 = *zt++;
641: v2 = v1 + n;
642:
643: for(n = 0; n< sz-1; n+=2) {
644: i1 = idx[0];
645: i2 = idx[1];
646: idx += 2;
647: tmp0 = x[i1];
648: tmp1 = x[i2];
649: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
650: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
651: }
652: if(n == sz-1){
653: tmp0 = x[*idx++];
654: sum1 += *v1++ * tmp0;
655: sum2 += *v2++ * tmp0;
656: }
657: y[row++]=sum1;
658: y[row++]=sum2;
659: v1 =v2; /* Since the next block to be processed starts there*/
660: idx +=sz;
661: break;
662: case 3:
663: sum1 = *zt++;
664: sum2 = *zt++;
665: sum3 = *zt++;
666: v2 = v1 + n;
667: v3 = v2 + n;
668:
669: for (n = 0; n< sz-1; n+=2) {
670: i1 = idx[0];
671: i2 = idx[1];
672: idx += 2;
673: tmp0 = x[i1];
674: tmp1 = x[i2];
675: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
676: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
677: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
678: }
679: if (n == sz-1){
680: tmp0 = x[*idx++];
681: sum1 += *v1++ * tmp0;
682: sum2 += *v2++ * tmp0;
683: sum3 += *v3++ * tmp0;
684: }
685: y[row++]=sum1;
686: y[row++]=sum2;
687: y[row++]=sum3;
688: v1 =v3; /* Since the next block to be processed starts there*/
689: idx +=2*sz;
690: break;
691: case 4:
692: sum1 = *zt++;
693: sum2 = *zt++;
694: sum3 = *zt++;
695: sum4 = *zt++;
696: v2 = v1 + n;
697: v3 = v2 + n;
698: v4 = v3 + n;
699:
700: for (n = 0; n< sz-1; n+=2) {
701: i1 = idx[0];
702: i2 = idx[1];
703: idx += 2;
704: tmp0 = x[i1];
705: tmp1 = x[i2];
706: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
707: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
708: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
709: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
710: }
711: if (n == sz-1){
712: tmp0 = x[*idx++];
713: sum1 += *v1++ * tmp0;
714: sum2 += *v2++ * tmp0;
715: sum3 += *v3++ * tmp0;
716: sum4 += *v4++ * tmp0;
717: }
718: y[row++]=sum1;
719: y[row++]=sum2;
720: y[row++]=sum3;
721: y[row++]=sum4;
722: v1 =v4; /* Since the next block to be processed starts there*/
723: idx +=3*sz;
724: break;
725: case 5:
726: sum1 = *zt++;
727: sum2 = *zt++;
728: sum3 = *zt++;
729: sum4 = *zt++;
730: sum5 = *zt++;
731: v2 = v1 + n;
732: v3 = v2 + n;
733: v4 = v3 + n;
734: v5 = v4 + n;
735:
736: for (n = 0; n<sz-1; n+=2) {
737: i1 = idx[0];
738: i2 = idx[1];
739: idx += 2;
740: tmp0 = x[i1];
741: tmp1 = x[i2];
742: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
743: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
744: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
745: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
746: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
747: }
748: if(n == sz-1){
749: tmp0 = x[*idx++];
750: sum1 += *v1++ * tmp0;
751: sum2 += *v2++ * tmp0;
752: sum3 += *v3++ * tmp0;
753: sum4 += *v4++ * tmp0;
754: sum5 += *v5++ * tmp0;
755: }
756: y[row++]=sum1;
757: y[row++]=sum2;
758: y[row++]=sum3;
759: y[row++]=sum4;
760: y[row++]=sum5;
761: v1 =v5; /* Since the next block to be processed starts there */
762: idx +=4*sz;
763: break;
764: default :
765: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
766: }
767: }
768: VecRestoreArray(xx,&x);
769: VecRestoreArray(yy,&y);
770: if (zz != yy) {
771: VecRestoreArray(zz,&z);
772: }
773: PetscLogFlops(2.0*a->nz);
774: return(0);
775: }
777: /* ----------------------------------------------------------- */
780: PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
781: {
782: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
783: IS iscol = a->col,isrow = a->row;
784: PetscErrorCode ierr;
785: const PetscInt *r,*c,*rout,*cout;
786: PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
787: PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
788: PetscScalar *x,*tmp,*tmps,tmp0,tmp1;
789: PetscScalar sum1,sum2,sum3,sum4,sum5;
790: const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
791: const PetscScalar *b;
794: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
795: node_max = a->inode.node_count;
796: ns = a->inode.size; /* Node Size array */
798: VecGetArrayRead(bb,&b);
799: VecGetArray(xx,&x);
800: tmp = a->solve_work;
801:
802: ISGetIndices(isrow,&rout); r = rout;
803: ISGetIndices(iscol,&cout); c = cout + (n-1);
804:
805: /* forward solve the lower triangular */
806: tmps = tmp ;
807: aa = a_a ;
808: aj = a_j ;
809: ad = a->diag;
811: for (i = 0,row = 0; i< node_max; ++i){
812: nsz = ns[i];
813: aii = ai[row];
814: v1 = aa + aii;
815: vi = aj + aii;
816: nz = ad[row]- aii;
817: if (i < node_max-1) {
818: /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
819: * but our indexing to determine it's size could. */
820: PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
821: /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
822: PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
823: /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
824: }
826: switch (nsz){ /* Each loop in 'case' is unrolled */
827: case 1 :
828: sum1 = b[*r++];
829: for(j=0; j<nz-1; j+=2){
830: i0 = vi[0];
831: i1 = vi[1];
832: vi +=2;
833: tmp0 = tmps[i0];
834: tmp1 = tmps[i1];
835: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
836: }
837: if(j == nz-1){
838: tmp0 = tmps[*vi++];
839: sum1 -= *v1++ *tmp0;
840: }
841: tmp[row ++]=sum1;
842: break;
843: case 2:
844: sum1 = b[*r++];
845: sum2 = b[*r++];
846: v2 = aa + ai[row+1];
848: for(j=0; j<nz-1; j+=2){
849: i0 = vi[0];
850: i1 = vi[1];
851: vi +=2;
852: tmp0 = tmps[i0];
853: tmp1 = tmps[i1];
854: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
855: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
856: }
857: if(j == nz-1){
858: tmp0 = tmps[*vi++];
859: sum1 -= *v1++ *tmp0;
860: sum2 -= *v2++ *tmp0;
861: }
862: sum2 -= *v2++ * sum1;
863: tmp[row ++]=sum1;
864: tmp[row ++]=sum2;
865: break;
866: case 3:
867: sum1 = b[*r++];
868: sum2 = b[*r++];
869: sum3 = b[*r++];
870: v2 = aa + ai[row+1];
871: v3 = aa + ai[row+2];
872:
873: for (j=0; j<nz-1; j+=2){
874: i0 = vi[0];
875: i1 = vi[1];
876: vi +=2;
877: tmp0 = tmps[i0];
878: tmp1 = tmps[i1];
879: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
880: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
881: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
882: }
883: if (j == nz-1){
884: tmp0 = tmps[*vi++];
885: sum1 -= *v1++ *tmp0;
886: sum2 -= *v2++ *tmp0;
887: sum3 -= *v3++ *tmp0;
888: }
889: sum2 -= *v2++ * sum1;
890: sum3 -= *v3++ * sum1;
891: sum3 -= *v3++ * sum2;
892: tmp[row ++]=sum1;
893: tmp[row ++]=sum2;
894: tmp[row ++]=sum3;
895: break;
896:
897: case 4:
898: sum1 = b[*r++];
899: sum2 = b[*r++];
900: sum3 = b[*r++];
901: sum4 = b[*r++];
902: v2 = aa + ai[row+1];
903: v3 = aa + ai[row+2];
904: v4 = aa + ai[row+3];
905:
906: for (j=0; j<nz-1; j+=2){
907: i0 = vi[0];
908: i1 = vi[1];
909: vi +=2;
910: tmp0 = tmps[i0];
911: tmp1 = tmps[i1];
912: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
913: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
914: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
915: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
916: }
917: if (j == nz-1){
918: tmp0 = tmps[*vi++];
919: sum1 -= *v1++ *tmp0;
920: sum2 -= *v2++ *tmp0;
921: sum3 -= *v3++ *tmp0;
922: sum4 -= *v4++ *tmp0;
923: }
924: sum2 -= *v2++ * sum1;
925: sum3 -= *v3++ * sum1;
926: sum4 -= *v4++ * sum1;
927: sum3 -= *v3++ * sum2;
928: sum4 -= *v4++ * sum2;
929: sum4 -= *v4++ * sum3;
930:
931: tmp[row ++]=sum1;
932: tmp[row ++]=sum2;
933: tmp[row ++]=sum3;
934: tmp[row ++]=sum4;
935: break;
936: case 5:
937: sum1 = b[*r++];
938: sum2 = b[*r++];
939: sum3 = b[*r++];
940: sum4 = b[*r++];
941: sum5 = b[*r++];
942: v2 = aa + ai[row+1];
943: v3 = aa + ai[row+2];
944: v4 = aa + ai[row+3];
945: v5 = aa + ai[row+4];
946:
947: for (j=0; j<nz-1; j+=2){
948: i0 = vi[0];
949: i1 = vi[1];
950: vi +=2;
951: tmp0 = tmps[i0];
952: tmp1 = tmps[i1];
953: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
954: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
955: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
956: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
957: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
958: }
959: if (j == nz-1){
960: tmp0 = tmps[*vi++];
961: sum1 -= *v1++ *tmp0;
962: sum2 -= *v2++ *tmp0;
963: sum3 -= *v3++ *tmp0;
964: sum4 -= *v4++ *tmp0;
965: sum5 -= *v5++ *tmp0;
966: }
968: sum2 -= *v2++ * sum1;
969: sum3 -= *v3++ * sum1;
970: sum4 -= *v4++ * sum1;
971: sum5 -= *v5++ * sum1;
972: sum3 -= *v3++ * sum2;
973: sum4 -= *v4++ * sum2;
974: sum5 -= *v5++ * sum2;
975: sum4 -= *v4++ * sum3;
976: sum5 -= *v5++ * sum3;
977: sum5 -= *v5++ * sum4;
978:
979: tmp[row ++]=sum1;
980: tmp[row ++]=sum2;
981: tmp[row ++]=sum3;
982: tmp[row ++]=sum4;
983: tmp[row ++]=sum5;
984: break;
985: default:
986: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
987: }
988: }
989: /* backward solve the upper triangular */
990: for (i=node_max -1 ,row = n-1 ; i>=0; i--){
991: nsz = ns[i];
992: aii = ai[row+1] -1;
993: v1 = aa + aii;
994: vi = aj + aii;
995: nz = aii- ad[row];
996: switch (nsz){ /* Each loop in 'case' is unrolled */
997: case 1 :
998: sum1 = tmp[row];
1000: for(j=nz ; j>1; j-=2){
1001: vi -=2;
1002: i0 = vi[2];
1003: i1 = vi[1];
1004: tmp0 = tmps[i0];
1005: tmp1 = tmps[i1];
1006: v1 -= 2;
1007: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1008: }
1009: if (j==1){
1010: tmp0 = tmps[*vi--];
1011: sum1 -= *v1-- * tmp0;
1012: }
1013: x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1014: break;
1015: case 2 :
1016: sum1 = tmp[row];
1017: sum2 = tmp[row -1];
1018: v2 = aa + ai[row]-1;
1019: for (j=nz ; j>1; j-=2){
1020: vi -=2;
1021: i0 = vi[2];
1022: i1 = vi[1];
1023: tmp0 = tmps[i0];
1024: tmp1 = tmps[i1];
1025: v1 -= 2;
1026: v2 -= 2;
1027: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1028: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1029: }
1030: if (j==1){
1031: tmp0 = tmps[*vi--];
1032: sum1 -= *v1-- * tmp0;
1033: sum2 -= *v2-- * tmp0;
1034: }
1035:
1036: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1037: sum2 -= *v2-- * tmp0;
1038: x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1039: break;
1040: case 3 :
1041: sum1 = tmp[row];
1042: sum2 = tmp[row -1];
1043: sum3 = tmp[row -2];
1044: v2 = aa + ai[row]-1;
1045: v3 = aa + ai[row -1]-1;
1046: for (j=nz ; j>1; j-=2){
1047: vi -=2;
1048: i0 = vi[2];
1049: i1 = vi[1];
1050: tmp0 = tmps[i0];
1051: tmp1 = tmps[i1];
1052: v1 -= 2;
1053: v2 -= 2;
1054: v3 -= 2;
1055: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1056: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1057: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1058: }
1059: if (j==1){
1060: tmp0 = tmps[*vi--];
1061: sum1 -= *v1-- * tmp0;
1062: sum2 -= *v2-- * tmp0;
1063: sum3 -= *v3-- * tmp0;
1064: }
1065: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1066: sum2 -= *v2-- * tmp0;
1067: sum3 -= *v3-- * tmp0;
1068: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1069: sum3 -= *v3-- * tmp0;
1070: x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1071:
1072: break;
1073: case 4 :
1074: sum1 = tmp[row];
1075: sum2 = tmp[row -1];
1076: sum3 = tmp[row -2];
1077: sum4 = tmp[row -3];
1078: v2 = aa + ai[row]-1;
1079: v3 = aa + ai[row -1]-1;
1080: v4 = aa + ai[row -2]-1;
1082: for (j=nz ; j>1; j-=2){
1083: vi -=2;
1084: i0 = vi[2];
1085: i1 = vi[1];
1086: tmp0 = tmps[i0];
1087: tmp1 = tmps[i1];
1088: v1 -= 2;
1089: v2 -= 2;
1090: v3 -= 2;
1091: v4 -= 2;
1092: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1093: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1094: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1095: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1096: }
1097: if (j==1){
1098: tmp0 = tmps[*vi--];
1099: sum1 -= *v1-- * tmp0;
1100: sum2 -= *v2-- * tmp0;
1101: sum3 -= *v3-- * tmp0;
1102: sum4 -= *v4-- * tmp0;
1103: }
1105: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1106: sum2 -= *v2-- * tmp0;
1107: sum3 -= *v3-- * tmp0;
1108: sum4 -= *v4-- * tmp0;
1109: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1110: sum3 -= *v3-- * tmp0;
1111: sum4 -= *v4-- * tmp0;
1112: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1113: sum4 -= *v4-- * tmp0;
1114: x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1115: break;
1116: case 5 :
1117: sum1 = tmp[row];
1118: sum2 = tmp[row -1];
1119: sum3 = tmp[row -2];
1120: sum4 = tmp[row -3];
1121: sum5 = tmp[row -4];
1122: v2 = aa + ai[row]-1;
1123: v3 = aa + ai[row -1]-1;
1124: v4 = aa + ai[row -2]-1;
1125: v5 = aa + ai[row -3]-1;
1126: for (j=nz ; j>1; j-=2){
1127: vi -= 2;
1128: i0 = vi[2];
1129: i1 = vi[1];
1130: tmp0 = tmps[i0];
1131: tmp1 = tmps[i1];
1132: v1 -= 2;
1133: v2 -= 2;
1134: v3 -= 2;
1135: v4 -= 2;
1136: v5 -= 2;
1137: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1138: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1139: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1140: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1141: sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1142: }
1143: if (j==1){
1144: tmp0 = tmps[*vi--];
1145: sum1 -= *v1-- * tmp0;
1146: sum2 -= *v2-- * tmp0;
1147: sum3 -= *v3-- * tmp0;
1148: sum4 -= *v4-- * tmp0;
1149: sum5 -= *v5-- * tmp0;
1150: }
1152: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1153: sum2 -= *v2-- * tmp0;
1154: sum3 -= *v3-- * tmp0;
1155: sum4 -= *v4-- * tmp0;
1156: sum5 -= *v5-- * tmp0;
1157: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1158: sum3 -= *v3-- * tmp0;
1159: sum4 -= *v4-- * tmp0;
1160: sum5 -= *v5-- * tmp0;
1161: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1162: sum4 -= *v4-- * tmp0;
1163: sum5 -= *v5-- * tmp0;
1164: tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1165: sum5 -= *v5-- * tmp0;
1166: x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1167: break;
1168: default:
1169: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1170: }
1171: }
1172: ISRestoreIndices(isrow,&rout);
1173: ISRestoreIndices(iscol,&cout);
1174: VecRestoreArrayRead(bb,&b);
1175: VecRestoreArray(xx,&x);
1176: PetscLogFlops(2.0*a->nz - A->cmap->n);
1177: return(0);
1178: }
1182: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1183: {
1184: Mat C=B;
1185: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data;
1186: IS isrow = b->row,isicol = b->icol;
1187: PetscErrorCode ierr;
1188: const PetscInt *r,*ic,*ics;
1189: const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1190: PetscInt i,j,k,nz,nzL,row,*pj;
1191: const PetscInt *ajtmp,*bjtmp;
1192: MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1193: const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1194: FactorShiftCtx sctx;
1195: const PetscInt *ddiag;
1196: PetscReal rs;
1197: MatScalar d;
1198: PetscInt inod,nodesz,node_max,col;
1199: const PetscInt *ns;
1200: PetscInt *tmp_vec1,*tmp_vec2,*nsmap;
1203: /* MatPivotSetUp(): initialize shift context sctx */
1204: PetscMemzero(&sctx,sizeof(FactorShiftCtx));
1206: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1207: ddiag = a->diag;
1208: sctx.shift_top = info->zeropivot;
1209: for (i=0; i<n; i++) {
1210: /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1211: d = (aa)[ddiag[i]];
1212: rs = -PetscAbsScalar(d) - PetscRealPart(d);
1213: v = aa+ai[i];
1214: nz = ai[i+1] - ai[i];
1215: for (j=0; j<nz; j++)
1216: rs += PetscAbsScalar(v[j]);
1217: if (rs>sctx.shift_top) sctx.shift_top = rs;
1218: }
1219: sctx.shift_top *= 1.1;
1220: sctx.nshift_max = 5;
1221: sctx.shift_lo = 0.;
1222: sctx.shift_hi = 1.;
1223: }
1225: ISGetIndices(isrow,&r);
1226: ISGetIndices(isicol,&ic);
1227:
1228: PetscMalloc((4*n+1)*sizeof(PetscScalar),&rtmp1);
1229: PetscMemzero(rtmp1,(4*n+1)*sizeof(PetscScalar));
1230: rtmp2 = rtmp1 + n;
1231: rtmp3 = rtmp2 + n;
1232: rtmp4 = rtmp3 + n;
1233: ics = ic;
1235: node_max = a->inode.node_count;
1236: ns = a->inode.size;
1237: if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1239: /* If max inode size > 4, split it into two inodes.*/
1240: /* also map the inode sizes according to the ordering */
1241: PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);
1242: for (i=0,j=0; i<node_max; ++i,++j){
1243: if (ns[i] > 4) {
1244: tmp_vec1[j] = 4;
1245: ++j;
1246: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1247: } else {
1248: tmp_vec1[j] = ns[i];
1249: }
1250: }
1251: /* Use the correct node_max */
1252: node_max = j;
1253:
1254: /* Now reorder the inode info based on mat re-ordering info */
1255: /* First create a row -> inode_size_array_index map */
1256: PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);
1257: PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);
1258: for (i=0,row=0; i<node_max; i++) {
1259: nodesz = tmp_vec1[i];
1260: for (j=0; j<nodesz; j++,row++) {
1261: nsmap[row] = i;
1262: }
1263: }
1264: /* Using nsmap, create a reordered ns structure */
1265: for (i=0,j=0; i< node_max; i++) {
1266: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1267: tmp_vec2[i] = nodesz;
1268: j += nodesz;
1269: }
1270: PetscFree(nsmap);
1271: PetscFree(tmp_vec1);
1273: /* Now use the correct ns */
1274: ns = tmp_vec2;
1276: do {
1277: sctx.newshift = PETSC_FALSE;
1278: /* Now loop over each block-row, and do the factorization */
1279: for (inod=0,i=0; inod<node_max; inod++){ /* i: row index; inod: inode index */
1280: nodesz = ns[inod];
1281:
1282: switch (nodesz){
1283: case 1:
1284: /*----------*/
1285: /* zero rtmp1 */
1286: /* L part */
1287: nz = bi[i+1] - bi[i];
1288: bjtmp = bj + bi[i];
1289: for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1291: /* U part */
1292: nz = bdiag[i]-bdiag[i+1];
1293: bjtmp = bj + bdiag[i+1]+1;
1294: for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1295:
1296: /* load in initial (unfactored row) */
1297: nz = ai[r[i]+1] - ai[r[i]];
1298: ajtmp = aj + ai[r[i]];
1299: v = aa + ai[r[i]];
1300: for (j=0; j<nz; j++) {
1301: rtmp1[ics[ajtmp[j]]] = v[j];
1302: }
1303: /* ZeropivotApply() */
1304: rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1305:
1306: /* elimination */
1307: bjtmp = bj + bi[i];
1308: row = *bjtmp++;
1309: nzL = bi[i+1] - bi[i];
1310: for(k=0; k < nzL;k++) {
1311: pc = rtmp1 + row;
1312: if (*pc != 0.0) {
1313: pv = b->a + bdiag[row];
1314: mul1 = *pc * (*pv);
1315: *pc = mul1;
1316: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1317: pv = b->a + bdiag[row+1]+1;
1318: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1319: for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1320: PetscLogFlops(1+2*nz);
1321: }
1322: row = *bjtmp++;
1323: }
1325: /* finished row so stick it into b->a */
1326: rs = 0.0;
1327: /* L part */
1328: pv = b->a + bi[i] ;
1329: pj = b->j + bi[i] ;
1330: nz = bi[i+1] - bi[i];
1331: for (j=0; j<nz; j++) {
1332: pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1333: }
1335: /* U part */
1336: pv = b->a + bdiag[i+1]+1;
1337: pj = b->j + bdiag[i+1]+1;
1338: nz = bdiag[i] - bdiag[i+1]-1;
1339: for (j=0; j<nz; j++) {
1340: pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1341: }
1343: /* Check zero pivot */
1344: sctx.rs = rs;
1345: sctx.pv = rtmp1[i];
1346: MatPivotCheck(A,info,&sctx,i);
1347: if(sctx.newshift) break;
1348:
1349: /* Mark diagonal and invert diagonal for simplier triangular solves */
1350: pv = b->a + bdiag[i];
1351: *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1352: break;
1353:
1354: case 2:
1355: /*----------*/
1356: /* zero rtmp1 and rtmp2 */
1357: /* L part */
1358: nz = bi[i+1] - bi[i];
1359: bjtmp = bj + bi[i];
1360: for (j=0; j<nz; j++) {
1361: col = bjtmp[j];
1362: rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1363: }
1365: /* U part */
1366: nz = bdiag[i]-bdiag[i+1];
1367: bjtmp = bj + bdiag[i+1]+1;
1368: for (j=0; j<nz; j++) {
1369: col = bjtmp[j];
1370: rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1371: }
1372:
1373: /* load in initial (unfactored row) */
1374: nz = ai[r[i]+1] - ai[r[i]];
1375: ajtmp = aj + ai[r[i]];
1376: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1377: for (j=0; j<nz; j++) {
1378: col = ics[ajtmp[j]];
1379: rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1380: }
1381: /* ZeropivotApply(): shift the diagonal of the matrix */
1382: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1383:
1384: /* elimination */
1385: bjtmp = bj + bi[i];
1386: row = *bjtmp++; /* pivot row */
1387: nzL = bi[i+1] - bi[i];
1388: for(k=0; k < nzL;k++) {
1389: pc1 = rtmp1 + row;
1390: pc2 = rtmp2 + row;
1391: if (*pc1 != 0.0 || *pc2 != 0.0) {
1392: pv = b->a + bdiag[row];
1393: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1394: *pc1 = mul1; *pc2 = mul2;
1396: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1397: pv = b->a + bdiag[row+1]+1;
1398: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1399: for (j=0; j<nz; j++){
1400: col = pj[j];
1401: rtmp1[col] -= mul1 * pv[j];
1402: rtmp2[col] -= mul2 * pv[j];
1403: }
1404: PetscLogFlops(2+4*nz);
1405: }
1406: row = *bjtmp++;
1407: }
1409: /* finished row i; check zero pivot, then stick row i into b->a */
1410: rs = 0.0;
1411: /* L part */
1412: pc1 = b->a + bi[i];
1413: pj = b->j + bi[i] ;
1414: nz = bi[i+1] - bi[i];
1415: for (j=0; j<nz; j++) {
1416: col = pj[j];
1417: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1418: }
1419: /* U part */
1420: pc1 = b->a + bdiag[i+1]+1;
1421: pj = b->j + bdiag[i+1]+1;
1422: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1423: for (j=0; j<nz; j++) {
1424: col = pj[j];
1425: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1426: }
1427:
1428: sctx.rs = rs;
1429: sctx.pv = rtmp1[i];
1430: MatPivotCheck(A,info,&sctx,i);
1431: if(sctx.newshift) break;
1432: pc1 = b->a + bdiag[i]; /* Mark diagonal */
1433: *pc1 = 1.0/sctx.pv;
1435: /* Now take care of diagonal 2x2 block. */
1436: pc2 = rtmp2 + i;
1437: if (*pc2 != 0.0){
1438: mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1439: *pc2 = mul1; /* insert L entry */
1440: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1441: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1442: for (j=0; j<nz; j++) {
1443: col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1444: }
1445: PetscLogFlops(1+2*nz);
1446: }
1448: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1449: rs = 0.0;
1450: /* L part */
1451: pc2 = b->a + bi[i+1];
1452: pj = b->j + bi[i+1] ;
1453: nz = bi[i+2] - bi[i+1];
1454: for (j=0; j<nz; j++) {
1455: col = pj[j];
1456: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1457: }
1458: /* U part */
1459: pc2 = b->a + bdiag[i+2]+1;
1460: pj = b->j + bdiag[i+2]+1;
1461: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1462: for (j=0; j<nz; j++) {
1463: col = pj[j];
1464: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1465: }
1467: sctx.rs = rs;
1468: sctx.pv = rtmp2[i+1];
1469: MatPivotCheck(A,info,&sctx,i+1);
1470: if(sctx.newshift) break;
1471: pc2 = b->a + bdiag[i+1];
1472: *pc2 = 1.0/sctx.pv;
1473: break;
1475: case 3:
1476: /*----------*/
1477: /* zero rtmp */
1478: /* L part */
1479: nz = bi[i+1] - bi[i];
1480: bjtmp = bj + bi[i];
1481: for (j=0; j<nz; j++) {
1482: col = bjtmp[j];
1483: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1484: }
1486: /* U part */
1487: nz = bdiag[i]-bdiag[i+1];
1488: bjtmp = bj + bdiag[i+1]+1;
1489: for (j=0; j<nz; j++) {
1490: col = bjtmp[j];
1491: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1492: }
1493:
1494: /* load in initial (unfactored row) */
1495: nz = ai[r[i]+1] - ai[r[i]];
1496: ajtmp = aj + ai[r[i]];
1497: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1498: for (j=0; j<nz; j++) {
1499: col = ics[ajtmp[j]];
1500: rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1501: }
1502: /* ZeropivotApply(): shift the diagonal of the matrix */
1503: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1504:
1505: /* elimination */
1506: bjtmp = bj + bi[i];
1507: row = *bjtmp++; /* pivot row */
1508: nzL = bi[i+1] - bi[i];
1509: for(k=0; k < nzL;k++) {
1510: pc1 = rtmp1 + row;
1511: pc2 = rtmp2 + row;
1512: pc3 = rtmp3 + row;
1513: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1514: pv = b->a + bdiag[row];
1515: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1516: *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1518: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1519: pv = b->a + bdiag[row+1]+1;
1520: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1521: for (j=0; j<nz; j++){
1522: col = pj[j];
1523: rtmp1[col] -= mul1 * pv[j];
1524: rtmp2[col] -= mul2 * pv[j];
1525: rtmp3[col] -= mul3 * pv[j];
1526: }
1527: PetscLogFlops(3+6*nz);
1528: }
1529: row = *bjtmp++;
1530: }
1532: /* finished row i; check zero pivot, then stick row i into b->a */
1533: rs = 0.0;
1534: /* L part */
1535: pc1 = b->a + bi[i];
1536: pj = b->j + bi[i] ;
1537: nz = bi[i+1] - bi[i];
1538: for (j=0; j<nz; j++) {
1539: col = pj[j];
1540: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1541: }
1542: /* U part */
1543: pc1 = b->a + bdiag[i+1]+1;
1544: pj = b->j + bdiag[i+1]+1;
1545: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1546: for (j=0; j<nz; j++) {
1547: col = pj[j];
1548: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1549: }
1550:
1551: sctx.rs = rs;
1552: sctx.pv = rtmp1[i];
1553: MatPivotCheck(A,info,&sctx,i);
1554: if(sctx.newshift) break;
1555: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1556: *pc1 = 1.0/sctx.pv;
1558: /* Now take care of 1st column of diagonal 3x3 block. */
1559: pc2 = rtmp2 + i;
1560: pc3 = rtmp3 + i;
1561: if (*pc2 != 0.0 || *pc3 != 0.0){
1562: mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1563: mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1564: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1565: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1566: for (j=0; j<nz; j++) {
1567: col = pj[j];
1568: rtmp2[col] -= mul2 * rtmp1[col];
1569: rtmp3[col] -= mul3 * rtmp1[col];
1570: }
1571: PetscLogFlops(2+4*nz);
1572: }
1573:
1574: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1575: rs = 0.0;
1576: /* L part */
1577: pc2 = b->a + bi[i+1];
1578: pj = b->j + bi[i+1] ;
1579: nz = bi[i+2] - bi[i+1];
1580: for (j=0; j<nz; j++) {
1581: col = pj[j];
1582: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1583: }
1584: /* U part */
1585: pc2 = b->a + bdiag[i+2]+1;
1586: pj = b->j + bdiag[i+2]+1;
1587: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1588: for (j=0; j<nz; j++) {
1589: col = pj[j];
1590: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1591: }
1593: sctx.rs = rs;
1594: sctx.pv = rtmp2[i+1];
1595: MatPivotCheck(A,info,&sctx,i+1);
1596: if(sctx.newshift) break;
1597: pc2 = b->a + bdiag[i+1];
1598: *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1600: /* Now take care of 2nd column of diagonal 3x3 block. */
1601: pc3 = rtmp3 + i+1;
1602: if (*pc3 != 0.0){
1603: mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1604: pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */
1605: nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1606: for (j=0; j<nz; j++) {
1607: col = pj[j];
1608: rtmp3[col] -= mul3 * rtmp2[col];
1609: }
1610: PetscLogFlops(1+2*nz);
1611: }
1613: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1614: rs = 0.0;
1615: /* L part */
1616: pc3 = b->a + bi[i+2];
1617: pj = b->j + bi[i+2] ;
1618: nz = bi[i+3] - bi[i+2];
1619: for (j=0; j<nz; j++) {
1620: col = pj[j];
1621: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1622: }
1623: /* U part */
1624: pc3 = b->a + bdiag[i+3]+1;
1625: pj = b->j + bdiag[i+3]+1;
1626: nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1627: for (j=0; j<nz; j++) {
1628: col = pj[j];
1629: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1630: }
1632: sctx.rs = rs;
1633: sctx.pv = rtmp3[i+2];
1634: MatPivotCheck(A,info,&sctx,i+2);
1635: if(sctx.newshift) break;
1636: pc3 = b->a + bdiag[i+2];
1637: *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1638: break;
1639: case 4:
1640: /*----------*/
1641: /* zero rtmp */
1642: /* L part */
1643: nz = bi[i+1] - bi[i];
1644: bjtmp = bj + bi[i];
1645: for (j=0; j<nz; j++) {
1646: col = bjtmp[j];
1647: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1648: }
1650: /* U part */
1651: nz = bdiag[i]-bdiag[i+1];
1652: bjtmp = bj + bdiag[i+1]+1;
1653: for (j=0; j<nz; j++) {
1654: col = bjtmp[j];
1655: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1656: }
1657:
1658: /* load in initial (unfactored row) */
1659: nz = ai[r[i]+1] - ai[r[i]];
1660: ajtmp = aj + ai[r[i]];
1661: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1662: for (j=0; j<nz; j++) {
1663: col = ics[ajtmp[j]];
1664: rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1665: }
1666: /* ZeropivotApply(): shift the diagonal of the matrix */
1667: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1668:
1669: /* elimination */
1670: bjtmp = bj + bi[i];
1671: row = *bjtmp++; /* pivot row */
1672: nzL = bi[i+1] - bi[i];
1673: for(k=0; k < nzL;k++) {
1674: pc1 = rtmp1 + row;
1675: pc2 = rtmp2 + row;
1676: pc3 = rtmp3 + row;
1677: pc4 = rtmp4 + row;
1678: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1679: pv = b->a + bdiag[row];
1680: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1681: *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1683: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1684: pv = b->a + bdiag[row+1]+1;
1685: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1686: for (j=0; j<nz; j++){
1687: col = pj[j];
1688: rtmp1[col] -= mul1 * pv[j];
1689: rtmp2[col] -= mul2 * pv[j];
1690: rtmp3[col] -= mul3 * pv[j];
1691: rtmp4[col] -= mul4 * pv[j];
1692: }
1693: PetscLogFlops(4+8*nz);
1694: }
1695: row = *bjtmp++;
1696: }
1698: /* finished row i; check zero pivot, then stick row i into b->a */
1699: rs = 0.0;
1700: /* L part */
1701: pc1 = b->a + bi[i];
1702: pj = b->j + bi[i] ;
1703: nz = bi[i+1] - bi[i];
1704: for (j=0; j<nz; j++) {
1705: col = pj[j];
1706: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1707: }
1708: /* U part */
1709: pc1 = b->a + bdiag[i+1]+1;
1710: pj = b->j + bdiag[i+1]+1;
1711: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1712: for (j=0; j<nz; j++) {
1713: col = pj[j];
1714: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1715: }
1716:
1717: sctx.rs = rs;
1718: sctx.pv = rtmp1[i];
1719: MatPivotCheck(A,info,&sctx,i);
1720: if(sctx.newshift) break;
1721: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1722: *pc1 = 1.0/sctx.pv;
1724: /* Now take care of 1st column of diagonal 4x4 block. */
1725: pc2 = rtmp2 + i;
1726: pc3 = rtmp3 + i;
1727: pc4 = rtmp4 + i;
1728: if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0){
1729: mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1730: mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1731: mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1732: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1733: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1734: for (j=0; j<nz; j++) {
1735: col = pj[j];
1736: rtmp2[col] -= mul2 * rtmp1[col];
1737: rtmp3[col] -= mul3 * rtmp1[col];
1738: rtmp4[col] -= mul4 * rtmp1[col];
1739: }
1740: PetscLogFlops(3+6*nz);
1741: }
1742:
1743: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1744: rs = 0.0;
1745: /* L part */
1746: pc2 = b->a + bi[i+1];
1747: pj = b->j + bi[i+1] ;
1748: nz = bi[i+2] - bi[i+1];
1749: for (j=0; j<nz; j++) {
1750: col = pj[j];
1751: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1752: }
1753: /* U part */
1754: pc2 = b->a + bdiag[i+2]+1;
1755: pj = b->j + bdiag[i+2]+1;
1756: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1757: for (j=0; j<nz; j++) {
1758: col = pj[j];
1759: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1760: }
1762: sctx.rs = rs;
1763: sctx.pv = rtmp2[i+1];
1764: MatPivotCheck(A,info,&sctx,i+1);
1765: if(sctx.newshift) break;
1766: pc2 = b->a + bdiag[i+1];
1767: *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1769: /* Now take care of 2nd column of diagonal 4x4 block. */
1770: pc3 = rtmp3 + i+1;
1771: pc4 = rtmp4 + i+1;
1772: if (*pc3 != 0.0 || *pc4 != 0.0){
1773: mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1774: mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1775: pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */
1776: nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1777: for (j=0; j<nz; j++) {
1778: col = pj[j];
1779: rtmp3[col] -= mul3 * rtmp2[col];
1780: rtmp4[col] -= mul4 * rtmp2[col];
1781: }
1782: PetscLogFlops(4*nz);
1783: }
1785: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1786: rs = 0.0;
1787: /* L part */
1788: pc3 = b->a + bi[i+2];
1789: pj = b->j + bi[i+2] ;
1790: nz = bi[i+3] - bi[i+2];
1791: for (j=0; j<nz; j++) {
1792: col = pj[j];
1793: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1794: }
1795: /* U part */
1796: pc3 = b->a + bdiag[i+3]+1;
1797: pj = b->j + bdiag[i+3]+1;
1798: nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1799: for (j=0; j<nz; j++) {
1800: col = pj[j];
1801: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1802: }
1804: sctx.rs = rs;
1805: sctx.pv = rtmp3[i+2];
1806: MatPivotCheck(A,info,&sctx,i+2);
1807: if(sctx.newshift) break;
1808: pc3 = b->a + bdiag[i+2];
1809: *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1810:
1811: /* Now take care of 3rd column of diagonal 4x4 block. */
1812: pc4 = rtmp4 + i+2;
1813: if (*pc4 != 0.0){
1814: mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1815: pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */
1816: nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1817: for (j=0; j<nz; j++) {
1818: col = pj[j];
1819: rtmp4[col] -= mul4 * rtmp3[col];
1820: }
1821: PetscLogFlops(1+2*nz);
1822: }
1824: /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1825: rs = 0.0;
1826: /* L part */
1827: pc4 = b->a + bi[i+3];
1828: pj = b->j + bi[i+3] ;
1829: nz = bi[i+4] - bi[i+3];
1830: for (j=0; j<nz; j++) {
1831: col = pj[j];
1832: pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1833: }
1834: /* U part */
1835: pc4 = b->a + bdiag[i+4]+1;
1836: pj = b->j + bdiag[i+4]+1;
1837: nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1838: for (j=0; j<nz; j++) {
1839: col = pj[j];
1840: pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1841: }
1843: sctx.rs = rs;
1844: sctx.pv = rtmp4[i+3];
1845: MatPivotCheck(A,info,&sctx,i+3);
1846: if(sctx.newshift) break;
1847: pc4 = b->a + bdiag[i+3];
1848: *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1849: break;
1851: default:
1852: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1853: }
1854: if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1855: i += nodesz; /* Update the row */
1856: }
1858: /* MatPivotRefine() */
1859: if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){
1860: /*
1861: * if no shift in this attempt & shifting & started shifting & can refine,
1862: * then try lower shift
1863: */
1864: sctx.shift_hi = sctx.shift_fraction;
1865: sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1866: sctx.shift_amount = sctx.shift_fraction * sctx.shift_top;
1867: sctx.newshift = PETSC_TRUE;
1868: sctx.nshift++;
1869: }
1870: } while (sctx.newshift);
1872: PetscFree(rtmp1);
1873: PetscFree(tmp_vec2);
1874: ISRestoreIndices(isicol,&ic);
1875: ISRestoreIndices(isrow,&r);
1877: C->ops->solve = MatSolve_SeqAIJ;
1878: C->ops->solveadd = MatSolveAdd_SeqAIJ;
1879: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ;
1880: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1881: C->ops->matsolve = MatMatSolve_SeqAIJ;
1882: C->assembled = PETSC_TRUE;
1883: C->preallocated = PETSC_TRUE;
1884: PetscLogFlops(C->cmap->n);
1886: /* MatShiftView(A,info,&sctx) */
1887: if (sctx.nshift){
1888: if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1889: PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);
1890: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1891: PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);
1892: } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS){
1893: PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);
1894: }
1895: }
1896: Mat_CheckInode_FactorLU(C,PETSC_FALSE);
1897: return(0);
1898: }
1902: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1903: {
1904: Mat C = B;
1905: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1906: IS iscol = b->col,isrow = b->row,isicol = b->icol;
1907: PetscErrorCode ierr;
1908: const PetscInt *r,*ic,*c,*ics;
1909: PetscInt n = A->rmap->n,*bi = b->i;
1910: PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1911: PetscInt i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1912: PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1913: PetscScalar mul1,mul2,mul3,tmp;
1914: MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1915: const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1916: PetscReal rs=0.0;
1917: FactorShiftCtx sctx;
1920: sctx.shift_top = 0;
1921: sctx.nshift_max = 0;
1922: sctx.shift_lo = 0;
1923: sctx.shift_hi = 0;
1924: sctx.shift_fraction = 0;
1926: /* if both shift schemes are chosen by user, only use info->shiftpd */
1927: if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1928: sctx.shift_top = 0;
1929: for (i=0; i<n; i++) {
1930: /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1931: rs = 0.0;
1932: ajtmp = aj + ai[i];
1933: rtmp1 = aa + ai[i];
1934: nz = ai[i+1] - ai[i];
1935: for (j=0; j<nz; j++){
1936: if (*ajtmp != i){
1937: rs += PetscAbsScalar(*rtmp1++);
1938: } else {
1939: rs -= PetscRealPart(*rtmp1++);
1940: }
1941: ajtmp++;
1942: }
1943: if (rs>sctx.shift_top) sctx.shift_top = rs;
1944: }
1945: if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1946: sctx.shift_top *= 1.1;
1947: sctx.nshift_max = 5;
1948: sctx.shift_lo = 0.;
1949: sctx.shift_hi = 1.;
1950: }
1951: sctx.shift_amount = 0;
1952: sctx.nshift = 0;
1954: ISGetIndices(isrow,&r);
1955: ISGetIndices(iscol,&c);
1956: ISGetIndices(isicol,&ic);
1957: PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);
1958: PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));
1959: ics = ic ;
1960: rtmp22 = rtmp11 + n;
1961: rtmp33 = rtmp22 + n;
1962:
1963: node_max = a->inode.node_count;
1964: ns = a->inode.size;
1965: if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1967: /* If max inode size > 3, split it into two inodes.*/
1968: /* also map the inode sizes according to the ordering */
1969: PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);
1970: for (i=0,j=0; i<node_max; ++i,++j){
1971: if (ns[i]>3) {
1972: tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */
1973: ++j;
1974: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1975: } else {
1976: tmp_vec1[j] = ns[i];
1977: }
1978: }
1979: /* Use the correct node_max */
1980: node_max = j;
1982: /* Now reorder the inode info based on mat re-ordering info */
1983: /* First create a row -> inode_size_array_index map */
1984: PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);
1985: PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);
1986: for (i=0,row=0; i<node_max; i++) {
1987: nodesz = tmp_vec1[i];
1988: for (j=0; j<nodesz; j++,row++) {
1989: nsmap[row] = i;
1990: }
1991: }
1992: /* Using nsmap, create a reordered ns structure */
1993: for (i=0,j=0; i< node_max; i++) {
1994: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1995: tmp_vec2[i] = nodesz;
1996: j += nodesz;
1997: }
1998: PetscFree(nsmap);
1999: PetscFree(tmp_vec1);
2000: /* Now use the correct ns */
2001: ns = tmp_vec2;
2003: do {
2004: sctx.newshift = PETSC_FALSE;
2005: /* Now loop over each block-row, and do the factorization */
2006: for (i=0,row=0; i<node_max; i++) {
2007: nodesz = ns[i];
2008: nz = bi[row+1] - bi[row];
2009: bjtmp = bj + bi[row];
2011: switch (nodesz){
2012: case 1:
2013: for (j=0; j<nz; j++){
2014: idx = bjtmp[j];
2015: rtmp11[idx] = 0.0;
2016: }
2017:
2018: /* load in initial (unfactored row) */
2019: idx = r[row];
2020: nz_tmp = ai[idx+1] - ai[idx];
2021: ajtmp = aj + ai[idx];
2022: v1 = aa + ai[idx];
2024: for (j=0; j<nz_tmp; j++) {
2025: idx = ics[ajtmp[j]];
2026: rtmp11[idx] = v1[j];
2027: }
2028: rtmp11[ics[r[row]]] += sctx.shift_amount;
2030: prow = *bjtmp++ ;
2031: while (prow < row) {
2032: pc1 = rtmp11 + prow;
2033: if (*pc1 != 0.0){
2034: pv = ba + bd[prow];
2035: pj = nbj + bd[prow];
2036: mul1 = *pc1 * *pv++;
2037: *pc1 = mul1;
2038: nz_tmp = bi[prow+1] - bd[prow] - 1;
2039: PetscLogFlops(1+2*nz_tmp);
2040: for (j=0; j<nz_tmp; j++) {
2041: tmp = pv[j];
2042: idx = pj[j];
2043: rtmp11[idx] -= mul1 * tmp;
2044: }
2045: }
2046: prow = *bjtmp++ ;
2047: }
2048: pj = bj + bi[row];
2049: pc1 = ba + bi[row];
2051: sctx.pv = rtmp11[row];
2052: rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2053: rs = 0.0;
2054: for (j=0; j<nz; j++) {
2055: idx = pj[j];
2056: pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2057: if (idx != row) rs += PetscAbsScalar(pc1[j]);
2058: }
2059: sctx.rs = rs;
2060: MatPivotCheck(A,info,&sctx,row);
2061: if (sctx.newshift) goto endofwhile;
2062: break;
2063:
2064: case 2:
2065: for (j=0; j<nz; j++) {
2066: idx = bjtmp[j];
2067: rtmp11[idx] = 0.0;
2068: rtmp22[idx] = 0.0;
2069: }
2070:
2071: /* load in initial (unfactored row) */
2072: idx = r[row];
2073: nz_tmp = ai[idx+1] - ai[idx];
2074: ajtmp = aj + ai[idx];
2075: v1 = aa + ai[idx];
2076: v2 = aa + ai[idx+1];
2077: for (j=0; j<nz_tmp; j++) {
2078: idx = ics[ajtmp[j]];
2079: rtmp11[idx] = v1[j];
2080: rtmp22[idx] = v2[j];
2081: }
2082: rtmp11[ics[r[row]]] += sctx.shift_amount;
2083: rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2085: prow = *bjtmp++ ;
2086: while (prow < row) {
2087: pc1 = rtmp11 + prow;
2088: pc2 = rtmp22 + prow;
2089: if (*pc1 != 0.0 || *pc2 != 0.0){
2090: pv = ba + bd[prow];
2091: pj = nbj + bd[prow];
2092: mul1 = *pc1 * *pv;
2093: mul2 = *pc2 * *pv;
2094: ++pv;
2095: *pc1 = mul1;
2096: *pc2 = mul2;
2097:
2098: nz_tmp = bi[prow+1] - bd[prow] - 1;
2099: for (j=0; j<nz_tmp; j++) {
2100: tmp = pv[j];
2101: idx = pj[j];
2102: rtmp11[idx] -= mul1 * tmp;
2103: rtmp22[idx] -= mul2 * tmp;
2104: }
2105: PetscLogFlops(2+4*nz_tmp);
2106: }
2107: prow = *bjtmp++ ;
2108: }
2110: /* Now take care of diagonal 2x2 block. Note: prow = row here */
2111: pc1 = rtmp11 + prow;
2112: pc2 = rtmp22 + prow;
2114: sctx.pv = *pc1;
2115: pj = bj + bi[prow];
2116: rs = 0.0;
2117: for (j=0; j<nz; j++){
2118: idx = pj[j];
2119: if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2120: }
2121: sctx.rs = rs;
2122: MatPivotCheck(A,info,&sctx,row);
2123: if (sctx.newshift) goto endofwhile;
2125: if (*pc2 != 0.0){
2126: pj = nbj + bd[prow];
2127: mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2128: *pc2 = mul2;
2129: nz_tmp = bi[prow+1] - bd[prow] - 1;
2130: for (j=0; j<nz_tmp; j++) {
2131: idx = pj[j] ;
2132: tmp = rtmp11[idx];
2133: rtmp22[idx] -= mul2 * tmp;
2134: }
2135: PetscLogFlops(1+2*nz_tmp);
2136: }
2137:
2138: pj = bj + bi[row];
2139: pc1 = ba + bi[row];
2140: pc2 = ba + bi[row+1];
2142: sctx.pv = rtmp22[row+1];
2143: rs = 0.0;
2144: rtmp11[row] = 1.0/rtmp11[row];
2145: rtmp22[row+1] = 1.0/rtmp22[row+1];
2146: /* copy row entries from dense representation to sparse */
2147: for (j=0; j<nz; j++) {
2148: idx = pj[j];
2149: pc1[j] = rtmp11[idx];
2150: pc2[j] = rtmp22[idx];
2151: if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2152: }
2153: sctx.rs = rs;
2154: MatPivotCheck(A,info,&sctx,row+1);
2155: if (sctx.newshift) goto endofwhile;
2156: break;
2158: case 3:
2159: for (j=0; j<nz; j++) {
2160: idx = bjtmp[j];
2161: rtmp11[idx] = 0.0;
2162: rtmp22[idx] = 0.0;
2163: rtmp33[idx] = 0.0;
2164: }
2165: /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2166: idx = r[row];
2167: nz_tmp = ai[idx+1] - ai[idx];
2168: ajtmp = aj + ai[idx];
2169: v1 = aa + ai[idx];
2170: v2 = aa + ai[idx+1];
2171: v3 = aa + ai[idx+2];
2172: for (j=0; j<nz_tmp; j++) {
2173: idx = ics[ajtmp[j]];
2174: rtmp11[idx] = v1[j];
2175: rtmp22[idx] = v2[j];
2176: rtmp33[idx] = v3[j];
2177: }
2178: rtmp11[ics[r[row]]] += sctx.shift_amount;
2179: rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2180: rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2182: /* loop over all pivot row blocks above this row block */
2183: prow = *bjtmp++ ;
2184: while (prow < row) {
2185: pc1 = rtmp11 + prow;
2186: pc2 = rtmp22 + prow;
2187: pc3 = rtmp33 + prow;
2188: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
2189: pv = ba + bd[prow];
2190: pj = nbj + bd[prow];
2191: mul1 = *pc1 * *pv;
2192: mul2 = *pc2 * *pv;
2193: mul3 = *pc3 * *pv;
2194: ++pv;
2195: *pc1 = mul1;
2196: *pc2 = mul2;
2197: *pc3 = mul3;
2198:
2199: nz_tmp = bi[prow+1] - bd[prow] - 1;
2200: /* update this row based on pivot row */
2201: for (j=0; j<nz_tmp; j++) {
2202: tmp = pv[j];
2203: idx = pj[j];
2204: rtmp11[idx] -= mul1 * tmp;
2205: rtmp22[idx] -= mul2 * tmp;
2206: rtmp33[idx] -= mul3 * tmp;
2207: }
2208: PetscLogFlops(3+6*nz_tmp);
2209: }
2210: prow = *bjtmp++ ;
2211: }
2213: /* Now take care of diagonal 3x3 block in this set of rows */
2214: /* note: prow = row here */
2215: pc1 = rtmp11 + prow;
2216: pc2 = rtmp22 + prow;
2217: pc3 = rtmp33 + prow;
2219: sctx.pv = *pc1;
2220: pj = bj + bi[prow];
2221: rs = 0.0;
2222: for (j=0; j<nz; j++){
2223: idx = pj[j];
2224: if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2225: }
2226: sctx.rs = rs;
2227: MatPivotCheck(A,info,&sctx,row);
2228: if (sctx.newshift) goto endofwhile;
2230: if (*pc2 != 0.0 || *pc3 != 0.0){
2231: mul2 = (*pc2)/(*pc1);
2232: mul3 = (*pc3)/(*pc1);
2233: *pc2 = mul2;
2234: *pc3 = mul3;
2235: nz_tmp = bi[prow+1] - bd[prow] - 1;
2236: pj = nbj + bd[prow];
2237: for (j=0; j<nz_tmp; j++) {
2238: idx = pj[j] ;
2239: tmp = rtmp11[idx];
2240: rtmp22[idx] -= mul2 * tmp;
2241: rtmp33[idx] -= mul3 * tmp;
2242: }
2243: PetscLogFlops(2+4*nz_tmp);
2244: }
2245: ++prow;
2247: pc2 = rtmp22 + prow;
2248: pc3 = rtmp33 + prow;
2249: sctx.pv = *pc2;
2250: pj = bj + bi[prow];
2251: rs = 0.0;
2252: for (j=0; j<nz; j++){
2253: idx = pj[j];
2254: if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2255: }
2256: sctx.rs = rs;
2257: MatPivotCheck(A,info,&sctx,row+1);
2258: if (sctx.newshift) goto endofwhile;
2260: if (*pc3 != 0.0){
2261: mul3 = (*pc3)/(*pc2);
2262: *pc3 = mul3;
2263: pj = nbj + bd[prow];
2264: nz_tmp = bi[prow+1] - bd[prow] - 1;
2265: for (j=0; j<nz_tmp; j++) {
2266: idx = pj[j] ;
2267: tmp = rtmp22[idx];
2268: rtmp33[idx] -= mul3 * tmp;
2269: }
2270: PetscLogFlops(1+2*nz_tmp);
2271: }
2273: pj = bj + bi[row];
2274: pc1 = ba + bi[row];
2275: pc2 = ba + bi[row+1];
2276: pc3 = ba + bi[row+2];
2278: sctx.pv = rtmp33[row+2];
2279: rs = 0.0;
2280: rtmp11[row] = 1.0/rtmp11[row];
2281: rtmp22[row+1] = 1.0/rtmp22[row+1];
2282: rtmp33[row+2] = 1.0/rtmp33[row+2];
2283: /* copy row entries from dense representation to sparse */
2284: for (j=0; j<nz; j++) {
2285: idx = pj[j];
2286: pc1[j] = rtmp11[idx];
2287: pc2[j] = rtmp22[idx];
2288: pc3[j] = rtmp33[idx];
2289: if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2290: }
2292: sctx.rs = rs;
2293: MatPivotCheck(A,info,&sctx,row+2);
2294: if (sctx.newshift) goto endofwhile;
2295: break;
2297: default:
2298: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2299: }
2300: row += nodesz; /* Update the row */
2301: }
2302: endofwhile:;
2303: } while (sctx.newshift);
2304: PetscFree(rtmp11);
2305: PetscFree(tmp_vec2);
2306: ISRestoreIndices(isicol,&ic);
2307: ISRestoreIndices(isrow,&r);
2308: ISRestoreIndices(iscol,&c);
2309: (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2310: /* do not set solve add, since MatSolve_Inode + Add is faster */
2311: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace;
2312: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2313: C->assembled = PETSC_TRUE;
2314: C->preallocated = PETSC_TRUE;
2315: if (sctx.nshift) {
2316: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2317: PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);
2318: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2319: PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);
2320: }
2321: }
2322: PetscLogFlops(C->cmap->n);
2323: Mat_CheckInode(C,PETSC_FALSE);
2324: return(0);
2325: }
2328: /* ----------------------------------------------------------- */
2331: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2332: {
2333: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2334: IS iscol = a->col,isrow = a->row;
2335: PetscErrorCode ierr;
2336: const PetscInt *r,*c,*rout,*cout;
2337: PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
2338: PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
2339: PetscScalar *x,*tmp,*tmps,tmp0,tmp1;
2340: PetscScalar sum1,sum2,sum3,sum4,sum5;
2341: const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2342: const PetscScalar *b;
2345: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2346: node_max = a->inode.node_count;
2347: ns = a->inode.size; /* Node Size array */
2349: VecGetArrayRead(bb,&b);
2350: VecGetArray(xx,&x);
2351: tmp = a->solve_work;
2352:
2353: ISGetIndices(isrow,&rout); r = rout;
2354: ISGetIndices(iscol,&cout); c = cout;
2355:
2356: /* forward solve the lower triangular */
2357: tmps = tmp ;
2358: aa = a_a ;
2359: aj = a_j ;
2360: ad = a->diag;
2362: for (i = 0,row = 0; i< node_max; ++i){
2363: nsz = ns[i];
2364: aii = ai[row];
2365: v1 = aa + aii;
2366: vi = aj + aii;
2367: nz = ai[row+1]- ai[row];
2368:
2369: if (i < node_max-1) {
2370: /* Prefetch the indices for the next block */
2371: PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2372: /* Prefetch the data for the next block */
2373: PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2374: }
2376: switch (nsz){ /* Each loop in 'case' is unrolled */
2377: case 1 :
2378: sum1 = b[r[row]];
2379: for(j=0; j<nz-1; j+=2){
2380: i0 = vi[j];
2381: i1 = vi[j+1];
2382: tmp0 = tmps[i0];
2383: tmp1 = tmps[i1];
2384: sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2385: }
2386: if(j == nz-1){
2387: tmp0 = tmps[vi[j]];
2388: sum1 -= v1[j]*tmp0;
2389: }
2390: tmp[row++]=sum1;
2391: break;
2392: case 2:
2393: sum1 = b[r[row]];
2394: sum2 = b[r[row+1]];
2395: v2 = aa + ai[row+1];
2397: for(j=0; j<nz-1; j+=2){
2398: i0 = vi[j];
2399: i1 = vi[j+1];
2400: tmp0 = tmps[i0];
2401: tmp1 = tmps[i1];
2402: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2403: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2404: }
2405: if(j == nz-1){
2406: tmp0 = tmps[vi[j]];
2407: sum1 -= v1[j] *tmp0;
2408: sum2 -= v2[j] *tmp0;
2409: }
2410: sum2 -= v2[nz] * sum1;
2411: tmp[row ++]=sum1;
2412: tmp[row ++]=sum2;
2413: break;
2414: case 3:
2415: sum1 = b[r[row]];
2416: sum2 = b[r[row+1]];
2417: sum3 = b[r[row+2]];
2418: v2 = aa + ai[row+1];
2419: v3 = aa + ai[row+2];
2420:
2421: for (j=0; j<nz-1; j+=2){
2422: i0 = vi[j];
2423: i1 = vi[j+1];
2424: tmp0 = tmps[i0];
2425: tmp1 = tmps[i1];
2426: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2427: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2428: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2429: }
2430: if (j == nz-1){
2431: tmp0 = tmps[vi[j]];
2432: sum1 -= v1[j] *tmp0;
2433: sum2 -= v2[j] *tmp0;
2434: sum3 -= v3[j] *tmp0;
2435: }
2436: sum2 -= v2[nz] * sum1;
2437: sum3 -= v3[nz] * sum1;
2438: sum3 -= v3[nz+1] * sum2;
2439: tmp[row ++]=sum1;
2440: tmp[row ++]=sum2;
2441: tmp[row ++]=sum3;
2442: break;
2443:
2444: case 4:
2445: sum1 = b[r[row]];
2446: sum2 = b[r[row+1]];
2447: sum3 = b[r[row+2]];
2448: sum4 = b[r[row+3]];
2449: v2 = aa + ai[row+1];
2450: v3 = aa + ai[row+2];
2451: v4 = aa + ai[row+3];
2452:
2453: for (j=0; j<nz-1; j+=2){
2454: i0 = vi[j];
2455: i1 = vi[j+1];
2456: tmp0 = tmps[i0];
2457: tmp1 = tmps[i1];
2458: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2459: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2460: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2461: sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2462: }
2463: if (j == nz-1){
2464: tmp0 = tmps[vi[j]];
2465: sum1 -= v1[j] *tmp0;
2466: sum2 -= v2[j] *tmp0;
2467: sum3 -= v3[j] *tmp0;
2468: sum4 -= v4[j] *tmp0;
2469: }
2470: sum2 -= v2[nz] * sum1;
2471: sum3 -= v3[nz] * sum1;
2472: sum4 -= v4[nz] * sum1;
2473: sum3 -= v3[nz+1] * sum2;
2474: sum4 -= v4[nz+1] * sum2;
2475: sum4 -= v4[nz+2] * sum3;
2476:
2477: tmp[row ++]=sum1;
2478: tmp[row ++]=sum2;
2479: tmp[row ++]=sum3;
2480: tmp[row ++]=sum4;
2481: break;
2482: case 5:
2483: sum1 = b[r[row]];
2484: sum2 = b[r[row+1]];
2485: sum3 = b[r[row+2]];
2486: sum4 = b[r[row+3]];
2487: sum5 = b[r[row+4]];
2488: v2 = aa + ai[row+1];
2489: v3 = aa + ai[row+2];
2490: v4 = aa + ai[row+3];
2491: v5 = aa + ai[row+4];
2492:
2493: for (j=0; j<nz-1; j+=2){
2494: i0 = vi[j];
2495: i1 = vi[j+1];
2496: tmp0 = tmps[i0];
2497: tmp1 = tmps[i1];
2498: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2499: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2500: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2501: sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2502: sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2503: }
2504: if (j == nz-1){
2505: tmp0 = tmps[vi[j]];
2506: sum1 -= v1[j] *tmp0;
2507: sum2 -= v2[j] *tmp0;
2508: sum3 -= v3[j] *tmp0;
2509: sum4 -= v4[j] *tmp0;
2510: sum5 -= v5[j] *tmp0;
2511: }
2513: sum2 -= v2[nz] * sum1;
2514: sum3 -= v3[nz] * sum1;
2515: sum4 -= v4[nz] * sum1;
2516: sum5 -= v5[nz] * sum1;
2517: sum3 -= v3[nz+1] * sum2;
2518: sum4 -= v4[nz+1] * sum2;
2519: sum5 -= v5[nz+1] * sum2;
2520: sum4 -= v4[nz+2] * sum3;
2521: sum5 -= v5[nz+2] * sum3;
2522: sum5 -= v5[nz+3] * sum4;
2523:
2524: tmp[row ++]=sum1;
2525: tmp[row ++]=sum2;
2526: tmp[row ++]=sum3;
2527: tmp[row ++]=sum4;
2528: tmp[row ++]=sum5;
2529: break;
2530: default:
2531: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2532: }
2533: }
2534: /* backward solve the upper triangular */
2535: for (i=node_max -1 ,row = n-1 ; i>=0; i--){
2536: nsz = ns[i];
2537: aii = ad[row+1] + 1;
2538: v1 = aa + aii;
2539: vi = aj + aii;
2540: nz = ad[row]- ad[row+1] - 1;
2541:
2542: if (i > 0) {
2543: /* Prefetch the indices for the next block */
2544: PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2545: /* Prefetch the data for the next block */
2546: PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2547: }
2549: switch (nsz){ /* Each loop in 'case' is unrolled */
2550: case 1 :
2551: sum1 = tmp[row];
2553: for(j=0 ; j<nz-1; j+=2){
2554: i0 = vi[j];
2555: i1 = vi[j+1];
2556: tmp0 = tmps[i0];
2557: tmp1 = tmps[i1];
2558: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2559: }
2560: if (j == nz-1){
2561: tmp0 = tmps[vi[j]];
2562: sum1 -= v1[j]*tmp0;
2563: }
2564: x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2565: break;
2566: case 2 :
2567: sum1 = tmp[row];
2568: sum2 = tmp[row-1];
2569: v2 = aa + ad[row] + 1;
2570: for (j=0 ; j<nz-1; j+=2){
2571: i0 = vi[j];
2572: i1 = vi[j+1];
2573: tmp0 = tmps[i0];
2574: tmp1 = tmps[i1];
2575: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2576: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2577: }
2578: if (j == nz-1){
2579: tmp0 = tmps[vi[j]];
2580: sum1 -= v1[j]* tmp0;
2581: sum2 -= v2[j+1]* tmp0;
2582: }
2583:
2584: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2585: sum2 -= v2[0] * tmp0;
2586: x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2587: break;
2588: case 3 :
2589: sum1 = tmp[row];
2590: sum2 = tmp[row -1];
2591: sum3 = tmp[row -2];
2592: v2 = aa + ad[row] + 1;
2593: v3 = aa + ad[row -1] + 1;
2594: for (j=0 ; j<nz-1; j+=2){
2595: i0 = vi[j];
2596: i1 = vi[j+1];
2597: tmp0 = tmps[i0];
2598: tmp1 = tmps[i1];
2599: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2600: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2601: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2602: }
2603: if (j== nz-1){
2604: tmp0 = tmps[vi[j]];
2605: sum1 -= v1[j] * tmp0;
2606: sum2 -= v2[j+1] * tmp0;
2607: sum3 -= v3[j+2] * tmp0;
2608: }
2609: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2610: sum2 -= v2[0]* tmp0;
2611: sum3 -= v3[1] * tmp0;
2612: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2613: sum3 -= v3[0]* tmp0;
2614: x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2615:
2616: break;
2617: case 4 :
2618: sum1 = tmp[row];
2619: sum2 = tmp[row -1];
2620: sum3 = tmp[row -2];
2621: sum4 = tmp[row -3];
2622: v2 = aa + ad[row]+1;
2623: v3 = aa + ad[row -1]+1;
2624: v4 = aa + ad[row -2]+1;
2626: for (j=0 ; j<nz-1; j+=2){
2627: i0 = vi[j];
2628: i1 = vi[j+1];
2629: tmp0 = tmps[i0];
2630: tmp1 = tmps[i1];
2631: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2632: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2633: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2634: sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2635: }
2636: if (j== nz-1){
2637: tmp0 = tmps[vi[j]];
2638: sum1 -= v1[j] * tmp0;
2639: sum2 -= v2[j+1] * tmp0;
2640: sum3 -= v3[j+2] * tmp0;
2641: sum4 -= v4[j+3] * tmp0;
2642: }
2644: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2645: sum2 -= v2[0] * tmp0;
2646: sum3 -= v3[1] * tmp0;
2647: sum4 -= v4[2] * tmp0;
2648: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2649: sum3 -= v3[0] * tmp0;
2650: sum4 -= v4[1] * tmp0;
2651: tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2652: sum4 -= v4[0] * tmp0;
2653: x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2654: break;
2655: case 5 :
2656: sum1 = tmp[row];
2657: sum2 = tmp[row -1];
2658: sum3 = tmp[row -2];
2659: sum4 = tmp[row -3];
2660: sum5 = tmp[row -4];
2661: v2 = aa + ad[row]+1;
2662: v3 = aa + ad[row -1]+1;
2663: v4 = aa + ad[row -2]+1;
2664: v5 = aa + ad[row -3]+1;
2665: for (j=0 ; j<nz-1; j+=2){
2666: i0 = vi[j];
2667: i1 = vi[j+1];
2668: tmp0 = tmps[i0];
2669: tmp1 = tmps[i1];
2670: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2671: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2672: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2673: sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2674: sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2675: }
2676: if (j==nz-1){
2677: tmp0 = tmps[vi[j]];
2678: sum1 -= v1[j] * tmp0;
2679: sum2 -= v2[j+1] * tmp0;
2680: sum3 -= v3[j+2] * tmp0;
2681: sum4 -= v4[j+3] * tmp0;
2682: sum5 -= v5[j+4] * tmp0;
2683: }
2685: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2686: sum2 -= v2[0] * tmp0;
2687: sum3 -= v3[1] * tmp0;
2688: sum4 -= v4[2] * tmp0;
2689: sum5 -= v5[3] * tmp0;
2690: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2691: sum3 -= v3[0] * tmp0;
2692: sum4 -= v4[1] * tmp0;
2693: sum5 -= v5[2] * tmp0;
2694: tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2695: sum4 -= v4[0] * tmp0;
2696: sum5 -= v5[1] * tmp0;
2697: tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2698: sum5 -= v5[0] * tmp0;
2699: x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2700: break;
2701: default:
2702: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2703: }
2704: }
2705: ISRestoreIndices(isrow,&rout);
2706: ISRestoreIndices(iscol,&cout);
2707: VecRestoreArrayRead(bb,&b);
2708: VecRestoreArray(xx,&x);
2709: PetscLogFlops(2.0*a->nz - A->cmap->n);
2710: return(0);
2711: }
2714: /*
2715: Makes a longer coloring[] array and calls the usual code with that
2716: */
2719: PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2720: {
2721: Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data;
2722: PetscErrorCode ierr;
2723: PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2724: PetscInt *colorused,i;
2725: ISColoringValue *newcolor;
2728: PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);
2729: /* loop over inodes, marking a color for each column*/
2730: row = 0;
2731: for (i=0; i<m; i++){
2732: for (j=0; j<ns[i]; j++) {
2733: newcolor[row++] = coloring[i] + j*ncolors;
2734: }
2735: }
2737: /* eliminate unneeded colors */
2738: PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);
2739: PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));
2740: for (i=0; i<n; i++) {
2741: colorused[newcolor[i]] = 1;
2742: }
2744: for (i=1; i<5*ncolors; i++) {
2745: colorused[i] += colorused[i-1];
2746: }
2747: ncolors = colorused[5*ncolors-1];
2748: for (i=0; i<n; i++) {
2749: newcolor[i] = colorused[newcolor[i]]-1;
2750: }
2751: PetscFree(colorused);
2752: ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);
2753: PetscFree(coloring);
2754: return(0);
2755: }
2757: #include <../src/mat/blockinvert.h>
2761: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2762: {
2763: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2764: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
2765: MatScalar *ibdiag,*bdiag,work[25],*t;
2766: PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2767: const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5;
2768: const PetscScalar *xb, *b;
2769: PetscReal zeropivot = 1.0e-15, shift = 0.0;
2770: PetscErrorCode ierr;
2771: PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
2772: PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5];
2775: if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2776: if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2778: if (!a->inode.ibdiagvalid) {
2779: if (!a->inode.ibdiag) {
2780: /* calculate space needed for diagonal blocks */
2781: for (i=0; i<m; i++) {
2782: cnt += sizes[i]*sizes[i];
2783: }
2784: a->inode.bdiagsize = cnt;
2785: PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);
2786: }
2788: /* copy over the diagonal blocks and invert them */
2789: ibdiag = a->inode.ibdiag;
2790: bdiag = a->inode.bdiag;
2791: cnt = 0;
2792: for (i=0, row = 0; i<m; i++) {
2793: for (j=0; j<sizes[i]; j++) {
2794: for (k=0; k<sizes[i]; k++) {
2795: bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2796: }
2797: }
2798: PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));
2799:
2800: switch(sizes[i]) {
2801: case 1:
2802: /* Create matrix data structure */
2803: if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2804: ibdiag[cnt] = 1.0/ibdiag[cnt];
2805: break;
2806: case 2:
2807: PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift);
2808: break;
2809: case 3:
2810: PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift);
2811: break;
2812: case 4:
2813: PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift);
2814: break;
2815: case 5:
2816: PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);
2817: break;
2818: default:
2819: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2820: }
2821: cnt += sizes[i]*sizes[i];
2822: row += sizes[i];
2823: }
2824: a->inode.ibdiagvalid = PETSC_TRUE;
2825: }
2826: ibdiag = a->inode.ibdiag;
2827: bdiag = a->inode.bdiag;
2828: t = a->inode.ssor_work;
2830: VecGetArray(xx,&x);
2831: VecGetArrayRead(bb,&b);
2832: CHKMEMQ;
2833: /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2834: if (flag & SOR_ZERO_INITIAL_GUESS) {
2835: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
2837: for (i=0, row=0; i<m; i++) {
2838: sz = diag[row] - ii[row];
2839: v1 = a->a + ii[row];
2840: idx = a->j + ii[row];
2842: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2843: switch (sizes[i]){
2844: case 1:
2845:
2846: sum1 = b[row];
2847: for(n = 0; n<sz-1; n+=2) {
2848: i1 = idx[0];
2849: i2 = idx[1];
2850: idx += 2;
2851: tmp0 = x[i1];
2852: tmp1 = x[i2];
2853: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2854: }
2855:
2856: if (n == sz-1){
2857: tmp0 = x[*idx];
2858: sum1 -= *v1 * tmp0;
2859: }
2860: t[row] = sum1;
2861: x[row++] = sum1*(*ibdiag++);
2862: break;
2863: case 2:
2864: v2 = a->a + ii[row+1];
2865: sum1 = b[row];
2866: sum2 = b[row+1];
2867: for(n = 0; n<sz-1; n+=2) {
2868: i1 = idx[0];
2869: i2 = idx[1];
2870: idx += 2;
2871: tmp0 = x[i1];
2872: tmp1 = x[i2];
2873: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2874: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2875: }
2876:
2877: if (n == sz-1){
2878: tmp0 = x[*idx];
2879: sum1 -= v1[0] * tmp0;
2880: sum2 -= v2[0] * tmp0;
2881: }
2882: t[row] = sum1;
2883: t[row+1] = sum2;
2884: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2885: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2886: ibdiag += 4;
2887: break;
2888: case 3:
2889: v2 = a->a + ii[row+1];
2890: v3 = a->a + ii[row+2];
2891: sum1 = b[row];
2892: sum2 = b[row+1];
2893: sum3 = b[row+2];
2894: for(n = 0; n<sz-1; n+=2) {
2895: i1 = idx[0];
2896: i2 = idx[1];
2897: idx += 2;
2898: tmp0 = x[i1];
2899: tmp1 = x[i2];
2900: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2901: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2902: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2903: }
2904:
2905: if (n == sz-1){
2906: tmp0 = x[*idx];
2907: sum1 -= v1[0] * tmp0;
2908: sum2 -= v2[0] * tmp0;
2909: sum3 -= v3[0] * tmp0;
2910: }
2911: t[row] = sum1;
2912: t[row+1] = sum2;
2913: t[row+2] = sum3;
2914: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2915: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2916: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2917: ibdiag += 9;
2918: break;
2919: case 4:
2920: v2 = a->a + ii[row+1];
2921: v3 = a->a + ii[row+2];
2922: v4 = a->a + ii[row+3];
2923: sum1 = b[row];
2924: sum2 = b[row+1];
2925: sum3 = b[row+2];
2926: sum4 = b[row+3];
2927: for(n = 0; n<sz-1; n+=2) {
2928: i1 = idx[0];
2929: i2 = idx[1];
2930: idx += 2;
2931: tmp0 = x[i1];
2932: tmp1 = x[i2];
2933: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2934: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2935: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2936: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2937: }
2938:
2939: if (n == sz-1){
2940: tmp0 = x[*idx];
2941: sum1 -= v1[0] * tmp0;
2942: sum2 -= v2[0] * tmp0;
2943: sum3 -= v3[0] * tmp0;
2944: sum4 -= v4[0] * tmp0;
2945: }
2946: t[row] = sum1;
2947: t[row+1] = sum2;
2948: t[row+2] = sum3;
2949: t[row+3] = sum4;
2950: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2951: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2952: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2953: x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2954: ibdiag += 16;
2955: break;
2956: case 5:
2957: v2 = a->a + ii[row+1];
2958: v3 = a->a + ii[row+2];
2959: v4 = a->a + ii[row+3];
2960: v5 = a->a + ii[row+4];
2961: sum1 = b[row];
2962: sum2 = b[row+1];
2963: sum3 = b[row+2];
2964: sum4 = b[row+3];
2965: sum5 = b[row+4];
2966: for(n = 0; n<sz-1; n+=2) {
2967: i1 = idx[0];
2968: i2 = idx[1];
2969: idx += 2;
2970: tmp0 = x[i1];
2971: tmp1 = x[i2];
2972: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2973: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2974: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2975: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2976: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2977: }
2978:
2979: if (n == sz-1){
2980: tmp0 = x[*idx];
2981: sum1 -= v1[0] * tmp0;
2982: sum2 -= v2[0] * tmp0;
2983: sum3 -= v3[0] * tmp0;
2984: sum4 -= v4[0] * tmp0;
2985: sum5 -= v5[0] * tmp0;
2986: }
2987: t[row] = sum1;
2988: t[row+1] = sum2;
2989: t[row+2] = sum3;
2990: t[row+3] = sum4;
2991: t[row+4] = sum5;
2992: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2993: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2994: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2995: x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2996: x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2997: ibdiag += 25;
2998: break;
2999: default:
3000: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3001: }
3002: }
3004: xb = t;
3005: PetscLogFlops(a->nz);
3006: } else xb = b;
3007: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
3009: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3010: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3011: ibdiag -= sizes[i]*sizes[i];
3012: sz = ii[row+1] - diag[row] - 1;
3013: v1 = a->a + diag[row] + 1;
3014: idx = a->j + diag[row] + 1;
3016: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3017: switch (sizes[i]){
3018: case 1:
3019:
3020: sum1 = xb[row];
3021: for(n = 0; n<sz-1; n+=2) {
3022: i1 = idx[0];
3023: i2 = idx[1];
3024: idx += 2;
3025: tmp0 = x[i1];
3026: tmp1 = x[i2];
3027: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3028: }
3029:
3030: if (n == sz-1){
3031: tmp0 = x[*idx];
3032: sum1 -= *v1*tmp0;
3033: }
3034: x[row--] = sum1*(*ibdiag);
3035: break;
3037: case 2:
3038:
3039: sum1 = xb[row];
3040: sum2 = xb[row-1];
3041: /* note that sum1 is associated with the second of the two rows */
3042: v2 = a->a + diag[row-1] + 2;
3043: for(n = 0; n<sz-1; n+=2) {
3044: i1 = idx[0];
3045: i2 = idx[1];
3046: idx += 2;
3047: tmp0 = x[i1];
3048: tmp1 = x[i2];
3049: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3050: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3051: }
3052:
3053: if (n == sz-1){
3054: tmp0 = x[*idx];
3055: sum1 -= *v1*tmp0;
3056: sum2 -= *v2*tmp0;
3057: }
3058: x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3059: x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3060: break;
3061: case 3:
3062:
3063: sum1 = xb[row];
3064: sum2 = xb[row-1];
3065: sum3 = xb[row-2];
3066: v2 = a->a + diag[row-1] + 2;
3067: v3 = a->a + diag[row-2] + 3;
3068: for(n = 0; n<sz-1; n+=2) {
3069: i1 = idx[0];
3070: i2 = idx[1];
3071: idx += 2;
3072: tmp0 = x[i1];
3073: tmp1 = x[i2];
3074: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3075: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3076: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3077: }
3078:
3079: if (n == sz-1){
3080: tmp0 = x[*idx];
3081: sum1 -= *v1*tmp0;
3082: sum2 -= *v2*tmp0;
3083: sum3 -= *v3*tmp0;
3084: }
3085: x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3086: x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3087: x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3088: break;
3089: case 4:
3090:
3091: sum1 = xb[row];
3092: sum2 = xb[row-1];
3093: sum3 = xb[row-2];
3094: sum4 = xb[row-3];
3095: v2 = a->a + diag[row-1] + 2;
3096: v3 = a->a + diag[row-2] + 3;
3097: v4 = a->a + diag[row-3] + 4;
3098: for(n = 0; n<sz-1; n+=2) {
3099: i1 = idx[0];
3100: i2 = idx[1];
3101: idx += 2;
3102: tmp0 = x[i1];
3103: tmp1 = x[i2];
3104: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3105: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3106: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3107: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3108: }
3109:
3110: if (n == sz-1){
3111: tmp0 = x[*idx];
3112: sum1 -= *v1*tmp0;
3113: sum2 -= *v2*tmp0;
3114: sum3 -= *v3*tmp0;
3115: sum4 -= *v4*tmp0;
3116: }
3117: x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3118: x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3119: x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3120: x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3121: break;
3122: case 5:
3123:
3124: sum1 = xb[row];
3125: sum2 = xb[row-1];
3126: sum3 = xb[row-2];
3127: sum4 = xb[row-3];
3128: sum5 = xb[row-4];
3129: v2 = a->a + diag[row-1] + 2;
3130: v3 = a->a + diag[row-2] + 3;
3131: v4 = a->a + diag[row-3] + 4;
3132: v5 = a->a + diag[row-4] + 5;
3133: for(n = 0; n<sz-1; n+=2) {
3134: i1 = idx[0];
3135: i2 = idx[1];
3136: idx += 2;
3137: tmp0 = x[i1];
3138: tmp1 = x[i2];
3139: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3140: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3141: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3142: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3143: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3144: }
3145:
3146: if (n == sz-1){
3147: tmp0 = x[*idx];
3148: sum1 -= *v1*tmp0;
3149: sum2 -= *v2*tmp0;
3150: sum3 -= *v3*tmp0;
3151: sum4 -= *v4*tmp0;
3152: sum5 -= *v5*tmp0;
3153: }
3154: x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3155: x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3156: x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3157: x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3158: x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3159: break;
3160: default:
3161: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3162: }
3163: }
3165: PetscLogFlops(a->nz);
3166: }
3167: its--;
3168: }
3169: while (its--) {
3171: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
3172: ibdiag = a->inode.ibdiag;
3174: for (i=0, row=0; i<m; i++) {
3175: sz = ii[row + 1] - ii[row];
3176: v1 = a->a + ii[row];
3177: idx = a->j + ii[row];
3179: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3180: switch (sizes[i]){
3181: case 1:
3182:
3183: sum1 = b[row];
3184: for(n = 0; n<sz-1; n+=2) {
3185: i1 = idx[0];
3186: i2 = idx[1];
3187: idx += 2;
3188: tmp0 = x[i1];
3189: tmp1 = x[i2];
3190: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3191: }
3192:
3193: if (n == sz-1){
3194: tmp0 = x[*idx];
3195: sum1 -= *v1 * tmp0;
3196: }
3198: /* in MatSOR_SeqAIJ this line would be
3199: *
3200: * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3201: *
3202: * but omega == 1, so this becomes
3203: *
3204: * x[row] = (sum1+(*bdiag++)*x[row])*(*ibdiag++);
3205: *
3206: * but bdiag and ibdiag cancel each other, so we can change this
3207: * to adding sum1*(*ibdiag++). We can skip bdiag for the larger
3208: * block sizes as well
3209: */
3210: x[row++] += sum1*(*ibdiag++);
3211: break;
3212: case 2:
3213: v2 = a->a + ii[row+1];
3214: sum1 = b[row];
3215: sum2 = b[row+1];
3216: for(n = 0; n<sz-1; n+=2) {
3217: i1 = idx[0];
3218: i2 = idx[1];
3219: idx += 2;
3220: tmp0 = x[i1];
3221: tmp1 = x[i2];
3222: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3223: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3224: }
3225:
3226: if (n == sz-1){
3227: tmp0 = x[*idx];
3228: sum1 -= v1[0] * tmp0;
3229: sum2 -= v2[0] * tmp0;
3230: }
3231: x[row++] += sum1*ibdiag[0] + sum2*ibdiag[2];
3232: x[row++] += sum1*ibdiag[1] + sum2*ibdiag[3];
3233: ibdiag += 4;
3234: break;
3235: case 3:
3236: v2 = a->a + ii[row+1];
3237: v3 = a->a + ii[row+2];
3238: sum1 = b[row];
3239: sum2 = b[row+1];
3240: sum3 = b[row+2];
3241: for(n = 0; n<sz-1; n+=2) {
3242: i1 = idx[0];
3243: i2 = idx[1];
3244: idx += 2;
3245: tmp0 = x[i1];
3246: tmp1 = x[i2];
3247: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3248: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3249: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3250: }
3251:
3252: if (n == sz-1){
3253: tmp0 = x[*idx];
3254: sum1 -= v1[0] * tmp0;
3255: sum2 -= v2[0] * tmp0;
3256: sum3 -= v3[0] * tmp0;
3257: }
3258: x[row++] += sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3259: x[row++] += sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3260: x[row++] += sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3261: ibdiag += 9;
3262: break;
3263: case 4:
3264: v2 = a->a + ii[row+1];
3265: v3 = a->a + ii[row+2];
3266: v4 = a->a + ii[row+3];
3267: sum1 = b[row];
3268: sum2 = b[row+1];
3269: sum3 = b[row+2];
3270: sum4 = b[row+3];
3271: for(n = 0; n<sz-1; n+=2) {
3272: i1 = idx[0];
3273: i2 = idx[1];
3274: idx += 2;
3275: tmp0 = x[i1];
3276: tmp1 = x[i2];
3277: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3278: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3279: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3280: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3281: }
3282:
3283: if (n == sz-1){
3284: tmp0 = x[*idx];
3285: sum1 -= v1[0] * tmp0;
3286: sum2 -= v2[0] * tmp0;
3287: sum3 -= v3[0] * tmp0;
3288: sum4 -= v4[0] * tmp0;
3289: }
3290: x[row++] += sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3291: x[row++] += sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3292: x[row++] += sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3293: x[row++] += sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3294: ibdiag += 16;
3295: break;
3296: case 5:
3297: v2 = a->a + ii[row+1];
3298: v3 = a->a + ii[row+2];
3299: v4 = a->a + ii[row+3];
3300: v5 = a->a + ii[row+4];
3301: sum1 = b[row];
3302: sum2 = b[row+1];
3303: sum3 = b[row+2];
3304: sum4 = b[row+3];
3305: sum5 = b[row+4];
3306: for(n = 0; n<sz-1; n+=2) {
3307: i1 = idx[0];
3308: i2 = idx[1];
3309: idx += 2;
3310: tmp0 = x[i1];
3311: tmp1 = x[i2];
3312: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3313: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3314: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3315: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3316: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3317: }
3318:
3319: if (n == sz-1){
3320: tmp0 = x[*idx];
3321: sum1 -= v1[0] * tmp0;
3322: sum2 -= v2[0] * tmp0;
3323: sum3 -= v3[0] * tmp0;
3324: sum4 -= v4[0] * tmp0;
3325: sum5 -= v5[0] * tmp0;
3326: }
3327: x[row++] += sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3328: x[row++] += sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3329: x[row++] += sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3330: x[row++] += sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3331: x[row++] += sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3332: ibdiag += 25;
3333: break;
3334: default:
3335: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3336: }
3337: }
3339: PetscLogFlops(2.0*a->nz);
3340: }
3341: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
3343: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3344: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3345: ibdiag -= sizes[i]*sizes[i];
3346: sz = ii[row+1] - ii[row];
3347: v1 = a->a + ii[row];
3348: idx = a->j + ii[row];
3350: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3351: switch (sizes[i]){
3352: case 1:
3353:
3354: sum1 = b[row];
3355: for(n = 0; n<sz-1; n+=2) {
3356: i1 = idx[0];
3357: i2 = idx[1];
3358: idx += 2;
3359: tmp0 = x[i1];
3360: tmp1 = x[i2];
3361: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3362: }
3363:
3364: if (n == sz-1){
3365: tmp0 = x[*idx];
3366: sum1 -= *v1*tmp0;
3367: }
3368: x[row--] += sum1*(*ibdiag);
3369: break;
3371: case 2:
3372:
3373: sum1 = b[row];
3374: sum2 = b[row-1];
3375: /* note that sum1 is associated with the second of the two rows */
3376: v2 = a->a + ii[row - 1];
3377: for(n = 0; n<sz-1; n+=2) {
3378: i1 = idx[0];
3379: i2 = idx[1];
3380: idx += 2;
3381: tmp0 = x[i1];
3382: tmp1 = x[i2];
3383: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3384: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3385: }
3386:
3387: if (n == sz-1){
3388: tmp0 = x[*idx];
3389: sum1 -= *v1*tmp0;
3390: sum2 -= *v2*tmp0;
3391: }
3392: x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3393: x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3394: break;
3395: case 3:
3396:
3397: sum1 = b[row];
3398: sum2 = b[row-1];
3399: sum3 = b[row-2];
3400: v2 = a->a + ii[row-1];
3401: v3 = a->a + ii[row-2];
3402: for(n = 0; n<sz-1; n+=2) {
3403: i1 = idx[0];
3404: i2 = idx[1];
3405: idx += 2;
3406: tmp0 = x[i1];
3407: tmp1 = x[i2];
3408: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3409: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3410: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3411: }
3412:
3413: if (n == sz-1){
3414: tmp0 = x[*idx];
3415: sum1 -= *v1*tmp0;
3416: sum2 -= *v2*tmp0;
3417: sum3 -= *v3*tmp0;
3418: }
3419: x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3420: x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3421: x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3422: break;
3423: case 4:
3424:
3425: sum1 = b[row];
3426: sum2 = b[row-1];
3427: sum3 = b[row-2];
3428: sum4 = b[row-3];
3429: v2 = a->a + ii[row-1];
3430: v3 = a->a + ii[row-2];
3431: v4 = a->a + ii[row-3];
3432: for(n = 0; n<sz-1; n+=2) {
3433: i1 = idx[0];
3434: i2 = idx[1];
3435: idx += 2;
3436: tmp0 = x[i1];
3437: tmp1 = x[i2];
3438: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3439: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3440: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3441: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3442: }
3443:
3444: if (n == sz-1){
3445: tmp0 = x[*idx];
3446: sum1 -= *v1*tmp0;
3447: sum2 -= *v2*tmp0;
3448: sum3 -= *v3*tmp0;
3449: sum4 -= *v4*tmp0;
3450: }
3451: x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3452: x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3453: x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3454: x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3455: break;
3456: case 5:
3457:
3458: sum1 = b[row];
3459: sum2 = b[row-1];
3460: sum3 = b[row-2];
3461: sum4 = b[row-3];
3462: sum5 = b[row-4];
3463: v2 = a->a + ii[row-1];
3464: v3 = a->a + ii[row-2];
3465: v4 = a->a + ii[row-3];
3466: v5 = a->a + ii[row-4];
3467: for(n = 0; n<sz-1; n+=2) {
3468: i1 = idx[0];
3469: i2 = idx[1];
3470: idx += 2;
3471: tmp0 = x[i1];
3472: tmp1 = x[i2];
3473: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3474: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3475: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3476: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3477: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3478: }
3479:
3480: if (n == sz-1){
3481: tmp0 = x[*idx];
3482: sum1 -= *v1*tmp0;
3483: sum2 -= *v2*tmp0;
3484: sum3 -= *v3*tmp0;
3485: sum4 -= *v4*tmp0;
3486: sum5 -= *v5*tmp0;
3487: }
3488: x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3489: x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3490: x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3491: x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3492: x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3493: break;
3494: default:
3495: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3496: }
3497: }
3499: PetscLogFlops(2.0*a->nz);
3500: }
3501: }
3502: if (flag & SOR_EISENSTAT) {
3503: /*
3504: Apply (U + D)^-1 where D is now the block diagonal
3505: */
3506: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3507: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3508: ibdiag -= sizes[i]*sizes[i];
3509: sz = ii[row+1] - diag[row] - 1;
3510: v1 = a->a + diag[row] + 1;
3511: idx = a->j + diag[row] + 1;
3512: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3513: switch (sizes[i]){
3514: case 1:
3515:
3516: sum1 = b[row];
3517: for(n = 0; n<sz-1; n+=2) {
3518: i1 = idx[0];
3519: i2 = idx[1];
3520: idx += 2;
3521: tmp0 = x[i1];
3522: tmp1 = x[i2];
3523: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3524: }
3525:
3526: if (n == sz-1){
3527: tmp0 = x[*idx];
3528: sum1 -= *v1*tmp0;
3529: }
3530: x[row] = sum1*(*ibdiag);row--;
3531: break;
3533: case 2:
3534:
3535: sum1 = b[row];
3536: sum2 = b[row-1];
3537: /* note that sum1 is associated with the second of the two rows */
3538: v2 = a->a + diag[row-1] + 2;
3539: for(n = 0; n<sz-1; n+=2) {
3540: i1 = idx[0];
3541: i2 = idx[1];
3542: idx += 2;
3543: tmp0 = x[i1];
3544: tmp1 = x[i2];
3545: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3546: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3547: }
3548:
3549: if (n == sz-1){
3550: tmp0 = x[*idx];
3551: sum1 -= *v1*tmp0;
3552: sum2 -= *v2*tmp0;
3553: }
3554: x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
3555: x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3556: row -= 2;
3557: break;
3558: case 3:
3559:
3560: sum1 = b[row];
3561: sum2 = b[row-1];
3562: sum3 = b[row-2];
3563: v2 = a->a + diag[row-1] + 2;
3564: v3 = a->a + diag[row-2] + 3;
3565: for(n = 0; n<sz-1; n+=2) {
3566: i1 = idx[0];
3567: i2 = idx[1];
3568: idx += 2;
3569: tmp0 = x[i1];
3570: tmp1 = x[i2];
3571: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3572: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3573: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3574: }
3575:
3576: if (n == sz-1){
3577: tmp0 = x[*idx];
3578: sum1 -= *v1*tmp0;
3579: sum2 -= *v2*tmp0;
3580: sum3 -= *v3*tmp0;
3581: }
3582: x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3583: x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3584: x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3585: row -= 3;
3586: break;
3587: case 4:
3588:
3589: sum1 = b[row];
3590: sum2 = b[row-1];
3591: sum3 = b[row-2];
3592: sum4 = b[row-3];
3593: v2 = a->a + diag[row-1] + 2;
3594: v3 = a->a + diag[row-2] + 3;
3595: v4 = a->a + diag[row-3] + 4;
3596: for(n = 0; n<sz-1; n+=2) {
3597: i1 = idx[0];
3598: i2 = idx[1];
3599: idx += 2;
3600: tmp0 = x[i1];
3601: tmp1 = x[i2];
3602: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3603: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3604: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3605: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3606: }
3607:
3608: if (n == sz-1){
3609: tmp0 = x[*idx];
3610: sum1 -= *v1*tmp0;
3611: sum2 -= *v2*tmp0;
3612: sum3 -= *v3*tmp0;
3613: sum4 -= *v4*tmp0;
3614: }
3615: x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3616: x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3617: x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3618: x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3619: row -= 4;
3620: break;
3621: case 5:
3622:
3623: sum1 = b[row];
3624: sum2 = b[row-1];
3625: sum3 = b[row-2];
3626: sum4 = b[row-3];
3627: sum5 = b[row-4];
3628: v2 = a->a + diag[row-1] + 2;
3629: v3 = a->a + diag[row-2] + 3;
3630: v4 = a->a + diag[row-3] + 4;
3631: v5 = a->a + diag[row-4] + 5;
3632: for(n = 0; n<sz-1; n+=2) {
3633: i1 = idx[0];
3634: i2 = idx[1];
3635: idx += 2;
3636: tmp0 = x[i1];
3637: tmp1 = x[i2];
3638: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3639: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3640: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3641: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3642: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3643: }
3644:
3645: if (n == sz-1){
3646: tmp0 = x[*idx];
3647: sum1 -= *v1*tmp0;
3648: sum2 -= *v2*tmp0;
3649: sum3 -= *v3*tmp0;
3650: sum4 -= *v4*tmp0;
3651: sum5 -= *v5*tmp0;
3652: }
3653: x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3654: x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3655: x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3656: x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3657: x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3658: row -= 5;
3659: break;
3660: default:
3661: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3662: }
3663: }
3664: PetscLogFlops(a->nz);
3666: /*
3667: t = b - D x where D is the block diagonal
3668: */
3669: cnt = 0;
3670: for (i=0, row=0; i<m; i++) {
3671: switch (sizes[i]){
3672: case 1:
3673: t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3674: break;
3675: case 2:
3676: x1 = x[row]; x2 = x[row+1];
3677: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3678: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3679: t[row] = b[row] - tmp1;
3680: t[row+1] = b[row+1] - tmp2; row += 2;
3681: cnt += 4;
3682: break;
3683: case 3:
3684: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2];
3685: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3686: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3687: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3688: t[row] = b[row] - tmp1;
3689: t[row+1] = b[row+1] - tmp2;
3690: t[row+2] = b[row+2] - tmp3; row += 3;
3691: cnt += 9;
3692: break;
3693: case 4:
3694: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3695: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3696: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3697: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3698: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3699: t[row] = b[row] - tmp1;
3700: t[row+1] = b[row+1] - tmp2;
3701: t[row+2] = b[row+2] - tmp3;
3702: t[row+3] = b[row+3] - tmp4; row += 4;
3703: cnt += 16;
3704: break;
3705: case 5:
3706: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3707: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3708: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3709: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3710: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3711: tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3712: t[row] = b[row] - tmp1;
3713: t[row+1] = b[row+1] - tmp2;
3714: t[row+2] = b[row+2] - tmp3;
3715: t[row+3] = b[row+3] - tmp4;
3716: t[row+4] = b[row+4] - tmp5;row += 5;
3717: cnt += 25;
3718: break;
3719: default:
3720: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3721: }
3722: }
3723: PetscLogFlops(m);
3727: /*
3728: Apply (L + D)^-1 where D is the block diagonal
3729: */
3730: for (i=0, row=0; i<m; i++) {
3731: sz = diag[row] - ii[row];
3732: v1 = a->a + ii[row];
3733: idx = a->j + ii[row];
3734: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3735: switch (sizes[i]){
3736: case 1:
3737:
3738: sum1 = t[row];
3739: for(n = 0; n<sz-1; n+=2) {
3740: i1 = idx[0];
3741: i2 = idx[1];
3742: idx += 2;
3743: tmp0 = t[i1];
3744: tmp1 = t[i2];
3745: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3746: }
3747:
3748: if (n == sz-1){
3749: tmp0 = t[*idx];
3750: sum1 -= *v1 * tmp0;
3751: }
3752: x[row] += t[row] = sum1*(*ibdiag++); row++;
3753: break;
3754: case 2:
3755: v2 = a->a + ii[row+1];
3756: sum1 = t[row];
3757: sum2 = t[row+1];
3758: for(n = 0; n<sz-1; n+=2) {
3759: i1 = idx[0];
3760: i2 = idx[1];
3761: idx += 2;
3762: tmp0 = t[i1];
3763: tmp1 = t[i2];
3764: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3765: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3766: }
3767:
3768: if (n == sz-1){
3769: tmp0 = t[*idx];
3770: sum1 -= v1[0] * tmp0;
3771: sum2 -= v2[0] * tmp0;
3772: }
3773: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3774: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3775: ibdiag += 4; row += 2;
3776: break;
3777: case 3:
3778: v2 = a->a + ii[row+1];
3779: v3 = a->a + ii[row+2];
3780: sum1 = t[row];
3781: sum2 = t[row+1];
3782: sum3 = t[row+2];
3783: for(n = 0; n<sz-1; n+=2) {
3784: i1 = idx[0];
3785: i2 = idx[1];
3786: idx += 2;
3787: tmp0 = t[i1];
3788: tmp1 = t[i2];
3789: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3790: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3791: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3792: }
3793:
3794: if (n == sz-1){
3795: tmp0 = t[*idx];
3796: sum1 -= v1[0] * tmp0;
3797: sum2 -= v2[0] * tmp0;
3798: sum3 -= v3[0] * tmp0;
3799: }
3800: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3801: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3802: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3803: ibdiag += 9; row += 3;
3804: break;
3805: case 4:
3806: v2 = a->a + ii[row+1];
3807: v3 = a->a + ii[row+2];
3808: v4 = a->a + ii[row+3];
3809: sum1 = t[row];
3810: sum2 = t[row+1];
3811: sum3 = t[row+2];
3812: sum4 = t[row+3];
3813: for(n = 0; n<sz-1; n+=2) {
3814: i1 = idx[0];
3815: i2 = idx[1];
3816: idx += 2;
3817: tmp0 = t[i1];
3818: tmp1 = t[i2];
3819: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3820: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3821: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3822: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3823: }
3824:
3825: if (n == sz-1){
3826: tmp0 = t[*idx];
3827: sum1 -= v1[0] * tmp0;
3828: sum2 -= v2[0] * tmp0;
3829: sum3 -= v3[0] * tmp0;
3830: sum4 -= v4[0] * tmp0;
3831: }
3832: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3833: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3834: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3835: x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3836: ibdiag += 16; row += 4;
3837: break;
3838: case 5:
3839: v2 = a->a + ii[row+1];
3840: v3 = a->a + ii[row+2];
3841: v4 = a->a + ii[row+3];
3842: v5 = a->a + ii[row+4];
3843: sum1 = t[row];
3844: sum2 = t[row+1];
3845: sum3 = t[row+2];
3846: sum4 = t[row+3];
3847: sum5 = t[row+4];
3848: for(n = 0; n<sz-1; n+=2) {
3849: i1 = idx[0];
3850: i2 = idx[1];
3851: idx += 2;
3852: tmp0 = t[i1];
3853: tmp1 = t[i2];
3854: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3855: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3856: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3857: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3858: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3859: }
3860:
3861: if (n == sz-1){
3862: tmp0 = t[*idx];
3863: sum1 -= v1[0] * tmp0;
3864: sum2 -= v2[0] * tmp0;
3865: sum3 -= v3[0] * tmp0;
3866: sum4 -= v4[0] * tmp0;
3867: sum5 -= v5[0] * tmp0;
3868: }
3869: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3870: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3871: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3872: x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3873: x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3874: ibdiag += 25; row += 5;
3875: break;
3876: default:
3877: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3878: }
3879: }
3880: PetscLogFlops(a->nz);
3881: }
3882: VecRestoreArray(xx,&x);
3883: VecRestoreArrayRead(bb,&b);
3884: CHKMEMQ; return(0);
3885: }
3889: PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3890: {
3891: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3892: PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3893: const MatScalar *bdiag = a->inode.bdiag;
3894: const PetscScalar *b;
3895: PetscErrorCode ierr;
3896: PetscInt m = a->inode.node_count,cnt = 0,i,row;
3897: const PetscInt *sizes = a->inode.size;
3900: VecGetArray(xx,&x);
3901: VecGetArrayRead(bb,&b);
3902: cnt = 0;
3903: for (i=0, row=0; i<m; i++) {
3904: switch (sizes[i]){
3905: case 1:
3906: x[row] = b[row]*bdiag[cnt++];row++;
3907: break;
3908: case 2:
3909: x1 = b[row]; x2 = b[row+1];
3910: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3911: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3912: x[row++] = tmp1;
3913: x[row++] = tmp2;
3914: cnt += 4;
3915: break;
3916: case 3:
3917: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2];
3918: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3919: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3920: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3921: x[row++] = tmp1;
3922: x[row++] = tmp2;
3923: x[row++] = tmp3;
3924: cnt += 9;
3925: break;
3926: case 4:
3927: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
3928: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3929: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3930: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3931: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3932: x[row++] = tmp1;
3933: x[row++] = tmp2;
3934: x[row++] = tmp3;
3935: x[row++] = tmp4;
3936: cnt += 16;
3937: break;
3938: case 5:
3939: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
3940: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3941: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3942: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3943: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3944: tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3945: x[row++] = tmp1;
3946: x[row++] = tmp2;
3947: x[row++] = tmp3;
3948: x[row++] = tmp4;
3949: x[row++] = tmp5;
3950: cnt += 25;
3951: break;
3952: default:
3953: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3954: }
3955: }
3956: PetscLogFlops(2*cnt);
3957: VecRestoreArray(xx,&x);
3958: VecRestoreArrayRead(bb,&b);
3959: return(0);
3960: }
3962: /*
3963: samestructure indicates that the matrix has not changed its nonzero structure so we
3964: do not need to recompute the inodes
3965: */
3968: PetscErrorCode Mat_CheckInode(Mat A,PetscBool samestructure)
3969: {
3970: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3972: PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
3973: PetscBool flag;
3976: if (!a->inode.use) return(0);
3977: if (a->inode.checked && samestructure) return(0);
3980: m = A->rmap->n;
3981: if (a->inode.size) {ns = a->inode.size;}
3982: else {PetscMalloc((m+1)*sizeof(PetscInt),&ns);}
3984: i = 0;
3985: node_count = 0;
3986: idx = a->j;
3987: ii = a->i;
3988: while (i < m){ /* For each row */
3989: nzx = ii[i+1] - ii[i]; /* Number of nonzeros */
3990: /* Limits the number of elements in a node to 'a->inode.limit' */
3991: for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3992: nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */
3993: if (nzy != nzx) break;
3994: idy += nzx; /* Same nonzero pattern */
3995: PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);
3996: if (!flag) break;
3997: }
3998: ns[node_count++] = blk_size;
3999: idx += blk_size*nzx;
4000: i = j;
4001: }
4002: /* If not enough inodes found,, do not use inode version of the routines */
4003: if (!m || node_count > .8*m) {
4004: PetscFree(ns);
4005: a->inode.node_count = 0;
4006: a->inode.size = PETSC_NULL;
4007: a->inode.use = PETSC_FALSE;
4008: A->ops->mult = MatMult_SeqAIJ;
4009: A->ops->sor = MatSOR_SeqAIJ;
4010: A->ops->multadd = MatMultAdd_SeqAIJ;
4011: A->ops->getrowij = MatGetRowIJ_SeqAIJ;
4012: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ;
4013: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ;
4014: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ;
4015: A->ops->coloringpatch = 0;
4016: A->ops->multdiagonalblock = 0;
4017: PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4018: } else {
4019: if (!A->factortype) {
4020: A->ops->mult = MatMult_SeqAIJ_Inode;
4021: A->ops->sor = MatSOR_SeqAIJ_Inode;
4022: A->ops->multadd = MatMultAdd_SeqAIJ_Inode;
4023: A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4024: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4025: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4026: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4027: A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4028: A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4029: } else {
4030: A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4031: }
4032: a->inode.node_count = node_count;
4033: a->inode.size = ns;
4034: PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4035: }
4036: a->inode.checked = PETSC_TRUE;
4037: return(0);
4038: }
4042: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4043: {
4044: Mat B=*C;
4045: Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4047: PetscInt m=A->rmap->n;
4051: c->inode.use = a->inode.use;
4052: c->inode.limit = a->inode.limit;
4053: c->inode.max_limit = a->inode.max_limit;
4054: if (a->inode.size){
4055: PetscMalloc((m+1)*sizeof(PetscInt),&c->inode.size);
4056: c->inode.node_count = a->inode.node_count;
4057: PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));
4058: /* note the table of functions below should match that in Mat_CheckInode() */
4059: if (!B->factortype) {
4060: B->ops->mult = MatMult_SeqAIJ_Inode;
4061: B->ops->sor = MatSOR_SeqAIJ_Inode;
4062: B->ops->multadd = MatMultAdd_SeqAIJ_Inode;
4063: B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4064: B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4065: B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4066: B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4067: B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4068: B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4069: } else {
4070: B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4071: }
4072: } else {
4073: c->inode.size = 0;
4074: c->inode.node_count = 0;
4075: }
4076: c->inode.ibdiagvalid = PETSC_FALSE;
4077: c->inode.ibdiag = 0;
4078: c->inode.bdiag = 0;
4079: return(0);
4080: }
4084: PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,PetscInt *ai,PetscInt *aj,PetscInt *adiag,PetscInt row)
4085: {
4086: PetscInt k, *vi;
4088: vi = aj + ai[row];
4089: for(k=0;k<nzl;k++) cols[k] = vi[k];
4090: vi = aj + adiag[row];
4091: cols[nzl] = vi[0];
4092: vi = aj + adiag[row+1]+1;
4093: for(k=0;k<nzu;k++) cols[nzl+1+k] = vi[k];
4094: return(0);
4095: }
4096: /*
4097: Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix.
4098: Modified from Mat_CheckInode().
4100: Input Parameters:
4101: + Mat A - ILU or LU matrix factor
4102: - samestructure - TRUE indicates that the matrix has not changed its nonzero structure so we
4103: do not need to recompute the inodes
4104: */
4107: PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscBool samestructure)
4108: {
4109: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4111: PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4112: PetscInt *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag;
4113: PetscBool flag;
4116: if (!a->inode.use) return(0);
4117: if (a->inode.checked && samestructure) return(0);
4119: m = A->rmap->n;
4120: if (a->inode.size) {ns = a->inode.size;}
4121: else {PetscMalloc((m+1)*sizeof(PetscInt),&ns);}
4123: i = 0;
4124: node_count = 0;
4125: while (i < m){ /* For each row */
4126: nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */
4127: nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4128: nzx = nzl1 + nzu1 + 1;
4129: PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);
4130: MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4131:
4132: /* Limits the number of elements in a node to 'a->inode.limit' */
4133: for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4134: nzl2 = ai[j+1] - ai[j];
4135: nzu2 = adiag[j] - adiag[j+1] - 1;
4136: nzy = nzl2 + nzu2 + 1;
4137: if( nzy != nzx) break;
4138: PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);
4139: MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
4140: PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);
4141: if (!flag) {PetscFree(cols2);break;}
4142: PetscFree(cols2);
4143: }
4144: ns[node_count++] = blk_size;
4145: PetscFree(cols1);
4146: i = j;
4147: }
4148: /* If not enough inodes found,, do not use inode version of the routines */
4149: if (!m || node_count > .8*m) {
4150: PetscFree(ns);
4151: a->inode.node_count = 0;
4152: a->inode.size = PETSC_NULL;
4153: a->inode.use = PETSC_FALSE;
4154: PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4155: } else {
4156: A->ops->mult = 0;
4157: A->ops->sor = 0;
4158: A->ops->multadd = 0;
4159: A->ops->getrowij = 0;
4160: A->ops->restorerowij = 0;
4161: A->ops->getcolumnij = 0;
4162: A->ops->restorecolumnij = 0;
4163: A->ops->coloringpatch = 0;
4164: A->ops->multdiagonalblock = 0;
4165: A->ops->solve = MatSolve_SeqAIJ_Inode;
4166: a->inode.node_count = node_count;
4167: a->inode.size = ns;
4168: PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4169: }
4170: a->inode.checked = PETSC_TRUE;
4171: return(0);
4172: }
4176: PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4177: {
4178: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4181: a->inode.ibdiagvalid = PETSC_FALSE;
4182: return(0);
4183: }
4185: /*
4186: This is really ugly. if inodes are used this replaces the
4187: permutations with ones that correspond to rows/cols of the matrix
4188: rather then inode blocks
4189: */
4192: PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4193: {
4197: PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));
4198: return(0);
4199: }
4201: EXTERN_C_BEGIN
4204: PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4205: {
4206: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4208: PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4209: const PetscInt *ridx,*cidx;
4210: PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx;
4211: PetscInt nslim_col,*ns_col;
4212: IS ris = *rperm,cis = *cperm;
4215: if (!a->inode.size) return(0); /* no inodes so return */
4216: if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */
4218: Mat_CreateColInode(A,&nslim_col,&ns_col);
4219: PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);
4220: PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);
4222: ISGetIndices(ris,&ridx);
4223: ISGetIndices(cis,&cidx);
4225: /* Form the inode structure for the rows of permuted matric using inv perm*/
4226: for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4228: /* Construct the permutations for rows*/
4229: for (i=0,row = 0; i<nslim_row; ++i){
4230: indx = ridx[i];
4231: start_val = tns[indx];
4232: end_val = tns[indx + 1];
4233: for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4234: }
4236: /* Form the inode structure for the columns of permuted matrix using inv perm*/
4237: for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4239: /* Construct permutations for columns */
4240: for (i=0,col=0; i<nslim_col; ++i){
4241: indx = cidx[i];
4242: start_val = tns[indx];
4243: end_val = tns[indx + 1];
4244: for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4245: }
4247: ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);
4248: ISSetPermutation(*rperm);
4249: ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);
4250: ISSetPermutation(*cperm);
4251:
4252: ISRestoreIndices(ris,&ridx);
4253: ISRestoreIndices(cis,&cidx);
4255: PetscFree(ns_col);
4256: PetscFree2(permr,permc);
4257: ISDestroy(&cis);
4258: ISDestroy(&ris);
4259: PetscFree(tns);
4260: return(0);
4261: }
4262: EXTERN_C_END
4266: /*@C
4267: MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4269: Not Collective
4271: Input Parameter:
4272: . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4274: Output Parameter:
4275: + node_count - no of inodes present in the matrix.
4276: . sizes - an array of size node_count,with sizes of each inode.
4277: - limit - the max size used to generate the inodes.
4279: Level: advanced
4281: Notes: This routine returns some internal storage information
4282: of the matrix, it is intended to be used by advanced users.
4283: It should be called after the matrix is assembled.
4284: The contents of the sizes[] array should not be changed.
4285: PETSC_NULL may be passed for information not requested.
4287: .keywords: matrix, seqaij, get, inode
4289: .seealso: MatGetInfo()
4290: @*/
4291: PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4292: {
4293: PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4296: if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4297: PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);
4298: if (f) {
4299: (*f)(A,node_count,sizes,limit);
4300: }
4301: return(0);
4302: }
4304: EXTERN_C_BEGIN
4307: PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4308: {
4309: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4312: if (node_count) *node_count = a->inode.node_count;
4313: if (sizes) *sizes = a->inode.size;
4314: if (limit) *limit = a->inode.limit;
4315: return(0);
4316: }
4317: EXTERN_C_END