Actual source code: inode.c
petsc-3.4.5 2014-06-29
2: /*
3: This file provides high performance routines for the Inode format (compressed sparse row)
4: by taking advantage of rows with identical nonzero structure (I-nodes).
5: */
6: #include <../src/mat/impls/aij/seq/aij.h>
10: static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns)
11: {
12: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
14: PetscInt i,count,m,n,min_mn,*ns_row,*ns_col;
17: n = A->cmap->n;
18: m = A->rmap->n;
19: ns_row = a->inode.size;
21: min_mn = (m < n) ? m : n;
22: if (!ns) {
23: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
24: for (; count+1 < n; count++,i++) ;
25: if (count < n) {
26: i++;
27: }
28: *size = i;
29: return(0);
30: }
31: PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);
33: /* Use the same row structure wherever feasible. */
34: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
35: ns_col[i] = ns_row[i];
36: }
38: /* if m < n; pad up the remainder with inode_limit */
39: for (; count+1 < n; count++,i++) {
40: ns_col[i] = 1;
41: }
42: /* The last node is the odd ball. padd it up with the remaining rows; */
43: if (count < n) {
44: ns_col[i] = n - count;
45: i++;
46: } else if (count > n) {
47: /* Adjust for the over estimation */
48: ns_col[i-1] += n - count;
49: }
50: *size = i;
51: *ns = ns_col;
52: return(0);
53: }
56: /*
57: This builds symmetric version of nonzero structure,
58: */
61: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
62: {
63: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
65: PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
66: PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
69: nslim_row = a->inode.node_count;
70: m = A->rmap->n;
71: n = A->cmap->n;
72: if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
74: /* Use the row_inode as column_inode */
75: nslim_col = nslim_row;
76: ns_col = ns_row;
78: /* allocate space for reformated inode structure */
79: PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);
80: PetscMalloc((n+1)*sizeof(PetscInt),&tvc);
81: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83: for (i1=0,col=0; i1<nslim_col; ++i1) {
84: nsz = ns_col[i1];
85: for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
86: }
87: /* allocate space for row pointers */
88: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
89: *iia = ia;
90: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
91: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);
93: /* determine the number of columns in each row */
94: ia[0] = oshift;
95: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
97: j = aj + ai[row] + ishift;
98: jmax = aj + ai[row+1] + ishift;
99: col = *j++ + ishift;
100: i2 = tvc[col];
101: while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
102: ia[i1+1]++;
103: ia[i2+1]++;
104: i2++; /* Start col of next node */
105: while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
106: i2 = tvc[col];
107: }
108: if (i2 == i1) ia[i2+1]++; /* now the diagonal element */
109: }
111: /* shift ia[i] to point to next row */
112: for (i1=1; i1<nslim_row+1; i1++) {
113: row = ia[i1-1];
114: ia[i1] += row;
115: work[i1-1] = row - oshift;
116: }
118: /* allocate space for column pointers */
119: nz = ia[nslim_row] + (!ishift);
120: PetscMalloc(nz*sizeof(PetscInt),&ja);
121: *jja = ja;
123: /* loop over lower triangular part putting into ja */
124: for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
125: j = aj + ai[row] + ishift;
126: jmax = aj + ai[row+1] + ishift;
127: col = *j++ + ishift;
128: i2 = tvc[col];
129: while (i2<i1 && j<jmax) {
130: ja[work[i2]++] = i1 + oshift;
131: ja[work[i1]++] = i2 + oshift;
132: ++i2;
133: while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
134: i2 = tvc[col];
135: }
136: if (i2 == i1) ja[work[i1]++] = i2 + oshift;
138: }
139: PetscFree(work);
140: PetscFree(tns);
141: PetscFree(tvc);
142: return(0);
143: }
145: /*
146: This builds nonsymmetric version of nonzero structure,
147: */
150: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
151: {
152: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
154: PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
155: PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
158: nslim_row = a->inode.node_count;
159: n = A->cmap->n;
161: /* Create The column_inode for this matrix */
162: Mat_CreateColInode(A,&nslim_col,&ns_col);
164: /* allocate space for reformated column_inode structure */
165: PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);
166: PetscMalloc((n +1)*sizeof(PetscInt),&tvc);
167: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
169: for (i1=0,col=0; i1<nslim_col; ++i1) {
170: nsz = ns_col[i1];
171: for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
172: }
173: /* allocate space for row pointers */
174: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
175: *iia = ia;
176: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
177: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);
179: /* determine the number of columns in each row */
180: ia[0] = oshift;
181: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
182: j = aj + ai[row] + ishift;
183: col = *j++ + ishift;
184: i2 = tvc[col];
185: nz = ai[row+1] - ai[row];
186: while (nz-- > 0) { /* off-diagonal elemets */
187: ia[i1+1]++;
188: i2++; /* Start col of next node */
189: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
190: if (nz > 0) i2 = tvc[col];
191: }
192: }
194: /* shift ia[i] to point to next row */
195: for (i1=1; i1<nslim_row+1; i1++) {
196: row = ia[i1-1];
197: ia[i1] += row;
198: work[i1-1] = row - oshift;
199: }
201: /* allocate space for column pointers */
202: nz = ia[nslim_row] + (!ishift);
203: PetscMalloc(nz*sizeof(PetscInt),&ja);
204: *jja = ja;
206: /* loop over matrix putting into ja */
207: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
208: j = aj + ai[row] + ishift;
209: col = *j++ + ishift;
210: i2 = tvc[col];
211: nz = ai[row+1] - ai[row];
212: while (nz-- > 0) {
213: ja[work[i1]++] = i2 + oshift;
214: ++i2;
215: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
216: if (nz > 0) i2 = tvc[col];
217: }
218: }
219: PetscFree(ns_col);
220: PetscFree(work);
221: PetscFree(tns);
222: PetscFree(tvc);
223: return(0);
224: }
228: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
229: {
230: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
234: *n = a->inode.node_count;
235: if (!ia) return(0);
236: if (!blockcompressed) {
237: MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
238: } else if (symmetric) {
239: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
240: } else {
241: MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
242: }
243: return(0);
244: }
248: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
249: {
253: if (!ia) return(0);
255: if (!blockcompressed) {
256: MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
257: } else {
258: PetscFree(*ia);
259: PetscFree(*ja);
260: }
261: return(0);
262: }
264: /* ----------------------------------------------------------- */
268: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
269: {
270: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
272: PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
273: PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
276: nslim_row = a->inode.node_count;
277: n = A->cmap->n;
279: /* Create The column_inode for this matrix */
280: Mat_CreateColInode(A,&nslim_col,&ns_col);
282: /* allocate space for reformated column_inode structure */
283: PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);
284: PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);
285: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
287: for (i1=0,col=0; i1<nslim_col; ++i1) {
288: nsz = ns_col[i1];
289: for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
290: }
291: /* allocate space for column pointers */
292: PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);
293: *iia = ia;
294: PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));
295: PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);
297: /* determine the number of columns in each row */
298: ia[0] = oshift;
299: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
300: j = aj + ai[row] + ishift;
301: col = *j++ + ishift;
302: i2 = tvc[col];
303: nz = ai[row+1] - ai[row];
304: while (nz-- > 0) { /* off-diagonal elemets */
305: /* ia[i1+1]++; */
306: ia[i2+1]++;
307: i2++;
308: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
309: if (nz > 0) i2 = tvc[col];
310: }
311: }
313: /* shift ia[i] to point to next col */
314: for (i1=1; i1<nslim_col+1; i1++) {
315: col = ia[i1-1];
316: ia[i1] += col;
317: work[i1-1] = col - oshift;
318: }
320: /* allocate space for column pointers */
321: nz = ia[nslim_col] + (!ishift);
322: PetscMalloc(nz*sizeof(PetscInt),&ja);
323: *jja = ja;
325: /* loop over matrix putting into ja */
326: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
327: j = aj + ai[row] + ishift;
328: col = *j++ + ishift;
329: i2 = tvc[col];
330: nz = ai[row+1] - ai[row];
331: while (nz-- > 0) {
332: /* ja[work[i1]++] = i2 + oshift; */
333: ja[work[i2]++] = i1 + oshift;
334: i2++;
335: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
336: if (nz > 0) i2 = tvc[col];
337: }
338: }
339: PetscFree(ns_col);
340: PetscFree(work);
341: PetscFree(tns);
342: PetscFree(tvc);
343: return(0);
344: }
348: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
349: {
353: Mat_CreateColInode(A,n,NULL);
354: if (!ia) return(0);
356: if (!blockcompressed) {
357: MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
358: } else if (symmetric) {
359: /* Since the indices are symmetric it does'nt matter */
360: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
361: } else {
362: MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
363: }
364: return(0);
365: }
369: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
370: {
374: if (!ia) return(0);
375: if (!blockcompressed) {
376: MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
377: } else {
378: PetscFree(*ia);
379: PetscFree(*ja);
380: }
381: return(0);
382: }
384: /* ----------------------------------------------------------- */
388: static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
389: {
390: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
391: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
392: PetscScalar *y;
393: const PetscScalar *x;
394: const MatScalar *v1,*v2,*v3,*v4,*v5;
395: PetscErrorCode ierr;
396: PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
398: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
399: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
400: #endif
403: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
404: node_max = a->inode.node_count;
405: ns = a->inode.size; /* Node Size array */
406: VecGetArrayRead(xx,&x);
407: VecGetArray(yy,&y);
408: idx = a->j;
409: v1 = a->a;
410: ii = a->i;
412: for (i = 0,row = 0; i< node_max; ++i) {
413: nsz = ns[i];
414: n = ii[1] - ii[0];
415: nonzerorow += (n>0)*nsz;
416: ii += nsz;
417: PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */
418: PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */
419: sz = n; /* No of non zeros in this row */
420: /* Switch on the size of Node */
421: switch (nsz) { /* Each loop in 'case' is unrolled */
422: case 1:
423: sum1 = 0.;
425: for (n = 0; n< sz-1; n+=2) {
426: i1 = idx[0]; /* The instructions are ordered to */
427: i2 = idx[1]; /* make the compiler's job easy */
428: idx += 2;
429: tmp0 = x[i1];
430: tmp1 = x[i2];
431: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
432: }
434: if (n == sz-1) { /* Take care of the last nonzero */
435: tmp0 = x[*idx++];
436: sum1 += *v1++ *tmp0;
437: }
438: y[row++]=sum1;
439: break;
440: case 2:
441: sum1 = 0.;
442: sum2 = 0.;
443: v2 = v1 + n;
445: for (n = 0; n< sz-1; n+=2) {
446: i1 = idx[0];
447: i2 = idx[1];
448: idx += 2;
449: tmp0 = x[i1];
450: tmp1 = x[i2];
451: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
452: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
453: }
454: if (n == sz-1) {
455: tmp0 = x[*idx++];
456: sum1 += *v1++ * tmp0;
457: sum2 += *v2++ * tmp0;
458: }
459: y[row++]=sum1;
460: y[row++]=sum2;
461: v1 =v2; /* Since the next block to be processed starts there*/
462: idx +=sz;
463: break;
464: case 3:
465: sum1 = 0.;
466: sum2 = 0.;
467: sum3 = 0.;
468: v2 = v1 + n;
469: v3 = v2 + n;
471: for (n = 0; n< sz-1; n+=2) {
472: i1 = idx[0];
473: i2 = idx[1];
474: idx += 2;
475: tmp0 = x[i1];
476: tmp1 = x[i2];
477: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
478: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
479: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
480: }
481: if (n == sz-1) {
482: tmp0 = x[*idx++];
483: sum1 += *v1++ * tmp0;
484: sum2 += *v2++ * tmp0;
485: sum3 += *v3++ * tmp0;
486: }
487: y[row++]=sum1;
488: y[row++]=sum2;
489: y[row++]=sum3;
490: v1 =v3; /* Since the next block to be processed starts there*/
491: idx +=2*sz;
492: break;
493: case 4:
494: sum1 = 0.;
495: sum2 = 0.;
496: sum3 = 0.;
497: sum4 = 0.;
498: v2 = v1 + n;
499: v3 = v2 + n;
500: v4 = v3 + n;
502: for (n = 0; n< sz-1; n+=2) {
503: i1 = idx[0];
504: i2 = idx[1];
505: idx += 2;
506: tmp0 = x[i1];
507: tmp1 = x[i2];
508: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
509: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
510: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
511: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
512: }
513: if (n == sz-1) {
514: tmp0 = x[*idx++];
515: sum1 += *v1++ * tmp0;
516: sum2 += *v2++ * tmp0;
517: sum3 += *v3++ * tmp0;
518: sum4 += *v4++ * tmp0;
519: }
520: y[row++]=sum1;
521: y[row++]=sum2;
522: y[row++]=sum3;
523: y[row++]=sum4;
524: v1 =v4; /* Since the next block to be processed starts there*/
525: idx +=3*sz;
526: break;
527: case 5:
528: sum1 = 0.;
529: sum2 = 0.;
530: sum3 = 0.;
531: sum4 = 0.;
532: sum5 = 0.;
533: v2 = v1 + n;
534: v3 = v2 + n;
535: v4 = v3 + n;
536: v5 = v4 + n;
538: for (n = 0; n<sz-1; n+=2) {
539: i1 = idx[0];
540: i2 = idx[1];
541: idx += 2;
542: tmp0 = x[i1];
543: tmp1 = x[i2];
544: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
545: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
546: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
547: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
548: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
549: }
550: if (n == sz-1) {
551: tmp0 = x[*idx++];
552: sum1 += *v1++ * tmp0;
553: sum2 += *v2++ * tmp0;
554: sum3 += *v3++ * tmp0;
555: sum4 += *v4++ * tmp0;
556: sum5 += *v5++ * tmp0;
557: }
558: y[row++]=sum1;
559: y[row++]=sum2;
560: y[row++]=sum3;
561: y[row++]=sum4;
562: y[row++]=sum5;
563: v1 =v5; /* Since the next block to be processed starts there */
564: idx +=4*sz;
565: break;
566: default:
567: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
568: }
569: }
570: VecRestoreArrayRead(xx,&x);
571: VecRestoreArray(yy,&y);
572: PetscLogFlops(2.0*a->nz - nonzerorow);
573: return(0);
574: }
575: /* ----------------------------------------------------------- */
576: /* Almost same code as the MatMult_SeqAIJ_Inode() */
579: static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
580: {
581: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
582: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
583: MatScalar *v1,*v2,*v3,*v4,*v5;
584: PetscScalar *x,*y,*z,*zt;
586: PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
589: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
590: node_max = a->inode.node_count;
591: ns = a->inode.size; /* Node Size array */
593: VecGetArray(xx,&x);
594: VecGetArray(yy,&y);
595: if (zz != yy) {
596: VecGetArray(zz,&z);
597: } else {
598: z = y;
599: }
600: zt = z;
602: idx = a->j;
603: v1 = a->a;
604: ii = a->i;
606: for (i = 0,row = 0; i< node_max; ++i) {
607: nsz = ns[i];
608: n = ii[1] - ii[0];
609: ii += nsz;
610: sz = n; /* No of non zeros in this row */
611: /* Switch on the size of Node */
612: switch (nsz) { /* Each loop in 'case' is unrolled */
613: case 1:
614: sum1 = *zt++;
616: for (n = 0; n< sz-1; n+=2) {
617: i1 = idx[0]; /* The instructions are ordered to */
618: i2 = idx[1]; /* make the compiler's job easy */
619: idx += 2;
620: tmp0 = x[i1];
621: tmp1 = x[i2];
622: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
623: }
625: if (n == sz-1) { /* Take care of the last nonzero */
626: tmp0 = x[*idx++];
627: sum1 += *v1++ * tmp0;
628: }
629: y[row++]=sum1;
630: break;
631: case 2:
632: sum1 = *zt++;
633: sum2 = *zt++;
634: v2 = v1 + n;
636: for (n = 0; n< sz-1; n+=2) {
637: i1 = idx[0];
638: i2 = idx[1];
639: idx += 2;
640: tmp0 = x[i1];
641: tmp1 = x[i2];
642: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
643: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
644: }
645: if (n == sz-1) {
646: tmp0 = x[*idx++];
647: sum1 += *v1++ * tmp0;
648: sum2 += *v2++ * tmp0;
649: }
650: y[row++]=sum1;
651: y[row++]=sum2;
652: v1 =v2; /* Since the next block to be processed starts there*/
653: idx +=sz;
654: break;
655: case 3:
656: sum1 = *zt++;
657: sum2 = *zt++;
658: sum3 = *zt++;
659: v2 = v1 + n;
660: v3 = v2 + n;
662: for (n = 0; n< sz-1; n+=2) {
663: i1 = idx[0];
664: i2 = idx[1];
665: idx += 2;
666: tmp0 = x[i1];
667: tmp1 = x[i2];
668: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
669: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
670: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
671: }
672: if (n == sz-1) {
673: tmp0 = x[*idx++];
674: sum1 += *v1++ * tmp0;
675: sum2 += *v2++ * tmp0;
676: sum3 += *v3++ * tmp0;
677: }
678: y[row++]=sum1;
679: y[row++]=sum2;
680: y[row++]=sum3;
681: v1 =v3; /* Since the next block to be processed starts there*/
682: idx +=2*sz;
683: break;
684: case 4:
685: sum1 = *zt++;
686: sum2 = *zt++;
687: sum3 = *zt++;
688: sum4 = *zt++;
689: v2 = v1 + n;
690: v3 = v2 + n;
691: v4 = v3 + n;
693: for (n = 0; n< sz-1; n+=2) {
694: i1 = idx[0];
695: i2 = idx[1];
696: idx += 2;
697: tmp0 = x[i1];
698: tmp1 = x[i2];
699: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
700: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
701: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
702: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
703: }
704: if (n == sz-1) {
705: tmp0 = x[*idx++];
706: sum1 += *v1++ * tmp0;
707: sum2 += *v2++ * tmp0;
708: sum3 += *v3++ * tmp0;
709: sum4 += *v4++ * tmp0;
710: }
711: y[row++]=sum1;
712: y[row++]=sum2;
713: y[row++]=sum3;
714: y[row++]=sum4;
715: v1 =v4; /* Since the next block to be processed starts there*/
716: idx +=3*sz;
717: break;
718: case 5:
719: sum1 = *zt++;
720: sum2 = *zt++;
721: sum3 = *zt++;
722: sum4 = *zt++;
723: sum5 = *zt++;
724: v2 = v1 + n;
725: v3 = v2 + n;
726: v4 = v3 + n;
727: v5 = v4 + n;
729: for (n = 0; n<sz-1; n+=2) {
730: i1 = idx[0];
731: i2 = idx[1];
732: idx += 2;
733: tmp0 = x[i1];
734: tmp1 = x[i2];
735: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
736: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
737: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
738: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
739: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
740: }
741: if (n == sz-1) {
742: tmp0 = x[*idx++];
743: sum1 += *v1++ * tmp0;
744: sum2 += *v2++ * tmp0;
745: sum3 += *v3++ * tmp0;
746: sum4 += *v4++ * tmp0;
747: sum5 += *v5++ * tmp0;
748: }
749: y[row++]=sum1;
750: y[row++]=sum2;
751: y[row++]=sum3;
752: y[row++]=sum4;
753: y[row++]=sum5;
754: v1 =v5; /* Since the next block to be processed starts there */
755: idx +=4*sz;
756: break;
757: default:
758: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
759: }
760: }
761: VecRestoreArray(xx,&x);
762: VecRestoreArray(yy,&y);
763: if (zz != yy) {
764: VecRestoreArray(zz,&z);
765: }
766: PetscLogFlops(2.0*a->nz);
767: return(0);
768: }
770: /* ----------------------------------------------------------- */
773: PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
774: {
775: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
776: IS iscol = a->col,isrow = a->row;
777: PetscErrorCode ierr;
778: const PetscInt *r,*c,*rout,*cout;
779: PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
780: PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
781: PetscScalar *x,*tmp,*tmps,tmp0,tmp1;
782: PetscScalar sum1,sum2,sum3,sum4,sum5;
783: const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
784: const PetscScalar *b;
787: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
788: node_max = a->inode.node_count;
789: ns = a->inode.size; /* Node Size array */
791: VecGetArrayRead(bb,&b);
792: VecGetArray(xx,&x);
793: tmp = a->solve_work;
795: ISGetIndices(isrow,&rout); r = rout;
796: ISGetIndices(iscol,&cout); c = cout + (n-1);
798: /* forward solve the lower triangular */
799: tmps = tmp;
800: aa = a_a;
801: aj = a_j;
802: ad = a->diag;
804: for (i = 0,row = 0; i< node_max; ++i) {
805: nsz = ns[i];
806: aii = ai[row];
807: v1 = aa + aii;
808: vi = aj + aii;
809: nz = ad[row]- aii;
810: if (i < node_max-1) {
811: /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
812: * but our indexing to determine it's size could. */
813: PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
814: /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
815: PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
816: /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
817: }
819: switch (nsz) { /* Each loop in 'case' is unrolled */
820: case 1:
821: sum1 = b[*r++];
822: for (j=0; j<nz-1; j+=2) {
823: i0 = vi[0];
824: i1 = vi[1];
825: vi +=2;
826: tmp0 = tmps[i0];
827: tmp1 = tmps[i1];
828: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
829: }
830: if (j == nz-1) {
831: tmp0 = tmps[*vi++];
832: sum1 -= *v1++ *tmp0;
833: }
834: tmp[row++]=sum1;
835: break;
836: case 2:
837: sum1 = b[*r++];
838: sum2 = b[*r++];
839: v2 = aa + ai[row+1];
841: for (j=0; j<nz-1; j+=2) {
842: i0 = vi[0];
843: i1 = vi[1];
844: vi +=2;
845: tmp0 = tmps[i0];
846: tmp1 = tmps[i1];
847: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
848: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
849: }
850: if (j == nz-1) {
851: tmp0 = tmps[*vi++];
852: sum1 -= *v1++ *tmp0;
853: sum2 -= *v2++ *tmp0;
854: }
855: sum2 -= *v2++ *sum1;
856: tmp[row++]=sum1;
857: tmp[row++]=sum2;
858: break;
859: case 3:
860: sum1 = b[*r++];
861: sum2 = b[*r++];
862: sum3 = b[*r++];
863: v2 = aa + ai[row+1];
864: v3 = aa + ai[row+2];
866: for (j=0; j<nz-1; j+=2) {
867: i0 = vi[0];
868: i1 = vi[1];
869: vi +=2;
870: tmp0 = tmps[i0];
871: tmp1 = tmps[i1];
872: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
873: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
874: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
875: }
876: if (j == nz-1) {
877: tmp0 = tmps[*vi++];
878: sum1 -= *v1++ *tmp0;
879: sum2 -= *v2++ *tmp0;
880: sum3 -= *v3++ *tmp0;
881: }
882: sum2 -= *v2++ * sum1;
883: sum3 -= *v3++ * sum1;
884: sum3 -= *v3++ * sum2;
886: tmp[row++]=sum1;
887: tmp[row++]=sum2;
888: tmp[row++]=sum3;
889: break;
891: case 4:
892: sum1 = b[*r++];
893: sum2 = b[*r++];
894: sum3 = b[*r++];
895: sum4 = b[*r++];
896: v2 = aa + ai[row+1];
897: v3 = aa + ai[row+2];
898: v4 = aa + ai[row+3];
900: for (j=0; j<nz-1; j+=2) {
901: i0 = vi[0];
902: i1 = vi[1];
903: vi +=2;
904: tmp0 = tmps[i0];
905: tmp1 = tmps[i1];
906: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
907: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
908: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
909: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
910: }
911: if (j == nz-1) {
912: tmp0 = tmps[*vi++];
913: sum1 -= *v1++ *tmp0;
914: sum2 -= *v2++ *tmp0;
915: sum3 -= *v3++ *tmp0;
916: sum4 -= *v4++ *tmp0;
917: }
918: sum2 -= *v2++ * sum1;
919: sum3 -= *v3++ * sum1;
920: sum4 -= *v4++ * sum1;
921: sum3 -= *v3++ * sum2;
922: sum4 -= *v4++ * sum2;
923: sum4 -= *v4++ * sum3;
925: tmp[row++]=sum1;
926: tmp[row++]=sum2;
927: tmp[row++]=sum3;
928: tmp[row++]=sum4;
929: break;
930: case 5:
931: sum1 = b[*r++];
932: sum2 = b[*r++];
933: sum3 = b[*r++];
934: sum4 = b[*r++];
935: sum5 = b[*r++];
936: v2 = aa + ai[row+1];
937: v3 = aa + ai[row+2];
938: v4 = aa + ai[row+3];
939: v5 = aa + ai[row+4];
941: for (j=0; j<nz-1; j+=2) {
942: i0 = vi[0];
943: i1 = vi[1];
944: vi +=2;
945: tmp0 = tmps[i0];
946: tmp1 = tmps[i1];
947: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
948: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
949: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
950: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
951: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
952: }
953: if (j == nz-1) {
954: tmp0 = tmps[*vi++];
955: sum1 -= *v1++ *tmp0;
956: sum2 -= *v2++ *tmp0;
957: sum3 -= *v3++ *tmp0;
958: sum4 -= *v4++ *tmp0;
959: sum5 -= *v5++ *tmp0;
960: }
962: sum2 -= *v2++ * sum1;
963: sum3 -= *v3++ * sum1;
964: sum4 -= *v4++ * sum1;
965: sum5 -= *v5++ * sum1;
966: sum3 -= *v3++ * sum2;
967: sum4 -= *v4++ * sum2;
968: sum5 -= *v5++ * sum2;
969: sum4 -= *v4++ * sum3;
970: sum5 -= *v5++ * sum3;
971: sum5 -= *v5++ * sum4;
973: tmp[row++]=sum1;
974: tmp[row++]=sum2;
975: tmp[row++]=sum3;
976: tmp[row++]=sum4;
977: tmp[row++]=sum5;
978: break;
979: default:
980: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
981: }
982: }
983: /* backward solve the upper triangular */
984: for (i=node_max -1,row = n-1; i>=0; i--) {
985: nsz = ns[i];
986: aii = ai[row+1] -1;
987: v1 = aa + aii;
988: vi = aj + aii;
989: nz = aii- ad[row];
990: switch (nsz) { /* Each loop in 'case' is unrolled */
991: case 1:
992: sum1 = tmp[row];
994: for (j=nz; j>1; j-=2) {
995: vi -=2;
996: i0 = vi[2];
997: i1 = vi[1];
998: tmp0 = tmps[i0];
999: tmp1 = tmps[i1];
1000: v1 -= 2;
1001: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1002: }
1003: if (j==1) {
1004: tmp0 = tmps[*vi--];
1005: sum1 -= *v1-- * tmp0;
1006: }
1007: x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1008: break;
1009: case 2:
1010: sum1 = tmp[row];
1011: sum2 = tmp[row -1];
1012: v2 = aa + ai[row]-1;
1013: for (j=nz; j>1; j-=2) {
1014: vi -=2;
1015: i0 = vi[2];
1016: i1 = vi[1];
1017: tmp0 = tmps[i0];
1018: tmp1 = tmps[i1];
1019: v1 -= 2;
1020: v2 -= 2;
1021: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1022: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1023: }
1024: if (j==1) {
1025: tmp0 = tmps[*vi--];
1026: sum1 -= *v1-- * tmp0;
1027: sum2 -= *v2-- * tmp0;
1028: }
1030: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1031: sum2 -= *v2-- * tmp0;
1032: x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1033: break;
1034: case 3:
1035: sum1 = tmp[row];
1036: sum2 = tmp[row -1];
1037: sum3 = tmp[row -2];
1038: v2 = aa + ai[row]-1;
1039: v3 = aa + ai[row -1]-1;
1040: for (j=nz; j>1; j-=2) {
1041: vi -=2;
1042: i0 = vi[2];
1043: i1 = vi[1];
1044: tmp0 = tmps[i0];
1045: tmp1 = tmps[i1];
1046: v1 -= 2;
1047: v2 -= 2;
1048: v3 -= 2;
1049: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1050: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1051: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1052: }
1053: if (j==1) {
1054: tmp0 = tmps[*vi--];
1055: sum1 -= *v1-- * tmp0;
1056: sum2 -= *v2-- * tmp0;
1057: sum3 -= *v3-- * tmp0;
1058: }
1059: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1060: sum2 -= *v2-- * tmp0;
1061: sum3 -= *v3-- * tmp0;
1062: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1063: sum3 -= *v3-- * tmp0;
1064: x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1066: break;
1067: case 4:
1068: sum1 = tmp[row];
1069: sum2 = tmp[row -1];
1070: sum3 = tmp[row -2];
1071: sum4 = tmp[row -3];
1072: v2 = aa + ai[row]-1;
1073: v3 = aa + ai[row -1]-1;
1074: v4 = aa + ai[row -2]-1;
1076: for (j=nz; j>1; j-=2) {
1077: vi -=2;
1078: i0 = vi[2];
1079: i1 = vi[1];
1080: tmp0 = tmps[i0];
1081: tmp1 = tmps[i1];
1082: v1 -= 2;
1083: v2 -= 2;
1084: v3 -= 2;
1085: v4 -= 2;
1086: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1087: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1088: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1089: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1090: }
1091: if (j==1) {
1092: tmp0 = tmps[*vi--];
1093: sum1 -= *v1-- * tmp0;
1094: sum2 -= *v2-- * tmp0;
1095: sum3 -= *v3-- * tmp0;
1096: sum4 -= *v4-- * tmp0;
1097: }
1099: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1100: sum2 -= *v2-- * tmp0;
1101: sum3 -= *v3-- * tmp0;
1102: sum4 -= *v4-- * tmp0;
1103: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1104: sum3 -= *v3-- * tmp0;
1105: sum4 -= *v4-- * tmp0;
1106: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1107: sum4 -= *v4-- * tmp0;
1108: x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1109: break;
1110: case 5:
1111: sum1 = tmp[row];
1112: sum2 = tmp[row -1];
1113: sum3 = tmp[row -2];
1114: sum4 = tmp[row -3];
1115: sum5 = tmp[row -4];
1116: v2 = aa + ai[row]-1;
1117: v3 = aa + ai[row -1]-1;
1118: v4 = aa + ai[row -2]-1;
1119: v5 = aa + ai[row -3]-1;
1120: for (j=nz; j>1; j-=2) {
1121: vi -= 2;
1122: i0 = vi[2];
1123: i1 = vi[1];
1124: tmp0 = tmps[i0];
1125: tmp1 = tmps[i1];
1126: v1 -= 2;
1127: v2 -= 2;
1128: v3 -= 2;
1129: v4 -= 2;
1130: v5 -= 2;
1131: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1132: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1133: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1134: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1135: sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1136: }
1137: if (j==1) {
1138: tmp0 = tmps[*vi--];
1139: sum1 -= *v1-- * tmp0;
1140: sum2 -= *v2-- * tmp0;
1141: sum3 -= *v3-- * tmp0;
1142: sum4 -= *v4-- * tmp0;
1143: sum5 -= *v5-- * tmp0;
1144: }
1146: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1147: sum2 -= *v2-- * tmp0;
1148: sum3 -= *v3-- * tmp0;
1149: sum4 -= *v4-- * tmp0;
1150: sum5 -= *v5-- * tmp0;
1151: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1152: sum3 -= *v3-- * tmp0;
1153: sum4 -= *v4-- * tmp0;
1154: sum5 -= *v5-- * tmp0;
1155: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1156: sum4 -= *v4-- * tmp0;
1157: sum5 -= *v5-- * tmp0;
1158: tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1159: sum5 -= *v5-- * tmp0;
1160: x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1161: break;
1162: default:
1163: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1164: }
1165: }
1166: ISRestoreIndices(isrow,&rout);
1167: ISRestoreIndices(iscol,&cout);
1168: VecRestoreArrayRead(bb,&b);
1169: VecRestoreArray(xx,&x);
1170: PetscLogFlops(2.0*a->nz - A->cmap->n);
1171: return(0);
1172: }
1176: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1177: {
1178: Mat C =B;
1179: Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1180: IS isrow = b->row,isicol = b->icol;
1181: PetscErrorCode ierr;
1182: const PetscInt *r,*ic,*ics;
1183: const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1184: PetscInt i,j,k,nz,nzL,row,*pj;
1185: const PetscInt *ajtmp,*bjtmp;
1186: MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1187: const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1188: FactorShiftCtx sctx;
1189: const PetscInt *ddiag;
1190: PetscReal rs;
1191: MatScalar d;
1192: PetscInt inod,nodesz,node_max,col;
1193: const PetscInt *ns;
1194: PetscInt *tmp_vec1,*tmp_vec2,*nsmap;
1197: /* MatPivotSetUp(): initialize shift context sctx */
1198: PetscMemzero(&sctx,sizeof(FactorShiftCtx));
1200: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1201: ddiag = a->diag;
1202: sctx.shift_top = info->zeropivot;
1203: for (i=0; i<n; i++) {
1204: /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1205: d = (aa)[ddiag[i]];
1206: rs = -PetscAbsScalar(d) - PetscRealPart(d);
1207: v = aa+ai[i];
1208: nz = ai[i+1] - ai[i];
1209: for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1210: if (rs>sctx.shift_top) sctx.shift_top = rs;
1211: }
1212: sctx.shift_top *= 1.1;
1213: sctx.nshift_max = 5;
1214: sctx.shift_lo = 0.;
1215: sctx.shift_hi = 1.;
1216: }
1218: ISGetIndices(isrow,&r);
1219: ISGetIndices(isicol,&ic);
1221: PetscMalloc((4*n+1)*sizeof(PetscScalar),&rtmp1);
1222: PetscMemzero(rtmp1,(4*n+1)*sizeof(PetscScalar));
1223: rtmp2 = rtmp1 + n;
1224: rtmp3 = rtmp2 + n;
1225: rtmp4 = rtmp3 + n;
1226: ics = ic;
1228: node_max = a->inode.node_count;
1229: ns = a->inode.size;
1230: if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1232: /* If max inode size > 4, split it into two inodes.*/
1233: /* also map the inode sizes according to the ordering */
1234: PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);
1235: for (i=0,j=0; i<node_max; ++i,++j) {
1236: if (ns[i] > 4) {
1237: tmp_vec1[j] = 4;
1238: ++j;
1239: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1240: } else {
1241: tmp_vec1[j] = ns[i];
1242: }
1243: }
1244: /* Use the correct node_max */
1245: node_max = j;
1247: /* Now reorder the inode info based on mat re-ordering info */
1248: /* First create a row -> inode_size_array_index map */
1249: PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);
1250: PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);
1251: for (i=0,row=0; i<node_max; i++) {
1252: nodesz = tmp_vec1[i];
1253: for (j=0; j<nodesz; j++,row++) {
1254: nsmap[row] = i;
1255: }
1256: }
1257: /* Using nsmap, create a reordered ns structure */
1258: for (i=0,j=0; i< node_max; i++) {
1259: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1260: tmp_vec2[i] = nodesz;
1261: j += nodesz;
1262: }
1263: PetscFree(nsmap);
1264: PetscFree(tmp_vec1);
1266: /* Now use the correct ns */
1267: ns = tmp_vec2;
1269: do {
1270: sctx.newshift = PETSC_FALSE;
1271: /* Now loop over each block-row, and do the factorization */
1272: for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1273: nodesz = ns[inod];
1275: switch (nodesz) {
1276: case 1:
1277: /*----------*/
1278: /* zero rtmp1 */
1279: /* L part */
1280: nz = bi[i+1] - bi[i];
1281: bjtmp = bj + bi[i];
1282: for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1284: /* U part */
1285: nz = bdiag[i]-bdiag[i+1];
1286: bjtmp = bj + bdiag[i+1]+1;
1287: for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1289: /* load in initial (unfactored row) */
1290: nz = ai[r[i]+1] - ai[r[i]];
1291: ajtmp = aj + ai[r[i]];
1292: v = aa + ai[r[i]];
1293: for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1295: /* ZeropivotApply() */
1296: rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1298: /* elimination */
1299: bjtmp = bj + bi[i];
1300: row = *bjtmp++;
1301: nzL = bi[i+1] - bi[i];
1302: for (k=0; k < nzL; k++) {
1303: pc = rtmp1 + row;
1304: if (*pc != 0.0) {
1305: pv = b->a + bdiag[row];
1306: mul1 = *pc * (*pv);
1307: *pc = mul1;
1308: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1309: pv = b->a + bdiag[row+1]+1;
1310: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1311: for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1312: PetscLogFlops(1+2*nz);
1313: }
1314: row = *bjtmp++;
1315: }
1317: /* finished row so stick it into b->a */
1318: rs = 0.0;
1319: /* L part */
1320: pv = b->a + bi[i];
1321: pj = b->j + bi[i];
1322: nz = bi[i+1] - bi[i];
1323: for (j=0; j<nz; j++) {
1324: pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1325: }
1327: /* U part */
1328: pv = b->a + bdiag[i+1]+1;
1329: pj = b->j + bdiag[i+1]+1;
1330: nz = bdiag[i] - bdiag[i+1]-1;
1331: for (j=0; j<nz; j++) {
1332: pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1333: }
1335: /* Check zero pivot */
1336: sctx.rs = rs;
1337: sctx.pv = rtmp1[i];
1338: MatPivotCheck(A,info,&sctx,i);
1339: if (sctx.newshift) break;
1341: /* Mark diagonal and invert diagonal for simplier triangular solves */
1342: pv = b->a + bdiag[i];
1343: *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1344: break;
1346: case 2:
1347: /*----------*/
1348: /* zero rtmp1 and rtmp2 */
1349: /* L part */
1350: nz = bi[i+1] - bi[i];
1351: bjtmp = bj + bi[i];
1352: for (j=0; j<nz; j++) {
1353: col = bjtmp[j];
1354: rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1355: }
1357: /* U part */
1358: nz = bdiag[i]-bdiag[i+1];
1359: bjtmp = bj + bdiag[i+1]+1;
1360: for (j=0; j<nz; j++) {
1361: col = bjtmp[j];
1362: rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1363: }
1365: /* load in initial (unfactored row) */
1366: nz = ai[r[i]+1] - ai[r[i]];
1367: ajtmp = aj + ai[r[i]];
1368: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1369: for (j=0; j<nz; j++) {
1370: col = ics[ajtmp[j]];
1371: rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1372: }
1373: /* ZeropivotApply(): shift the diagonal of the matrix */
1374: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1376: /* elimination */
1377: bjtmp = bj + bi[i];
1378: row = *bjtmp++; /* pivot row */
1379: nzL = bi[i+1] - bi[i];
1380: for (k=0; k < nzL; k++) {
1381: pc1 = rtmp1 + row;
1382: pc2 = rtmp2 + row;
1383: if (*pc1 != 0.0 || *pc2 != 0.0) {
1384: pv = b->a + bdiag[row];
1385: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1386: *pc1 = mul1; *pc2 = mul2;
1388: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1389: pv = b->a + bdiag[row+1]+1;
1390: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1391: for (j=0; j<nz; j++) {
1392: col = pj[j];
1393: rtmp1[col] -= mul1 * pv[j];
1394: rtmp2[col] -= mul2 * pv[j];
1395: }
1396: PetscLogFlops(2+4*nz);
1397: }
1398: row = *bjtmp++;
1399: }
1401: /* finished row i; check zero pivot, then stick row i into b->a */
1402: rs = 0.0;
1403: /* L part */
1404: pc1 = b->a + bi[i];
1405: pj = b->j + bi[i];
1406: nz = bi[i+1] - bi[i];
1407: for (j=0; j<nz; j++) {
1408: col = pj[j];
1409: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1410: }
1411: /* U part */
1412: pc1 = b->a + bdiag[i+1]+1;
1413: pj = b->j + bdiag[i+1]+1;
1414: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1415: for (j=0; j<nz; j++) {
1416: col = pj[j];
1417: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1418: }
1420: sctx.rs = rs;
1421: sctx.pv = rtmp1[i];
1422: MatPivotCheck(A,info,&sctx,i);
1423: if (sctx.newshift) break;
1424: pc1 = b->a + bdiag[i]; /* Mark diagonal */
1425: *pc1 = 1.0/sctx.pv;
1427: /* Now take care of diagonal 2x2 block. */
1428: pc2 = rtmp2 + i;
1429: if (*pc2 != 0.0) {
1430: mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1431: *pc2 = mul1; /* insert L entry */
1432: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1433: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1434: for (j=0; j<nz; j++) {
1435: col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1436: }
1437: PetscLogFlops(1+2*nz);
1438: }
1440: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1441: rs = 0.0;
1442: /* L part */
1443: pc2 = b->a + bi[i+1];
1444: pj = b->j + bi[i+1];
1445: nz = bi[i+2] - bi[i+1];
1446: for (j=0; j<nz; j++) {
1447: col = pj[j];
1448: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1449: }
1450: /* U part */
1451: pc2 = b->a + bdiag[i+2]+1;
1452: pj = b->j + bdiag[i+2]+1;
1453: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1454: for (j=0; j<nz; j++) {
1455: col = pj[j];
1456: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1457: }
1459: sctx.rs = rs;
1460: sctx.pv = rtmp2[i+1];
1461: MatPivotCheck(A,info,&sctx,i+1);
1462: if (sctx.newshift) break;
1463: pc2 = b->a + bdiag[i+1];
1464: *pc2 = 1.0/sctx.pv;
1465: break;
1467: case 3:
1468: /*----------*/
1469: /* zero rtmp */
1470: /* L part */
1471: nz = bi[i+1] - bi[i];
1472: bjtmp = bj + bi[i];
1473: for (j=0; j<nz; j++) {
1474: col = bjtmp[j];
1475: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1476: }
1478: /* U part */
1479: nz = bdiag[i]-bdiag[i+1];
1480: bjtmp = bj + bdiag[i+1]+1;
1481: for (j=0; j<nz; j++) {
1482: col = bjtmp[j];
1483: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1484: }
1486: /* load in initial (unfactored row) */
1487: nz = ai[r[i]+1] - ai[r[i]];
1488: ajtmp = aj + ai[r[i]];
1489: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1490: for (j=0; j<nz; j++) {
1491: col = ics[ajtmp[j]];
1492: rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1493: }
1494: /* ZeropivotApply(): shift the diagonal of the matrix */
1495: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1497: /* elimination */
1498: bjtmp = bj + bi[i];
1499: row = *bjtmp++; /* pivot row */
1500: nzL = bi[i+1] - bi[i];
1501: for (k=0; k < nzL; k++) {
1502: pc1 = rtmp1 + row;
1503: pc2 = rtmp2 + row;
1504: pc3 = rtmp3 + row;
1505: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1506: pv = b->a + bdiag[row];
1507: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1508: *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1510: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1511: pv = b->a + bdiag[row+1]+1;
1512: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1513: for (j=0; j<nz; j++) {
1514: col = pj[j];
1515: rtmp1[col] -= mul1 * pv[j];
1516: rtmp2[col] -= mul2 * pv[j];
1517: rtmp3[col] -= mul3 * pv[j];
1518: }
1519: PetscLogFlops(3+6*nz);
1520: }
1521: row = *bjtmp++;
1522: }
1524: /* finished row i; check zero pivot, then stick row i into b->a */
1525: rs = 0.0;
1526: /* L part */
1527: pc1 = b->a + bi[i];
1528: pj = b->j + bi[i];
1529: nz = bi[i+1] - bi[i];
1530: for (j=0; j<nz; j++) {
1531: col = pj[j];
1532: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1533: }
1534: /* U part */
1535: pc1 = b->a + bdiag[i+1]+1;
1536: pj = b->j + bdiag[i+1]+1;
1537: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1538: for (j=0; j<nz; j++) {
1539: col = pj[j];
1540: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1541: }
1543: sctx.rs = rs;
1544: sctx.pv = rtmp1[i];
1545: MatPivotCheck(A,info,&sctx,i);
1546: if (sctx.newshift) break;
1547: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1548: *pc1 = 1.0/sctx.pv;
1550: /* Now take care of 1st column of diagonal 3x3 block. */
1551: pc2 = rtmp2 + i;
1552: pc3 = rtmp3 + i;
1553: if (*pc2 != 0.0 || *pc3 != 0.0) {
1554: mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1555: mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1556: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1557: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1558: for (j=0; j<nz; j++) {
1559: col = pj[j];
1560: rtmp2[col] -= mul2 * rtmp1[col];
1561: rtmp3[col] -= mul3 * rtmp1[col];
1562: }
1563: PetscLogFlops(2+4*nz);
1564: }
1566: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1567: rs = 0.0;
1568: /* L part */
1569: pc2 = b->a + bi[i+1];
1570: pj = b->j + bi[i+1];
1571: nz = bi[i+2] - bi[i+1];
1572: for (j=0; j<nz; j++) {
1573: col = pj[j];
1574: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1575: }
1576: /* U part */
1577: pc2 = b->a + bdiag[i+2]+1;
1578: pj = b->j + bdiag[i+2]+1;
1579: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1580: for (j=0; j<nz; j++) {
1581: col = pj[j];
1582: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1583: }
1585: sctx.rs = rs;
1586: sctx.pv = rtmp2[i+1];
1587: MatPivotCheck(A,info,&sctx,i+1);
1588: if (sctx.newshift) break;
1589: pc2 = b->a + bdiag[i+1];
1590: *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1592: /* Now take care of 2nd column of diagonal 3x3 block. */
1593: pc3 = rtmp3 + i+1;
1594: if (*pc3 != 0.0) {
1595: mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1596: pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */
1597: nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1598: for (j=0; j<nz; j++) {
1599: col = pj[j];
1600: rtmp3[col] -= mul3 * rtmp2[col];
1601: }
1602: PetscLogFlops(1+2*nz);
1603: }
1605: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1606: rs = 0.0;
1607: /* L part */
1608: pc3 = b->a + bi[i+2];
1609: pj = b->j + bi[i+2];
1610: nz = bi[i+3] - bi[i+2];
1611: for (j=0; j<nz; j++) {
1612: col = pj[j];
1613: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1614: }
1615: /* U part */
1616: pc3 = b->a + bdiag[i+3]+1;
1617: pj = b->j + bdiag[i+3]+1;
1618: nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1619: for (j=0; j<nz; j++) {
1620: col = pj[j];
1621: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1622: }
1624: sctx.rs = rs;
1625: sctx.pv = rtmp3[i+2];
1626: MatPivotCheck(A,info,&sctx,i+2);
1627: if (sctx.newshift) break;
1628: pc3 = b->a + bdiag[i+2];
1629: *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1630: break;
1631: case 4:
1632: /*----------*/
1633: /* zero rtmp */
1634: /* L part */
1635: nz = bi[i+1] - bi[i];
1636: bjtmp = bj + bi[i];
1637: for (j=0; j<nz; j++) {
1638: col = bjtmp[j];
1639: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1640: }
1642: /* U part */
1643: nz = bdiag[i]-bdiag[i+1];
1644: bjtmp = bj + bdiag[i+1]+1;
1645: for (j=0; j<nz; j++) {
1646: col = bjtmp[j];
1647: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1648: }
1650: /* load in initial (unfactored row) */
1651: nz = ai[r[i]+1] - ai[r[i]];
1652: ajtmp = aj + ai[r[i]];
1653: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1654: for (j=0; j<nz; j++) {
1655: col = ics[ajtmp[j]];
1656: rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1657: }
1658: /* ZeropivotApply(): shift the diagonal of the matrix */
1659: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1661: /* elimination */
1662: bjtmp = bj + bi[i];
1663: row = *bjtmp++; /* pivot row */
1664: nzL = bi[i+1] - bi[i];
1665: for (k=0; k < nzL; k++) {
1666: pc1 = rtmp1 + row;
1667: pc2 = rtmp2 + row;
1668: pc3 = rtmp3 + row;
1669: pc4 = rtmp4 + row;
1670: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1671: pv = b->a + bdiag[row];
1672: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1673: *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1675: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1676: pv = b->a + bdiag[row+1]+1;
1677: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1678: for (j=0; j<nz; j++) {
1679: col = pj[j];
1680: rtmp1[col] -= mul1 * pv[j];
1681: rtmp2[col] -= mul2 * pv[j];
1682: rtmp3[col] -= mul3 * pv[j];
1683: rtmp4[col] -= mul4 * pv[j];
1684: }
1685: PetscLogFlops(4+8*nz);
1686: }
1687: row = *bjtmp++;
1688: }
1690: /* finished row i; check zero pivot, then stick row i into b->a */
1691: rs = 0.0;
1692: /* L part */
1693: pc1 = b->a + bi[i];
1694: pj = b->j + bi[i];
1695: nz = bi[i+1] - bi[i];
1696: for (j=0; j<nz; j++) {
1697: col = pj[j];
1698: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1699: }
1700: /* U part */
1701: pc1 = b->a + bdiag[i+1]+1;
1702: pj = b->j + bdiag[i+1]+1;
1703: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1704: for (j=0; j<nz; j++) {
1705: col = pj[j];
1706: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1707: }
1709: sctx.rs = rs;
1710: sctx.pv = rtmp1[i];
1711: MatPivotCheck(A,info,&sctx,i);
1712: if (sctx.newshift) break;
1713: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1714: *pc1 = 1.0/sctx.pv;
1716: /* Now take care of 1st column of diagonal 4x4 block. */
1717: pc2 = rtmp2 + i;
1718: pc3 = rtmp3 + i;
1719: pc4 = rtmp4 + i;
1720: if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1721: mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1722: mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1723: mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1724: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1725: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1726: for (j=0; j<nz; j++) {
1727: col = pj[j];
1728: rtmp2[col] -= mul2 * rtmp1[col];
1729: rtmp3[col] -= mul3 * rtmp1[col];
1730: rtmp4[col] -= mul4 * rtmp1[col];
1731: }
1732: PetscLogFlops(3+6*nz);
1733: }
1735: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1736: rs = 0.0;
1737: /* L part */
1738: pc2 = b->a + bi[i+1];
1739: pj = b->j + bi[i+1];
1740: nz = bi[i+2] - bi[i+1];
1741: for (j=0; j<nz; j++) {
1742: col = pj[j];
1743: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1744: }
1745: /* U part */
1746: pc2 = b->a + bdiag[i+2]+1;
1747: pj = b->j + bdiag[i+2]+1;
1748: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1749: for (j=0; j<nz; j++) {
1750: col = pj[j];
1751: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1752: }
1754: sctx.rs = rs;
1755: sctx.pv = rtmp2[i+1];
1756: MatPivotCheck(A,info,&sctx,i+1);
1757: if (sctx.newshift) break;
1758: pc2 = b->a + bdiag[i+1];
1759: *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1761: /* Now take care of 2nd column of diagonal 4x4 block. */
1762: pc3 = rtmp3 + i+1;
1763: pc4 = rtmp4 + i+1;
1764: if (*pc3 != 0.0 || *pc4 != 0.0) {
1765: mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1766: mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1767: pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */
1768: nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1769: for (j=0; j<nz; j++) {
1770: col = pj[j];
1771: rtmp3[col] -= mul3 * rtmp2[col];
1772: rtmp4[col] -= mul4 * rtmp2[col];
1773: }
1774: PetscLogFlops(4*nz);
1775: }
1777: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1778: rs = 0.0;
1779: /* L part */
1780: pc3 = b->a + bi[i+2];
1781: pj = b->j + bi[i+2];
1782: nz = bi[i+3] - bi[i+2];
1783: for (j=0; j<nz; j++) {
1784: col = pj[j];
1785: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1786: }
1787: /* U part */
1788: pc3 = b->a + bdiag[i+3]+1;
1789: pj = b->j + bdiag[i+3]+1;
1790: nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1791: for (j=0; j<nz; j++) {
1792: col = pj[j];
1793: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1794: }
1796: sctx.rs = rs;
1797: sctx.pv = rtmp3[i+2];
1798: MatPivotCheck(A,info,&sctx,i+2);
1799: if (sctx.newshift) break;
1800: pc3 = b->a + bdiag[i+2];
1801: *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1803: /* Now take care of 3rd column of diagonal 4x4 block. */
1804: pc4 = rtmp4 + i+2;
1805: if (*pc4 != 0.0) {
1806: mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1807: pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */
1808: nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1809: for (j=0; j<nz; j++) {
1810: col = pj[j];
1811: rtmp4[col] -= mul4 * rtmp3[col];
1812: }
1813: PetscLogFlops(1+2*nz);
1814: }
1816: /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1817: rs = 0.0;
1818: /* L part */
1819: pc4 = b->a + bi[i+3];
1820: pj = b->j + bi[i+3];
1821: nz = bi[i+4] - bi[i+3];
1822: for (j=0; j<nz; j++) {
1823: col = pj[j];
1824: pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1825: }
1826: /* U part */
1827: pc4 = b->a + bdiag[i+4]+1;
1828: pj = b->j + bdiag[i+4]+1;
1829: nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1830: for (j=0; j<nz; j++) {
1831: col = pj[j];
1832: pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1833: }
1835: sctx.rs = rs;
1836: sctx.pv = rtmp4[i+3];
1837: MatPivotCheck(A,info,&sctx,i+3);
1838: if (sctx.newshift) break;
1839: pc4 = b->a + bdiag[i+3];
1840: *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1841: break;
1843: default:
1844: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1845: }
1846: if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1847: i += nodesz; /* Update the row */
1848: }
1850: /* MatPivotRefine() */
1851: if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1852: /*
1853: * if no shift in this attempt & shifting & started shifting & can refine,
1854: * then try lower shift
1855: */
1856: sctx.shift_hi = sctx.shift_fraction;
1857: sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1858: sctx.shift_amount = sctx.shift_fraction * sctx.shift_top;
1859: sctx.newshift = PETSC_TRUE;
1860: sctx.nshift++;
1861: }
1862: } while (sctx.newshift);
1864: PetscFree(rtmp1);
1865: PetscFree(tmp_vec2);
1866: ISRestoreIndices(isicol,&ic);
1867: ISRestoreIndices(isrow,&r);
1869: C->ops->solve = MatSolve_SeqAIJ;
1870: C->ops->solveadd = MatSolveAdd_SeqAIJ;
1871: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ;
1872: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1873: C->ops->matsolve = MatMatSolve_SeqAIJ;
1874: C->assembled = PETSC_TRUE;
1875: C->preallocated = PETSC_TRUE;
1877: PetscLogFlops(C->cmap->n);
1879: /* MatShiftView(A,info,&sctx) */
1880: if (sctx.nshift) {
1881: if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1882: PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);
1883: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1884: PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);
1885: } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1886: PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);
1887: }
1888: }
1889: Mat_CheckInode_FactorLU(C,PETSC_FALSE);
1890: return(0);
1891: }
1895: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1896: {
1897: Mat C = B;
1898: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1899: IS iscol = b->col,isrow = b->row,isicol = b->icol;
1900: PetscErrorCode ierr;
1901: const PetscInt *r,*ic,*c,*ics;
1902: PetscInt n = A->rmap->n,*bi = b->i;
1903: PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1904: PetscInt i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1905: PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1906: PetscScalar mul1,mul2,mul3,tmp;
1907: MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1908: const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1909: PetscReal rs=0.0;
1910: FactorShiftCtx sctx;
1913: sctx.shift_top = 0;
1914: sctx.nshift_max = 0;
1915: sctx.shift_lo = 0;
1916: sctx.shift_hi = 0;
1917: sctx.shift_fraction = 0;
1919: /* if both shift schemes are chosen by user, only use info->shiftpd */
1920: if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1921: sctx.shift_top = 0;
1922: for (i=0; i<n; i++) {
1923: /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1924: rs = 0.0;
1925: ajtmp = aj + ai[i];
1926: rtmp1 = aa + ai[i];
1927: nz = ai[i+1] - ai[i];
1928: for (j=0; j<nz; j++) {
1929: if (*ajtmp != i) {
1930: rs += PetscAbsScalar(*rtmp1++);
1931: } else {
1932: rs -= PetscRealPart(*rtmp1++);
1933: }
1934: ajtmp++;
1935: }
1936: if (rs>sctx.shift_top) sctx.shift_top = rs;
1937: }
1938: if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1939: sctx.shift_top *= 1.1;
1940: sctx.nshift_max = 5;
1941: sctx.shift_lo = 0.;
1942: sctx.shift_hi = 1.;
1943: }
1944: sctx.shift_amount = 0;
1945: sctx.nshift = 0;
1947: ISGetIndices(isrow,&r);
1948: ISGetIndices(iscol,&c);
1949: ISGetIndices(isicol,&ic);
1950: PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);
1951: PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));
1952: ics = ic;
1953: rtmp22 = rtmp11 + n;
1954: rtmp33 = rtmp22 + n;
1956: node_max = a->inode.node_count;
1957: ns = a->inode.size;
1958: if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1960: /* If max inode size > 3, split it into two inodes.*/
1961: /* also map the inode sizes according to the ordering */
1962: PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);
1963: for (i=0,j=0; i<node_max; ++i,++j) {
1964: if (ns[i]>3) {
1965: tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */
1966: ++j;
1967: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1968: } else {
1969: tmp_vec1[j] = ns[i];
1970: }
1971: }
1972: /* Use the correct node_max */
1973: node_max = j;
1975: /* Now reorder the inode info based on mat re-ordering info */
1976: /* First create a row -> inode_size_array_index map */
1977: PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);
1978: PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);
1979: for (i=0,row=0; i<node_max; i++) {
1980: nodesz = tmp_vec1[i];
1981: for (j=0; j<nodesz; j++,row++) {
1982: nsmap[row] = i;
1983: }
1984: }
1985: /* Using nsmap, create a reordered ns structure */
1986: for (i=0,j=0; i< node_max; i++) {
1987: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1988: tmp_vec2[i] = nodesz;
1989: j += nodesz;
1990: }
1991: PetscFree(nsmap);
1992: PetscFree(tmp_vec1);
1993: /* Now use the correct ns */
1994: ns = tmp_vec2;
1996: do {
1997: sctx.newshift = PETSC_FALSE;
1998: /* Now loop over each block-row, and do the factorization */
1999: for (i=0,row=0; i<node_max; i++) {
2000: nodesz = ns[i];
2001: nz = bi[row+1] - bi[row];
2002: bjtmp = bj + bi[row];
2004: switch (nodesz) {
2005: case 1:
2006: for (j=0; j<nz; j++) {
2007: idx = bjtmp[j];
2008: rtmp11[idx] = 0.0;
2009: }
2011: /* load in initial (unfactored row) */
2012: idx = r[row];
2013: nz_tmp = ai[idx+1] - ai[idx];
2014: ajtmp = aj + ai[idx];
2015: v1 = aa + ai[idx];
2017: for (j=0; j<nz_tmp; j++) {
2018: idx = ics[ajtmp[j]];
2019: rtmp11[idx] = v1[j];
2020: }
2021: rtmp11[ics[r[row]]] += sctx.shift_amount;
2023: prow = *bjtmp++;
2024: while (prow < row) {
2025: pc1 = rtmp11 + prow;
2026: if (*pc1 != 0.0) {
2027: pv = ba + bd[prow];
2028: pj = nbj + bd[prow];
2029: mul1 = *pc1 * *pv++;
2030: *pc1 = mul1;
2031: nz_tmp = bi[prow+1] - bd[prow] - 1;
2032: PetscLogFlops(1+2*nz_tmp);
2033: for (j=0; j<nz_tmp; j++) {
2034: tmp = pv[j];
2035: idx = pj[j];
2036: rtmp11[idx] -= mul1 * tmp;
2037: }
2038: }
2039: prow = *bjtmp++;
2040: }
2041: pj = bj + bi[row];
2042: pc1 = ba + bi[row];
2044: sctx.pv = rtmp11[row];
2045: rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2046: rs = 0.0;
2047: for (j=0; j<nz; j++) {
2048: idx = pj[j];
2049: pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2050: if (idx != row) rs += PetscAbsScalar(pc1[j]);
2051: }
2052: sctx.rs = rs;
2053: MatPivotCheck(A,info,&sctx,row);
2054: if (sctx.newshift) goto endofwhile;
2055: break;
2057: case 2:
2058: for (j=0; j<nz; j++) {
2059: idx = bjtmp[j];
2060: rtmp11[idx] = 0.0;
2061: rtmp22[idx] = 0.0;
2062: }
2064: /* load in initial (unfactored row) */
2065: idx = r[row];
2066: nz_tmp = ai[idx+1] - ai[idx];
2067: ajtmp = aj + ai[idx];
2068: v1 = aa + ai[idx];
2069: v2 = aa + ai[idx+1];
2070: for (j=0; j<nz_tmp; j++) {
2071: idx = ics[ajtmp[j]];
2072: rtmp11[idx] = v1[j];
2073: rtmp22[idx] = v2[j];
2074: }
2075: rtmp11[ics[r[row]]] += sctx.shift_amount;
2076: rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2078: prow = *bjtmp++;
2079: while (prow < row) {
2080: pc1 = rtmp11 + prow;
2081: pc2 = rtmp22 + prow;
2082: if (*pc1 != 0.0 || *pc2 != 0.0) {
2083: pv = ba + bd[prow];
2084: pj = nbj + bd[prow];
2085: mul1 = *pc1 * *pv;
2086: mul2 = *pc2 * *pv;
2087: ++pv;
2088: *pc1 = mul1;
2089: *pc2 = mul2;
2091: nz_tmp = bi[prow+1] - bd[prow] - 1;
2092: for (j=0; j<nz_tmp; j++) {
2093: tmp = pv[j];
2094: idx = pj[j];
2095: rtmp11[idx] -= mul1 * tmp;
2096: rtmp22[idx] -= mul2 * tmp;
2097: }
2098: PetscLogFlops(2+4*nz_tmp);
2099: }
2100: prow = *bjtmp++;
2101: }
2103: /* Now take care of diagonal 2x2 block. Note: prow = row here */
2104: pc1 = rtmp11 + prow;
2105: pc2 = rtmp22 + prow;
2107: sctx.pv = *pc1;
2108: pj = bj + bi[prow];
2109: rs = 0.0;
2110: for (j=0; j<nz; j++) {
2111: idx = pj[j];
2112: if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2113: }
2114: sctx.rs = rs;
2115: MatPivotCheck(A,info,&sctx,row);
2116: if (sctx.newshift) goto endofwhile;
2118: if (*pc2 != 0.0) {
2119: pj = nbj + bd[prow];
2120: mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2121: *pc2 = mul2;
2122: nz_tmp = bi[prow+1] - bd[prow] - 1;
2123: for (j=0; j<nz_tmp; j++) {
2124: idx = pj[j];
2125: tmp = rtmp11[idx];
2126: rtmp22[idx] -= mul2 * tmp;
2127: }
2128: PetscLogFlops(1+2*nz_tmp);
2129: }
2131: pj = bj + bi[row];
2132: pc1 = ba + bi[row];
2133: pc2 = ba + bi[row+1];
2135: sctx.pv = rtmp22[row+1];
2136: rs = 0.0;
2137: rtmp11[row] = 1.0/rtmp11[row];
2138: rtmp22[row+1] = 1.0/rtmp22[row+1];
2139: /* copy row entries from dense representation to sparse */
2140: for (j=0; j<nz; j++) {
2141: idx = pj[j];
2142: pc1[j] = rtmp11[idx];
2143: pc2[j] = rtmp22[idx];
2144: if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2145: }
2146: sctx.rs = rs;
2147: MatPivotCheck(A,info,&sctx,row+1);
2148: if (sctx.newshift) goto endofwhile;
2149: break;
2151: case 3:
2152: for (j=0; j<nz; j++) {
2153: idx = bjtmp[j];
2154: rtmp11[idx] = 0.0;
2155: rtmp22[idx] = 0.0;
2156: rtmp33[idx] = 0.0;
2157: }
2158: /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2159: idx = r[row];
2160: nz_tmp = ai[idx+1] - ai[idx];
2161: ajtmp = aj + ai[idx];
2162: v1 = aa + ai[idx];
2163: v2 = aa + ai[idx+1];
2164: v3 = aa + ai[idx+2];
2165: for (j=0; j<nz_tmp; j++) {
2166: idx = ics[ajtmp[j]];
2167: rtmp11[idx] = v1[j];
2168: rtmp22[idx] = v2[j];
2169: rtmp33[idx] = v3[j];
2170: }
2171: rtmp11[ics[r[row]]] += sctx.shift_amount;
2172: rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2173: rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2175: /* loop over all pivot row blocks above this row block */
2176: prow = *bjtmp++;
2177: while (prow < row) {
2178: pc1 = rtmp11 + prow;
2179: pc2 = rtmp22 + prow;
2180: pc3 = rtmp33 + prow;
2181: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2182: pv = ba + bd[prow];
2183: pj = nbj + bd[prow];
2184: mul1 = *pc1 * *pv;
2185: mul2 = *pc2 * *pv;
2186: mul3 = *pc3 * *pv;
2187: ++pv;
2188: *pc1 = mul1;
2189: *pc2 = mul2;
2190: *pc3 = mul3;
2192: nz_tmp = bi[prow+1] - bd[prow] - 1;
2193: /* update this row based on pivot row */
2194: for (j=0; j<nz_tmp; j++) {
2195: tmp = pv[j];
2196: idx = pj[j];
2197: rtmp11[idx] -= mul1 * tmp;
2198: rtmp22[idx] -= mul2 * tmp;
2199: rtmp33[idx] -= mul3 * tmp;
2200: }
2201: PetscLogFlops(3+6*nz_tmp);
2202: }
2203: prow = *bjtmp++;
2204: }
2206: /* Now take care of diagonal 3x3 block in this set of rows */
2207: /* note: prow = row here */
2208: pc1 = rtmp11 + prow;
2209: pc2 = rtmp22 + prow;
2210: pc3 = rtmp33 + prow;
2212: sctx.pv = *pc1;
2213: pj = bj + bi[prow];
2214: rs = 0.0;
2215: for (j=0; j<nz; j++) {
2216: idx = pj[j];
2217: if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2218: }
2219: sctx.rs = rs;
2220: MatPivotCheck(A,info,&sctx,row);
2221: if (sctx.newshift) goto endofwhile;
2223: if (*pc2 != 0.0 || *pc3 != 0.0) {
2224: mul2 = (*pc2)/(*pc1);
2225: mul3 = (*pc3)/(*pc1);
2226: *pc2 = mul2;
2227: *pc3 = mul3;
2228: nz_tmp = bi[prow+1] - bd[prow] - 1;
2229: pj = nbj + bd[prow];
2230: for (j=0; j<nz_tmp; j++) {
2231: idx = pj[j];
2232: tmp = rtmp11[idx];
2233: rtmp22[idx] -= mul2 * tmp;
2234: rtmp33[idx] -= mul3 * tmp;
2235: }
2236: PetscLogFlops(2+4*nz_tmp);
2237: }
2238: ++prow;
2240: pc2 = rtmp22 + prow;
2241: pc3 = rtmp33 + prow;
2242: sctx.pv = *pc2;
2243: pj = bj + bi[prow];
2244: rs = 0.0;
2245: for (j=0; j<nz; j++) {
2246: idx = pj[j];
2247: if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2248: }
2249: sctx.rs = rs;
2250: MatPivotCheck(A,info,&sctx,row+1);
2251: if (sctx.newshift) goto endofwhile;
2253: if (*pc3 != 0.0) {
2254: mul3 = (*pc3)/(*pc2);
2255: *pc3 = mul3;
2256: pj = nbj + bd[prow];
2257: nz_tmp = bi[prow+1] - bd[prow] - 1;
2258: for (j=0; j<nz_tmp; j++) {
2259: idx = pj[j];
2260: tmp = rtmp22[idx];
2261: rtmp33[idx] -= mul3 * tmp;
2262: }
2263: PetscLogFlops(1+2*nz_tmp);
2264: }
2266: pj = bj + bi[row];
2267: pc1 = ba + bi[row];
2268: pc2 = ba + bi[row+1];
2269: pc3 = ba + bi[row+2];
2271: sctx.pv = rtmp33[row+2];
2272: rs = 0.0;
2273: rtmp11[row] = 1.0/rtmp11[row];
2274: rtmp22[row+1] = 1.0/rtmp22[row+1];
2275: rtmp33[row+2] = 1.0/rtmp33[row+2];
2276: /* copy row entries from dense representation to sparse */
2277: for (j=0; j<nz; j++) {
2278: idx = pj[j];
2279: pc1[j] = rtmp11[idx];
2280: pc2[j] = rtmp22[idx];
2281: pc3[j] = rtmp33[idx];
2282: if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2283: }
2285: sctx.rs = rs;
2286: MatPivotCheck(A,info,&sctx,row+2);
2287: if (sctx.newshift) goto endofwhile;
2288: break;
2290: default:
2291: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2292: }
2293: row += nodesz; /* Update the row */
2294: }
2295: endofwhile:;
2296: } while (sctx.newshift);
2297: PetscFree(rtmp11);
2298: PetscFree(tmp_vec2);
2299: ISRestoreIndices(isicol,&ic);
2300: ISRestoreIndices(isrow,&r);
2301: ISRestoreIndices(iscol,&c);
2303: (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2304: /* do not set solve add, since MatSolve_Inode + Add is faster */
2305: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace;
2306: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2307: C->assembled = PETSC_TRUE;
2308: C->preallocated = PETSC_TRUE;
2309: if (sctx.nshift) {
2310: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2311: PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);
2312: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2313: PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);
2314: }
2315: }
2316: PetscLogFlops(C->cmap->n);
2317: Mat_CheckInode(C,PETSC_FALSE);
2318: return(0);
2319: }
2322: /* ----------------------------------------------------------- */
2325: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2326: {
2327: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2328: IS iscol = a->col,isrow = a->row;
2329: PetscErrorCode ierr;
2330: const PetscInt *r,*c,*rout,*cout;
2331: PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
2332: PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
2333: PetscScalar *x,*tmp,*tmps,tmp0,tmp1;
2334: PetscScalar sum1,sum2,sum3,sum4,sum5;
2335: const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2336: const PetscScalar *b;
2339: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2340: node_max = a->inode.node_count;
2341: ns = a->inode.size; /* Node Size array */
2343: VecGetArrayRead(bb,&b);
2344: VecGetArray(xx,&x);
2345: tmp = a->solve_work;
2347: ISGetIndices(isrow,&rout); r = rout;
2348: ISGetIndices(iscol,&cout); c = cout;
2350: /* forward solve the lower triangular */
2351: tmps = tmp;
2352: aa = a_a;
2353: aj = a_j;
2354: ad = a->diag;
2356: for (i = 0,row = 0; i< node_max; ++i) {
2357: nsz = ns[i];
2358: aii = ai[row];
2359: v1 = aa + aii;
2360: vi = aj + aii;
2361: nz = ai[row+1]- ai[row];
2363: if (i < node_max-1) {
2364: /* Prefetch the indices for the next block */
2365: PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2366: /* Prefetch the data for the next block */
2367: PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2368: }
2370: switch (nsz) { /* Each loop in 'case' is unrolled */
2371: case 1:
2372: sum1 = b[r[row]];
2373: for (j=0; j<nz-1; j+=2) {
2374: i0 = vi[j];
2375: i1 = vi[j+1];
2376: tmp0 = tmps[i0];
2377: tmp1 = tmps[i1];
2378: sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2379: }
2380: if (j == nz-1) {
2381: tmp0 = tmps[vi[j]];
2382: sum1 -= v1[j]*tmp0;
2383: }
2384: tmp[row++]=sum1;
2385: break;
2386: case 2:
2387: sum1 = b[r[row]];
2388: sum2 = b[r[row+1]];
2389: v2 = aa + ai[row+1];
2391: for (j=0; j<nz-1; j+=2) {
2392: i0 = vi[j];
2393: i1 = vi[j+1];
2394: tmp0 = tmps[i0];
2395: tmp1 = tmps[i1];
2396: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2397: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2398: }
2399: if (j == nz-1) {
2400: tmp0 = tmps[vi[j]];
2401: sum1 -= v1[j] *tmp0;
2402: sum2 -= v2[j] *tmp0;
2403: }
2404: sum2 -= v2[nz] * sum1;
2405: tmp[row++]=sum1;
2406: tmp[row++]=sum2;
2407: break;
2408: case 3:
2409: sum1 = b[r[row]];
2410: sum2 = b[r[row+1]];
2411: sum3 = b[r[row+2]];
2412: v2 = aa + ai[row+1];
2413: v3 = aa + ai[row+2];
2415: for (j=0; j<nz-1; j+=2) {
2416: i0 = vi[j];
2417: i1 = vi[j+1];
2418: tmp0 = tmps[i0];
2419: tmp1 = tmps[i1];
2420: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2421: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2422: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2423: }
2424: if (j == nz-1) {
2425: tmp0 = tmps[vi[j]];
2426: sum1 -= v1[j] *tmp0;
2427: sum2 -= v2[j] *tmp0;
2428: sum3 -= v3[j] *tmp0;
2429: }
2430: sum2 -= v2[nz] * sum1;
2431: sum3 -= v3[nz] * sum1;
2432: sum3 -= v3[nz+1] * sum2;
2433: tmp[row++]=sum1;
2434: tmp[row++]=sum2;
2435: tmp[row++]=sum3;
2436: break;
2438: case 4:
2439: sum1 = b[r[row]];
2440: sum2 = b[r[row+1]];
2441: sum3 = b[r[row+2]];
2442: sum4 = b[r[row+3]];
2443: v2 = aa + ai[row+1];
2444: v3 = aa + ai[row+2];
2445: v4 = aa + ai[row+3];
2447: for (j=0; j<nz-1; j+=2) {
2448: i0 = vi[j];
2449: i1 = vi[j+1];
2450: tmp0 = tmps[i0];
2451: tmp1 = tmps[i1];
2452: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2453: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2454: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2455: sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2456: }
2457: if (j == nz-1) {
2458: tmp0 = tmps[vi[j]];
2459: sum1 -= v1[j] *tmp0;
2460: sum2 -= v2[j] *tmp0;
2461: sum3 -= v3[j] *tmp0;
2462: sum4 -= v4[j] *tmp0;
2463: }
2464: sum2 -= v2[nz] * sum1;
2465: sum3 -= v3[nz] * sum1;
2466: sum4 -= v4[nz] * sum1;
2467: sum3 -= v3[nz+1] * sum2;
2468: sum4 -= v4[nz+1] * sum2;
2469: sum4 -= v4[nz+2] * sum3;
2471: tmp[row++]=sum1;
2472: tmp[row++]=sum2;
2473: tmp[row++]=sum3;
2474: tmp[row++]=sum4;
2475: break;
2476: case 5:
2477: sum1 = b[r[row]];
2478: sum2 = b[r[row+1]];
2479: sum3 = b[r[row+2]];
2480: sum4 = b[r[row+3]];
2481: sum5 = b[r[row+4]];
2482: v2 = aa + ai[row+1];
2483: v3 = aa + ai[row+2];
2484: v4 = aa + ai[row+3];
2485: v5 = aa + ai[row+4];
2487: for (j=0; j<nz-1; j+=2) {
2488: i0 = vi[j];
2489: i1 = vi[j+1];
2490: tmp0 = tmps[i0];
2491: tmp1 = tmps[i1];
2492: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2493: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2494: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2495: sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2496: sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2497: }
2498: if (j == nz-1) {
2499: tmp0 = tmps[vi[j]];
2500: sum1 -= v1[j] *tmp0;
2501: sum2 -= v2[j] *tmp0;
2502: sum3 -= v3[j] *tmp0;
2503: sum4 -= v4[j] *tmp0;
2504: sum5 -= v5[j] *tmp0;
2505: }
2507: sum2 -= v2[nz] * sum1;
2508: sum3 -= v3[nz] * sum1;
2509: sum4 -= v4[nz] * sum1;
2510: sum5 -= v5[nz] * sum1;
2511: sum3 -= v3[nz+1] * sum2;
2512: sum4 -= v4[nz+1] * sum2;
2513: sum5 -= v5[nz+1] * sum2;
2514: sum4 -= v4[nz+2] * sum3;
2515: sum5 -= v5[nz+2] * sum3;
2516: sum5 -= v5[nz+3] * sum4;
2518: tmp[row++]=sum1;
2519: tmp[row++]=sum2;
2520: tmp[row++]=sum3;
2521: tmp[row++]=sum4;
2522: tmp[row++]=sum5;
2523: break;
2524: default:
2525: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2526: }
2527: }
2528: /* backward solve the upper triangular */
2529: for (i=node_max -1,row = n-1; i>=0; i--) {
2530: nsz = ns[i];
2531: aii = ad[row+1] + 1;
2532: v1 = aa + aii;
2533: vi = aj + aii;
2534: nz = ad[row]- ad[row+1] - 1;
2536: if (i > 0) {
2537: /* Prefetch the indices for the next block */
2538: PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2539: /* Prefetch the data for the next block */
2540: PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2541: }
2543: switch (nsz) { /* Each loop in 'case' is unrolled */
2544: case 1:
2545: sum1 = tmp[row];
2547: for (j=0; j<nz-1; j+=2) {
2548: i0 = vi[j];
2549: i1 = vi[j+1];
2550: tmp0 = tmps[i0];
2551: tmp1 = tmps[i1];
2552: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2553: }
2554: if (j == nz-1) {
2555: tmp0 = tmps[vi[j]];
2556: sum1 -= v1[j]*tmp0;
2557: }
2558: x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2559: break;
2560: case 2:
2561: sum1 = tmp[row];
2562: sum2 = tmp[row-1];
2563: v2 = aa + ad[row] + 1;
2564: for (j=0; j<nz-1; j+=2) {
2565: i0 = vi[j];
2566: i1 = vi[j+1];
2567: tmp0 = tmps[i0];
2568: tmp1 = tmps[i1];
2569: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2570: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2571: }
2572: if (j == nz-1) {
2573: tmp0 = tmps[vi[j]];
2574: sum1 -= v1[j]* tmp0;
2575: sum2 -= v2[j+1]* tmp0;
2576: }
2578: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2579: sum2 -= v2[0] * tmp0;
2580: x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2581: break;
2582: case 3:
2583: sum1 = tmp[row];
2584: sum2 = tmp[row -1];
2585: sum3 = tmp[row -2];
2586: v2 = aa + ad[row] + 1;
2587: v3 = aa + ad[row -1] + 1;
2588: for (j=0; j<nz-1; j+=2) {
2589: i0 = vi[j];
2590: i1 = vi[j+1];
2591: tmp0 = tmps[i0];
2592: tmp1 = tmps[i1];
2593: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2594: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2595: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2596: }
2597: if (j== nz-1) {
2598: tmp0 = tmps[vi[j]];
2599: sum1 -= v1[j] * tmp0;
2600: sum2 -= v2[j+1] * tmp0;
2601: sum3 -= v3[j+2] * tmp0;
2602: }
2603: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2604: sum2 -= v2[0]* tmp0;
2605: sum3 -= v3[1] * tmp0;
2606: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2607: sum3 -= v3[0]* tmp0;
2608: x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2610: break;
2611: case 4:
2612: sum1 = tmp[row];
2613: sum2 = tmp[row -1];
2614: sum3 = tmp[row -2];
2615: sum4 = tmp[row -3];
2616: v2 = aa + ad[row]+1;
2617: v3 = aa + ad[row -1]+1;
2618: v4 = aa + ad[row -2]+1;
2620: for (j=0; j<nz-1; j+=2) {
2621: i0 = vi[j];
2622: i1 = vi[j+1];
2623: tmp0 = tmps[i0];
2624: tmp1 = tmps[i1];
2625: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2626: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2627: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2628: sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2629: }
2630: if (j== nz-1) {
2631: tmp0 = tmps[vi[j]];
2632: sum1 -= v1[j] * tmp0;
2633: sum2 -= v2[j+1] * tmp0;
2634: sum3 -= v3[j+2] * tmp0;
2635: sum4 -= v4[j+3] * tmp0;
2636: }
2638: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2639: sum2 -= v2[0] * tmp0;
2640: sum3 -= v3[1] * tmp0;
2641: sum4 -= v4[2] * tmp0;
2642: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2643: sum3 -= v3[0] * tmp0;
2644: sum4 -= v4[1] * tmp0;
2645: tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2646: sum4 -= v4[0] * tmp0;
2647: x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2648: break;
2649: case 5:
2650: sum1 = tmp[row];
2651: sum2 = tmp[row -1];
2652: sum3 = tmp[row -2];
2653: sum4 = tmp[row -3];
2654: sum5 = tmp[row -4];
2655: v2 = aa + ad[row]+1;
2656: v3 = aa + ad[row -1]+1;
2657: v4 = aa + ad[row -2]+1;
2658: v5 = aa + ad[row -3]+1;
2659: for (j=0; j<nz-1; j+=2) {
2660: i0 = vi[j];
2661: i1 = vi[j+1];
2662: tmp0 = tmps[i0];
2663: tmp1 = tmps[i1];
2664: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2665: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2666: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2667: sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2668: sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2669: }
2670: if (j==nz-1) {
2671: tmp0 = tmps[vi[j]];
2672: sum1 -= v1[j] * tmp0;
2673: sum2 -= v2[j+1] * tmp0;
2674: sum3 -= v3[j+2] * tmp0;
2675: sum4 -= v4[j+3] * tmp0;
2676: sum5 -= v5[j+4] * tmp0;
2677: }
2679: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2680: sum2 -= v2[0] * tmp0;
2681: sum3 -= v3[1] * tmp0;
2682: sum4 -= v4[2] * tmp0;
2683: sum5 -= v5[3] * tmp0;
2684: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2685: sum3 -= v3[0] * tmp0;
2686: sum4 -= v4[1] * tmp0;
2687: sum5 -= v5[2] * tmp0;
2688: tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2689: sum4 -= v4[0] * tmp0;
2690: sum5 -= v5[1] * tmp0;
2691: tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2692: sum5 -= v5[0] * tmp0;
2693: x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2694: break;
2695: default:
2696: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2697: }
2698: }
2699: ISRestoreIndices(isrow,&rout);
2700: ISRestoreIndices(iscol,&cout);
2701: VecRestoreArrayRead(bb,&b);
2702: VecRestoreArray(xx,&x);
2703: PetscLogFlops(2.0*a->nz - A->cmap->n);
2704: return(0);
2705: }
2708: /*
2709: Makes a longer coloring[] array and calls the usual code with that
2710: */
2713: PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2714: {
2715: Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data;
2716: PetscErrorCode ierr;
2717: PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2718: PetscInt *colorused,i;
2719: ISColoringValue *newcolor;
2722: PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);
2723: /* loop over inodes, marking a color for each column*/
2724: row = 0;
2725: for (i=0; i<m; i++) {
2726: for (j=0; j<ns[i]; j++) {
2727: newcolor[row++] = coloring[i] + j*ncolors;
2728: }
2729: }
2731: /* eliminate unneeded colors */
2732: PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);
2733: PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));
2734: for (i=0; i<n; i++) {
2735: colorused[newcolor[i]] = 1;
2736: }
2738: for (i=1; i<5*ncolors; i++) {
2739: colorused[i] += colorused[i-1];
2740: }
2741: ncolors = colorused[5*ncolors-1];
2742: for (i=0; i<n; i++) {
2743: newcolor[i] = colorused[newcolor[i]]-1;
2744: }
2745: PetscFree(colorused);
2746: ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,iscoloring);
2747: PetscFree(coloring);
2748: return(0);
2749: }
2751: #include <petsc-private/kernels/blockinvert.h>
2755: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2756: {
2757: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2758: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
2759: MatScalar *ibdiag,*bdiag,work[25],*t;
2760: PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2761: const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5;
2762: const PetscScalar *xb, *b;
2763: PetscReal zeropivot = 1.0e-15, shift = 0.0;
2764: PetscErrorCode ierr;
2765: PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
2766: PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5];
2769: if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2770: if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2772: if (!a->inode.ibdiagvalid) {
2773: if (!a->inode.ibdiag) {
2774: /* calculate space needed for diagonal blocks */
2775: for (i=0; i<m; i++) {
2776: cnt += sizes[i]*sizes[i];
2777: }
2778: a->inode.bdiagsize = cnt;
2780: PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);
2781: }
2783: /* copy over the diagonal blocks and invert them */
2784: ibdiag = a->inode.ibdiag;
2785: bdiag = a->inode.bdiag;
2786: cnt = 0;
2787: for (i=0, row = 0; i<m; i++) {
2788: for (j=0; j<sizes[i]; j++) {
2789: for (k=0; k<sizes[i]; k++) {
2790: bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2791: }
2792: }
2793: PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));
2795: switch (sizes[i]) {
2796: case 1:
2797: /* Create matrix data structure */
2798: if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2799: ibdiag[cnt] = 1.0/ibdiag[cnt];
2800: break;
2801: case 2:
2802: PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift);
2803: break;
2804: case 3:
2805: PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift);
2806: break;
2807: case 4:
2808: PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift);
2809: break;
2810: case 5:
2811: PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);
2812: break;
2813: default:
2814: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2815: }
2816: cnt += sizes[i]*sizes[i];
2817: row += sizes[i];
2818: }
2819: a->inode.ibdiagvalid = PETSC_TRUE;
2820: }
2821: ibdiag = a->inode.ibdiag;
2822: bdiag = a->inode.bdiag;
2823: t = a->inode.ssor_work;
2825: VecGetArray(xx,&x);
2826: VecGetArrayRead(bb,&b);
2827: /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2828: if (flag & SOR_ZERO_INITIAL_GUESS) {
2829: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2831: for (i=0, row=0; i<m; i++) {
2832: sz = diag[row] - ii[row];
2833: v1 = a->a + ii[row];
2834: idx = a->j + ii[row];
2836: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2837: switch (sizes[i]) {
2838: case 1:
2840: sum1 = b[row];
2841: for (n = 0; n<sz-1; n+=2) {
2842: i1 = idx[0];
2843: i2 = idx[1];
2844: idx += 2;
2845: tmp0 = x[i1];
2846: tmp1 = x[i2];
2847: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2848: }
2850: if (n == sz-1) {
2851: tmp0 = x[*idx];
2852: sum1 -= *v1 * tmp0;
2853: }
2854: t[row] = sum1;
2855: x[row++] = sum1*(*ibdiag++);
2856: break;
2857: case 2:
2858: v2 = a->a + ii[row+1];
2859: sum1 = b[row];
2860: sum2 = b[row+1];
2861: for (n = 0; n<sz-1; n+=2) {
2862: i1 = idx[0];
2863: i2 = idx[1];
2864: idx += 2;
2865: tmp0 = x[i1];
2866: tmp1 = x[i2];
2867: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2868: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2869: }
2871: if (n == sz-1) {
2872: tmp0 = x[*idx];
2873: sum1 -= v1[0] * tmp0;
2874: sum2 -= v2[0] * tmp0;
2875: }
2876: t[row] = sum1;
2877: t[row+1] = sum2;
2878: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2879: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2880: ibdiag += 4;
2881: break;
2882: case 3:
2883: v2 = a->a + ii[row+1];
2884: v3 = a->a + ii[row+2];
2885: sum1 = b[row];
2886: sum2 = b[row+1];
2887: sum3 = b[row+2];
2888: for (n = 0; n<sz-1; n+=2) {
2889: i1 = idx[0];
2890: i2 = idx[1];
2891: idx += 2;
2892: tmp0 = x[i1];
2893: tmp1 = x[i2];
2894: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2895: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2896: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2897: }
2899: if (n == sz-1) {
2900: tmp0 = x[*idx];
2901: sum1 -= v1[0] * tmp0;
2902: sum2 -= v2[0] * tmp0;
2903: sum3 -= v3[0] * tmp0;
2904: }
2905: t[row] = sum1;
2906: t[row+1] = sum2;
2907: t[row+2] = sum3;
2908: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2909: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2910: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2911: ibdiag += 9;
2912: break;
2913: case 4:
2914: v2 = a->a + ii[row+1];
2915: v3 = a->a + ii[row+2];
2916: v4 = a->a + ii[row+3];
2917: sum1 = b[row];
2918: sum2 = b[row+1];
2919: sum3 = b[row+2];
2920: sum4 = b[row+3];
2921: for (n = 0; n<sz-1; n+=2) {
2922: i1 = idx[0];
2923: i2 = idx[1];
2924: idx += 2;
2925: tmp0 = x[i1];
2926: tmp1 = x[i2];
2927: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2928: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2929: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2930: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2931: }
2933: if (n == sz-1) {
2934: tmp0 = x[*idx];
2935: sum1 -= v1[0] * tmp0;
2936: sum2 -= v2[0] * tmp0;
2937: sum3 -= v3[0] * tmp0;
2938: sum4 -= v4[0] * tmp0;
2939: }
2940: t[row] = sum1;
2941: t[row+1] = sum2;
2942: t[row+2] = sum3;
2943: t[row+3] = sum4;
2944: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2945: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2946: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2947: x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2948: ibdiag += 16;
2949: break;
2950: case 5:
2951: v2 = a->a + ii[row+1];
2952: v3 = a->a + ii[row+2];
2953: v4 = a->a + ii[row+3];
2954: v5 = a->a + ii[row+4];
2955: sum1 = b[row];
2956: sum2 = b[row+1];
2957: sum3 = b[row+2];
2958: sum4 = b[row+3];
2959: sum5 = b[row+4];
2960: for (n = 0; n<sz-1; n+=2) {
2961: i1 = idx[0];
2962: i2 = idx[1];
2963: idx += 2;
2964: tmp0 = x[i1];
2965: tmp1 = x[i2];
2966: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2967: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2968: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2969: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2970: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2971: }
2973: if (n == sz-1) {
2974: tmp0 = x[*idx];
2975: sum1 -= v1[0] * tmp0;
2976: sum2 -= v2[0] * tmp0;
2977: sum3 -= v3[0] * tmp0;
2978: sum4 -= v4[0] * tmp0;
2979: sum5 -= v5[0] * tmp0;
2980: }
2981: t[row] = sum1;
2982: t[row+1] = sum2;
2983: t[row+2] = sum3;
2984: t[row+3] = sum4;
2985: t[row+4] = sum5;
2986: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2987: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2988: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2989: x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2990: x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2991: ibdiag += 25;
2992: break;
2993: default:
2994: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2995: }
2996: }
2998: xb = t;
2999: PetscLogFlops(a->nz);
3000: } else xb = b;
3001: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3003: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3004: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3005: ibdiag -= sizes[i]*sizes[i];
3006: sz = ii[row+1] - diag[row] - 1;
3007: v1 = a->a + diag[row] + 1;
3008: idx = a->j + diag[row] + 1;
3010: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3011: switch (sizes[i]) {
3012: case 1:
3014: sum1 = xb[row];
3015: for (n = 0; n<sz-1; n+=2) {
3016: i1 = idx[0];
3017: i2 = idx[1];
3018: idx += 2;
3019: tmp0 = x[i1];
3020: tmp1 = x[i2];
3021: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3022: }
3024: if (n == sz-1) {
3025: tmp0 = x[*idx];
3026: sum1 -= *v1*tmp0;
3027: }
3028: x[row--] = sum1*(*ibdiag);
3029: break;
3031: case 2:
3033: sum1 = xb[row];
3034: sum2 = xb[row-1];
3035: /* note that sum1 is associated with the second of the two rows */
3036: v2 = a->a + diag[row-1] + 2;
3037: for (n = 0; n<sz-1; n+=2) {
3038: i1 = idx[0];
3039: i2 = idx[1];
3040: idx += 2;
3041: tmp0 = x[i1];
3042: tmp1 = x[i2];
3043: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3044: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3045: }
3047: if (n == sz-1) {
3048: tmp0 = x[*idx];
3049: sum1 -= *v1*tmp0;
3050: sum2 -= *v2*tmp0;
3051: }
3052: x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3053: x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3054: break;
3055: case 3:
3057: sum1 = xb[row];
3058: sum2 = xb[row-1];
3059: sum3 = xb[row-2];
3060: v2 = a->a + diag[row-1] + 2;
3061: v3 = a->a + diag[row-2] + 3;
3062: for (n = 0; n<sz-1; n+=2) {
3063: i1 = idx[0];
3064: i2 = idx[1];
3065: idx += 2;
3066: tmp0 = x[i1];
3067: tmp1 = x[i2];
3068: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3069: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3070: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3071: }
3073: if (n == sz-1) {
3074: tmp0 = x[*idx];
3075: sum1 -= *v1*tmp0;
3076: sum2 -= *v2*tmp0;
3077: sum3 -= *v3*tmp0;
3078: }
3079: x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3080: x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3081: x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3082: break;
3083: case 4:
3085: sum1 = xb[row];
3086: sum2 = xb[row-1];
3087: sum3 = xb[row-2];
3088: sum4 = xb[row-3];
3089: v2 = a->a + diag[row-1] + 2;
3090: v3 = a->a + diag[row-2] + 3;
3091: v4 = a->a + diag[row-3] + 4;
3092: for (n = 0; n<sz-1; n+=2) {
3093: i1 = idx[0];
3094: i2 = idx[1];
3095: idx += 2;
3096: tmp0 = x[i1];
3097: tmp1 = x[i2];
3098: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3099: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3100: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3101: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3102: }
3104: if (n == sz-1) {
3105: tmp0 = x[*idx];
3106: sum1 -= *v1*tmp0;
3107: sum2 -= *v2*tmp0;
3108: sum3 -= *v3*tmp0;
3109: sum4 -= *v4*tmp0;
3110: }
3111: x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3112: x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3113: x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3114: x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3115: break;
3116: case 5:
3118: sum1 = xb[row];
3119: sum2 = xb[row-1];
3120: sum3 = xb[row-2];
3121: sum4 = xb[row-3];
3122: sum5 = xb[row-4];
3123: v2 = a->a + diag[row-1] + 2;
3124: v3 = a->a + diag[row-2] + 3;
3125: v4 = a->a + diag[row-3] + 4;
3126: v5 = a->a + diag[row-4] + 5;
3127: for (n = 0; n<sz-1; n+=2) {
3128: i1 = idx[0];
3129: i2 = idx[1];
3130: idx += 2;
3131: tmp0 = x[i1];
3132: tmp1 = x[i2];
3133: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3134: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3135: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3136: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3137: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3138: }
3140: if (n == sz-1) {
3141: tmp0 = x[*idx];
3142: sum1 -= *v1*tmp0;
3143: sum2 -= *v2*tmp0;
3144: sum3 -= *v3*tmp0;
3145: sum4 -= *v4*tmp0;
3146: sum5 -= *v5*tmp0;
3147: }
3148: x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3149: x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3150: x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3151: x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3152: x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3153: break;
3154: default:
3155: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3156: }
3157: }
3159: PetscLogFlops(a->nz);
3160: }
3161: its--;
3162: }
3163: while (its--) {
3165: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3166: ibdiag = a->inode.ibdiag;
3168: for (i=0, row=0; i<m; i++) {
3169: sz = ii[row + 1] - ii[row];
3170: v1 = a->a + ii[row];
3171: idx = a->j + ii[row];
3173: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3174: switch (sizes[i]) {
3175: case 1:
3177: sum1 = b[row];
3178: for (n = 0; n<sz-1; n+=2) {
3179: i1 = idx[0];
3180: i2 = idx[1];
3181: idx += 2;
3182: tmp0 = x[i1];
3183: tmp1 = x[i2];
3184: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3185: }
3187: if (n == sz-1) {
3188: tmp0 = x[*idx];
3189: sum1 -= *v1 * tmp0;
3190: }
3192: /* in MatSOR_SeqAIJ this line would be
3193: *
3194: * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3195: *
3196: * but omega == 1, so this becomes
3197: *
3198: * x[row] = (sum1+(*bdiag++)*x[row])*(*ibdiag++);
3199: *
3200: * but bdiag and ibdiag cancel each other, so we can change this
3201: * to adding sum1*(*ibdiag++). We can skip bdiag for the larger
3202: * block sizes as well
3203: */
3204: x[row++] += sum1*(*ibdiag++);
3205: break;
3206: case 2:
3207: v2 = a->a + ii[row+1];
3208: sum1 = b[row];
3209: sum2 = b[row+1];
3210: for (n = 0; n<sz-1; n+=2) {
3211: i1 = idx[0];
3212: i2 = idx[1];
3213: idx += 2;
3214: tmp0 = x[i1];
3215: tmp1 = x[i2];
3216: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3217: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3218: }
3220: if (n == sz-1) {
3221: tmp0 = x[*idx];
3222: sum1 -= v1[0] * tmp0;
3223: sum2 -= v2[0] * tmp0;
3224: }
3225: x[row++] += sum1*ibdiag[0] + sum2*ibdiag[2];
3226: x[row++] += sum1*ibdiag[1] + sum2*ibdiag[3];
3227: ibdiag += 4;
3228: break;
3229: case 3:
3230: v2 = a->a + ii[row+1];
3231: v3 = a->a + ii[row+2];
3232: sum1 = b[row];
3233: sum2 = b[row+1];
3234: sum3 = b[row+2];
3235: for (n = 0; n<sz-1; n+=2) {
3236: i1 = idx[0];
3237: i2 = idx[1];
3238: idx += 2;
3239: tmp0 = x[i1];
3240: tmp1 = x[i2];
3241: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3242: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3243: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3244: }
3246: if (n == sz-1) {
3247: tmp0 = x[*idx];
3248: sum1 -= v1[0] * tmp0;
3249: sum2 -= v2[0] * tmp0;
3250: sum3 -= v3[0] * tmp0;
3251: }
3252: x[row++] += sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3253: x[row++] += sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3254: x[row++] += sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3255: ibdiag += 9;
3256: break;
3257: case 4:
3258: v2 = a->a + ii[row+1];
3259: v3 = a->a + ii[row+2];
3260: v4 = a->a + ii[row+3];
3261: sum1 = b[row];
3262: sum2 = b[row+1];
3263: sum3 = b[row+2];
3264: sum4 = b[row+3];
3265: for (n = 0; n<sz-1; n+=2) {
3266: i1 = idx[0];
3267: i2 = idx[1];
3268: idx += 2;
3269: tmp0 = x[i1];
3270: tmp1 = x[i2];
3271: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3272: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3273: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3274: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3275: }
3277: if (n == sz-1) {
3278: tmp0 = x[*idx];
3279: sum1 -= v1[0] * tmp0;
3280: sum2 -= v2[0] * tmp0;
3281: sum3 -= v3[0] * tmp0;
3282: sum4 -= v4[0] * tmp0;
3283: }
3284: x[row++] += sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3285: x[row++] += sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3286: x[row++] += sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3287: x[row++] += sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3288: ibdiag += 16;
3289: break;
3290: case 5:
3291: v2 = a->a + ii[row+1];
3292: v3 = a->a + ii[row+2];
3293: v4 = a->a + ii[row+3];
3294: v5 = a->a + ii[row+4];
3295: sum1 = b[row];
3296: sum2 = b[row+1];
3297: sum3 = b[row+2];
3298: sum4 = b[row+3];
3299: sum5 = b[row+4];
3300: for (n = 0; n<sz-1; n+=2) {
3301: i1 = idx[0];
3302: i2 = idx[1];
3303: idx += 2;
3304: tmp0 = x[i1];
3305: tmp1 = x[i2];
3306: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3307: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3308: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3309: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3310: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3311: }
3313: if (n == sz-1) {
3314: tmp0 = x[*idx];
3315: sum1 -= v1[0] * tmp0;
3316: sum2 -= v2[0] * tmp0;
3317: sum3 -= v3[0] * tmp0;
3318: sum4 -= v4[0] * tmp0;
3319: sum5 -= v5[0] * tmp0;
3320: }
3321: x[row++] += sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3322: x[row++] += sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3323: x[row++] += sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3324: x[row++] += sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3325: x[row++] += sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3326: ibdiag += 25;
3327: break;
3328: default:
3329: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3330: }
3331: }
3333: PetscLogFlops(2.0*a->nz);
3334: }
3335: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3337: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3338: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3339: ibdiag -= sizes[i]*sizes[i];
3340: sz = ii[row+1] - ii[row];
3341: v1 = a->a + ii[row];
3342: idx = a->j + ii[row];
3344: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3345: switch (sizes[i]) {
3346: case 1:
3348: sum1 = b[row];
3349: for (n = 0; n<sz-1; n+=2) {
3350: i1 = idx[0];
3351: i2 = idx[1];
3352: idx += 2;
3353: tmp0 = x[i1];
3354: tmp1 = x[i2];
3355: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3356: }
3358: if (n == sz-1) {
3359: tmp0 = x[*idx];
3360: sum1 -= *v1*tmp0;
3361: }
3362: x[row--] += sum1*(*ibdiag);
3363: break;
3365: case 2:
3367: sum1 = b[row];
3368: sum2 = b[row-1];
3369: /* note that sum1 is associated with the second of the two rows */
3370: v2 = a->a + ii[row - 1];
3371: for (n = 0; n<sz-1; n+=2) {
3372: i1 = idx[0];
3373: i2 = idx[1];
3374: idx += 2;
3375: tmp0 = x[i1];
3376: tmp1 = x[i2];
3377: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3378: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3379: }
3381: if (n == sz-1) {
3382: tmp0 = x[*idx];
3383: sum1 -= *v1*tmp0;
3384: sum2 -= *v2*tmp0;
3385: }
3386: x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3387: x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3388: break;
3389: case 3:
3391: sum1 = b[row];
3392: sum2 = b[row-1];
3393: sum3 = b[row-2];
3394: v2 = a->a + ii[row-1];
3395: v3 = a->a + ii[row-2];
3396: for (n = 0; n<sz-1; n+=2) {
3397: i1 = idx[0];
3398: i2 = idx[1];
3399: idx += 2;
3400: tmp0 = x[i1];
3401: tmp1 = x[i2];
3402: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3403: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3404: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3405: }
3407: if (n == sz-1) {
3408: tmp0 = x[*idx];
3409: sum1 -= *v1*tmp0;
3410: sum2 -= *v2*tmp0;
3411: sum3 -= *v3*tmp0;
3412: }
3413: x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3414: x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3415: x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3416: break;
3417: case 4:
3419: sum1 = b[row];
3420: sum2 = b[row-1];
3421: sum3 = b[row-2];
3422: sum4 = b[row-3];
3423: v2 = a->a + ii[row-1];
3424: v3 = a->a + ii[row-2];
3425: v4 = a->a + ii[row-3];
3426: for (n = 0; n<sz-1; n+=2) {
3427: i1 = idx[0];
3428: i2 = idx[1];
3429: idx += 2;
3430: tmp0 = x[i1];
3431: tmp1 = x[i2];
3432: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3433: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3434: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3435: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3436: }
3438: if (n == sz-1) {
3439: tmp0 = x[*idx];
3440: sum1 -= *v1*tmp0;
3441: sum2 -= *v2*tmp0;
3442: sum3 -= *v3*tmp0;
3443: sum4 -= *v4*tmp0;
3444: }
3445: x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3446: x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3447: x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3448: x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3449: break;
3450: case 5:
3452: sum1 = b[row];
3453: sum2 = b[row-1];
3454: sum3 = b[row-2];
3455: sum4 = b[row-3];
3456: sum5 = b[row-4];
3457: v2 = a->a + ii[row-1];
3458: v3 = a->a + ii[row-2];
3459: v4 = a->a + ii[row-3];
3460: v5 = a->a + ii[row-4];
3461: for (n = 0; n<sz-1; n+=2) {
3462: i1 = idx[0];
3463: i2 = idx[1];
3464: idx += 2;
3465: tmp0 = x[i1];
3466: tmp1 = x[i2];
3467: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3468: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3469: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3470: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3471: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3472: }
3474: if (n == sz-1) {
3475: tmp0 = x[*idx];
3476: sum1 -= *v1*tmp0;
3477: sum2 -= *v2*tmp0;
3478: sum3 -= *v3*tmp0;
3479: sum4 -= *v4*tmp0;
3480: sum5 -= *v5*tmp0;
3481: }
3482: x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3483: x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3484: x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3485: x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3486: x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3487: break;
3488: default:
3489: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3490: }
3491: }
3493: PetscLogFlops(2.0*a->nz);
3494: }
3495: }
3496: if (flag & SOR_EISENSTAT) {
3497: /*
3498: Apply (U + D)^-1 where D is now the block diagonal
3499: */
3500: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3501: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3502: ibdiag -= sizes[i]*sizes[i];
3503: sz = ii[row+1] - diag[row] - 1;
3504: v1 = a->a + diag[row] + 1;
3505: idx = a->j + diag[row] + 1;
3506: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3507: switch (sizes[i]) {
3508: case 1:
3510: sum1 = b[row];
3511: for (n = 0; n<sz-1; n+=2) {
3512: i1 = idx[0];
3513: i2 = idx[1];
3514: idx += 2;
3515: tmp0 = x[i1];
3516: tmp1 = x[i2];
3517: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3518: }
3520: if (n == sz-1) {
3521: tmp0 = x[*idx];
3522: sum1 -= *v1*tmp0;
3523: }
3524: x[row] = sum1*(*ibdiag);row--;
3525: break;
3527: case 2:
3529: sum1 = b[row];
3530: sum2 = b[row-1];
3531: /* note that sum1 is associated with the second of the two rows */
3532: v2 = a->a + diag[row-1] + 2;
3533: for (n = 0; n<sz-1; n+=2) {
3534: i1 = idx[0];
3535: i2 = idx[1];
3536: idx += 2;
3537: tmp0 = x[i1];
3538: tmp1 = x[i2];
3539: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3540: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3541: }
3543: if (n == sz-1) {
3544: tmp0 = x[*idx];
3545: sum1 -= *v1*tmp0;
3546: sum2 -= *v2*tmp0;
3547: }
3548: x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
3549: x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3550: row -= 2;
3551: break;
3552: case 3:
3554: sum1 = b[row];
3555: sum2 = b[row-1];
3556: sum3 = b[row-2];
3557: v2 = a->a + diag[row-1] + 2;
3558: v3 = a->a + diag[row-2] + 3;
3559: for (n = 0; n<sz-1; n+=2) {
3560: i1 = idx[0];
3561: i2 = idx[1];
3562: idx += 2;
3563: tmp0 = x[i1];
3564: tmp1 = x[i2];
3565: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3566: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3567: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3568: }
3570: if (n == sz-1) {
3571: tmp0 = x[*idx];
3572: sum1 -= *v1*tmp0;
3573: sum2 -= *v2*tmp0;
3574: sum3 -= *v3*tmp0;
3575: }
3576: x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3577: x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3578: x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3579: row -= 3;
3580: break;
3581: case 4:
3583: sum1 = b[row];
3584: sum2 = b[row-1];
3585: sum3 = b[row-2];
3586: sum4 = b[row-3];
3587: v2 = a->a + diag[row-1] + 2;
3588: v3 = a->a + diag[row-2] + 3;
3589: v4 = a->a + diag[row-3] + 4;
3590: for (n = 0; n<sz-1; n+=2) {
3591: i1 = idx[0];
3592: i2 = idx[1];
3593: idx += 2;
3594: tmp0 = x[i1];
3595: tmp1 = x[i2];
3596: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3597: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3598: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3599: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3600: }
3602: if (n == sz-1) {
3603: tmp0 = x[*idx];
3604: sum1 -= *v1*tmp0;
3605: sum2 -= *v2*tmp0;
3606: sum3 -= *v3*tmp0;
3607: sum4 -= *v4*tmp0;
3608: }
3609: x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3610: x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3611: x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3612: x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3613: row -= 4;
3614: break;
3615: case 5:
3617: sum1 = b[row];
3618: sum2 = b[row-1];
3619: sum3 = b[row-2];
3620: sum4 = b[row-3];
3621: sum5 = b[row-4];
3622: v2 = a->a + diag[row-1] + 2;
3623: v3 = a->a + diag[row-2] + 3;
3624: v4 = a->a + diag[row-3] + 4;
3625: v5 = a->a + diag[row-4] + 5;
3626: for (n = 0; n<sz-1; n+=2) {
3627: i1 = idx[0];
3628: i2 = idx[1];
3629: idx += 2;
3630: tmp0 = x[i1];
3631: tmp1 = x[i2];
3632: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3633: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3634: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3635: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3636: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3637: }
3639: if (n == sz-1) {
3640: tmp0 = x[*idx];
3641: sum1 -= *v1*tmp0;
3642: sum2 -= *v2*tmp0;
3643: sum3 -= *v3*tmp0;
3644: sum4 -= *v4*tmp0;
3645: sum5 -= *v5*tmp0;
3646: }
3647: x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3648: x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3649: x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3650: x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3651: x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3652: row -= 5;
3653: break;
3654: default:
3655: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3656: }
3657: }
3658: PetscLogFlops(a->nz);
3660: /*
3661: t = b - D x where D is the block diagonal
3662: */
3663: cnt = 0;
3664: for (i=0, row=0; i<m; i++) {
3665: switch (sizes[i]) {
3666: case 1:
3667: t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3668: break;
3669: case 2:
3670: x1 = x[row]; x2 = x[row+1];
3671: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3672: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3673: t[row] = b[row] - tmp1;
3674: t[row+1] = b[row+1] - tmp2; row += 2;
3675: cnt += 4;
3676: break;
3677: case 3:
3678: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2];
3679: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3680: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3681: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3682: t[row] = b[row] - tmp1;
3683: t[row+1] = b[row+1] - tmp2;
3684: t[row+2] = b[row+2] - tmp3; row += 3;
3685: cnt += 9;
3686: break;
3687: case 4:
3688: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3689: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3690: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3691: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3692: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3693: t[row] = b[row] - tmp1;
3694: t[row+1] = b[row+1] - tmp2;
3695: t[row+2] = b[row+2] - tmp3;
3696: t[row+3] = b[row+3] - tmp4; row += 4;
3697: cnt += 16;
3698: break;
3699: case 5:
3700: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3701: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3702: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3703: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3704: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3705: tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3706: t[row] = b[row] - tmp1;
3707: t[row+1] = b[row+1] - tmp2;
3708: t[row+2] = b[row+2] - tmp3;
3709: t[row+3] = b[row+3] - tmp4;
3710: t[row+4] = b[row+4] - tmp5;row += 5;
3711: cnt += 25;
3712: break;
3713: default:
3714: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3715: }
3716: }
3717: PetscLogFlops(m);
3721: /*
3722: Apply (L + D)^-1 where D is the block diagonal
3723: */
3724: for (i=0, row=0; i<m; i++) {
3725: sz = diag[row] - ii[row];
3726: v1 = a->a + ii[row];
3727: idx = a->j + ii[row];
3728: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3729: switch (sizes[i]) {
3730: case 1:
3732: sum1 = t[row];
3733: for (n = 0; n<sz-1; n+=2) {
3734: i1 = idx[0];
3735: i2 = idx[1];
3736: idx += 2;
3737: tmp0 = t[i1];
3738: tmp1 = t[i2];
3739: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3740: }
3742: if (n == sz-1) {
3743: tmp0 = t[*idx];
3744: sum1 -= *v1 * tmp0;
3745: }
3746: x[row] += t[row] = sum1*(*ibdiag++); row++;
3747: break;
3748: case 2:
3749: v2 = a->a + ii[row+1];
3750: sum1 = t[row];
3751: sum2 = t[row+1];
3752: for (n = 0; n<sz-1; n+=2) {
3753: i1 = idx[0];
3754: i2 = idx[1];
3755: idx += 2;
3756: tmp0 = t[i1];
3757: tmp1 = t[i2];
3758: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3759: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3760: }
3762: if (n == sz-1) {
3763: tmp0 = t[*idx];
3764: sum1 -= v1[0] * tmp0;
3765: sum2 -= v2[0] * tmp0;
3766: }
3767: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3768: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3769: ibdiag += 4; row += 2;
3770: break;
3771: case 3:
3772: v2 = a->a + ii[row+1];
3773: v3 = a->a + ii[row+2];
3774: sum1 = t[row];
3775: sum2 = t[row+1];
3776: sum3 = t[row+2];
3777: for (n = 0; n<sz-1; n+=2) {
3778: i1 = idx[0];
3779: i2 = idx[1];
3780: idx += 2;
3781: tmp0 = t[i1];
3782: tmp1 = t[i2];
3783: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3784: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3785: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3786: }
3788: if (n == sz-1) {
3789: tmp0 = t[*idx];
3790: sum1 -= v1[0] * tmp0;
3791: sum2 -= v2[0] * tmp0;
3792: sum3 -= v3[0] * tmp0;
3793: }
3794: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3795: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3796: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3797: ibdiag += 9; row += 3;
3798: break;
3799: case 4:
3800: v2 = a->a + ii[row+1];
3801: v3 = a->a + ii[row+2];
3802: v4 = a->a + ii[row+3];
3803: sum1 = t[row];
3804: sum2 = t[row+1];
3805: sum3 = t[row+2];
3806: sum4 = t[row+3];
3807: for (n = 0; n<sz-1; n+=2) {
3808: i1 = idx[0];
3809: i2 = idx[1];
3810: idx += 2;
3811: tmp0 = t[i1];
3812: tmp1 = t[i2];
3813: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3814: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3815: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3816: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3817: }
3819: if (n == sz-1) {
3820: tmp0 = t[*idx];
3821: sum1 -= v1[0] * tmp0;
3822: sum2 -= v2[0] * tmp0;
3823: sum3 -= v3[0] * tmp0;
3824: sum4 -= v4[0] * tmp0;
3825: }
3826: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3827: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3828: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3829: x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3830: ibdiag += 16; row += 4;
3831: break;
3832: case 5:
3833: v2 = a->a + ii[row+1];
3834: v3 = a->a + ii[row+2];
3835: v4 = a->a + ii[row+3];
3836: v5 = a->a + ii[row+4];
3837: sum1 = t[row];
3838: sum2 = t[row+1];
3839: sum3 = t[row+2];
3840: sum4 = t[row+3];
3841: sum5 = t[row+4];
3842: for (n = 0; n<sz-1; n+=2) {
3843: i1 = idx[0];
3844: i2 = idx[1];
3845: idx += 2;
3846: tmp0 = t[i1];
3847: tmp1 = t[i2];
3848: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3849: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3850: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3851: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3852: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3853: }
3855: if (n == sz-1) {
3856: tmp0 = t[*idx];
3857: sum1 -= v1[0] * tmp0;
3858: sum2 -= v2[0] * tmp0;
3859: sum3 -= v3[0] * tmp0;
3860: sum4 -= v4[0] * tmp0;
3861: sum5 -= v5[0] * tmp0;
3862: }
3863: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3864: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3865: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3866: x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3867: x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3868: ibdiag += 25; row += 5;
3869: break;
3870: default:
3871: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3872: }
3873: }
3874: PetscLogFlops(a->nz);
3875: }
3876: VecRestoreArray(xx,&x);
3877: VecRestoreArrayRead(bb,&b);
3878: return(0);
3879: }
3883: PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3884: {
3885: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3886: PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3887: const MatScalar *bdiag = a->inode.bdiag;
3888: const PetscScalar *b;
3889: PetscErrorCode ierr;
3890: PetscInt m = a->inode.node_count,cnt = 0,i,row;
3891: const PetscInt *sizes = a->inode.size;
3894: VecGetArray(xx,&x);
3895: VecGetArrayRead(bb,&b);
3896: cnt = 0;
3897: for (i=0, row=0; i<m; i++) {
3898: switch (sizes[i]) {
3899: case 1:
3900: x[row] = b[row]*bdiag[cnt++];row++;
3901: break;
3902: case 2:
3903: x1 = b[row]; x2 = b[row+1];
3904: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3905: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3906: x[row++] = tmp1;
3907: x[row++] = tmp2;
3908: cnt += 4;
3909: break;
3910: case 3:
3911: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2];
3912: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3913: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3914: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3915: x[row++] = tmp1;
3916: x[row++] = tmp2;
3917: x[row++] = tmp3;
3918: cnt += 9;
3919: break;
3920: case 4:
3921: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
3922: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3923: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3924: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3925: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3926: x[row++] = tmp1;
3927: x[row++] = tmp2;
3928: x[row++] = tmp3;
3929: x[row++] = tmp4;
3930: cnt += 16;
3931: break;
3932: case 5:
3933: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
3934: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3935: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3936: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3937: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3938: tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3939: x[row++] = tmp1;
3940: x[row++] = tmp2;
3941: x[row++] = tmp3;
3942: x[row++] = tmp4;
3943: x[row++] = tmp5;
3944: cnt += 25;
3945: break;
3946: default:
3947: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3948: }
3949: }
3950: PetscLogFlops(2*cnt);
3951: VecRestoreArray(xx,&x);
3952: VecRestoreArrayRead(bb,&b);
3953: return(0);
3954: }
3956: /*
3957: samestructure indicates that the matrix has not changed its nonzero structure so we
3958: do not need to recompute the inodes
3959: */
3962: PetscErrorCode Mat_CheckInode(Mat A,PetscBool samestructure)
3963: {
3964: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3966: PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
3967: PetscBool flag;
3970: if (!a->inode.use) return(0);
3971: if (a->inode.checked && samestructure) return(0);
3974: m = A->rmap->n;
3975: if (a->inode.size) ns = a->inode.size;
3976: else {
3977: PetscMalloc((m+1)*sizeof(PetscInt),&ns);
3978: }
3980: i = 0;
3981: node_count = 0;
3982: idx = a->j;
3983: ii = a->i;
3984: while (i < m) { /* For each row */
3985: nzx = ii[i+1] - ii[i]; /* Number of nonzeros */
3986: /* Limits the number of elements in a node to 'a->inode.limit' */
3987: for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3988: nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */
3989: if (nzy != nzx) break;
3990: idy += nzx; /* Same nonzero pattern */
3991: PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);
3992: if (!flag) break;
3993: }
3994: ns[node_count++] = blk_size;
3995: idx += blk_size*nzx;
3996: i = j;
3997: }
3998: /* If not enough inodes found,, do not use inode version of the routines */
3999: if (!m || node_count > .8*m) {
4000: PetscFree(ns);
4002: a->inode.node_count = 0;
4003: a->inode.size = NULL;
4004: a->inode.use = PETSC_FALSE;
4005: A->ops->mult = MatMult_SeqAIJ;
4006: A->ops->sor = MatSOR_SeqAIJ;
4007: A->ops->multadd = MatMultAdd_SeqAIJ;
4008: A->ops->getrowij = MatGetRowIJ_SeqAIJ;
4009: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ;
4010: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ;
4011: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ;
4012: A->ops->coloringpatch = 0;
4013: A->ops->multdiagonalblock = 0;
4015: PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4016: } else {
4017: if (!A->factortype) {
4018: A->ops->mult = MatMult_SeqAIJ_Inode;
4019: A->ops->sor = MatSOR_SeqAIJ_Inode;
4020: A->ops->multadd = MatMultAdd_SeqAIJ_Inode;
4021: A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4022: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4023: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4024: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4025: A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4026: A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4027: } else {
4028: A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4029: }
4030: a->inode.node_count = node_count;
4031: a->inode.size = ns;
4032: PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4033: }
4034: a->inode.checked = PETSC_TRUE;
4035: return(0);
4036: }
4040: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4041: {
4042: Mat B =*C;
4043: Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4045: PetscInt m=A->rmap->n;
4048: c->inode.use = a->inode.use;
4049: c->inode.limit = a->inode.limit;
4050: c->inode.max_limit = a->inode.max_limit;
4051: if (a->inode.size) {
4052: PetscMalloc((m+1)*sizeof(PetscInt),&c->inode.size);
4053: c->inode.node_count = a->inode.node_count;
4054: PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));
4055: /* note the table of functions below should match that in Mat_CheckInode() */
4056: if (!B->factortype) {
4057: B->ops->mult = MatMult_SeqAIJ_Inode;
4058: B->ops->sor = MatSOR_SeqAIJ_Inode;
4059: B->ops->multadd = MatMultAdd_SeqAIJ_Inode;
4060: B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4061: B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4062: B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4063: B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4064: B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4065: B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4066: } else {
4067: B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4068: }
4069: } else {
4070: c->inode.size = 0;
4071: c->inode.node_count = 0;
4072: }
4073: c->inode.ibdiagvalid = PETSC_FALSE;
4074: c->inode.ibdiag = 0;
4075: c->inode.bdiag = 0;
4076: return(0);
4077: }
4081: PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,PetscInt *ai,PetscInt *aj,PetscInt *adiag,PetscInt row)
4082: {
4083: PetscInt k, *vi;
4086: vi = aj + ai[row];
4087: for (k=0; k<nzl; k++) cols[k] = vi[k];
4088: vi = aj + adiag[row];
4089: cols[nzl] = vi[0];
4090: vi = aj + adiag[row+1]+1;
4091: for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4092: return(0);
4093: }
4094: /*
4095: Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix.
4096: Modified from Mat_CheckInode().
4098: Input Parameters:
4099: + Mat A - ILU or LU matrix factor
4100: - samestructure - TRUE indicates that the matrix has not changed its nonzero structure so we
4101: do not need to recompute the inodes
4102: */
4105: PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscBool samestructure)
4106: {
4107: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4109: PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4110: PetscInt *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag;
4111: PetscBool flag;
4114: if (!a->inode.use) return(0);
4115: if (a->inode.checked && samestructure) return(0);
4117: m = A->rmap->n;
4118: if (a->inode.size) ns = a->inode.size;
4119: else {
4120: PetscMalloc((m+1)*sizeof(PetscInt),&ns);
4121: }
4123: i = 0;
4124: node_count = 0;
4125: while (i < m) { /* For each row */
4126: nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */
4127: nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4128: nzx = nzl1 + nzu1 + 1;
4129: PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);
4130: MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4132: /* Limits the number of elements in a node to 'a->inode.limit' */
4133: for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4134: nzl2 = ai[j+1] - ai[j];
4135: nzu2 = adiag[j] - adiag[j+1] - 1;
4136: nzy = nzl2 + nzu2 + 1;
4137: if (nzy != nzx) break;
4138: PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);
4139: MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
4140: PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);
4141: if (!flag) {PetscFree(cols2);break;}
4142: PetscFree(cols2);
4143: }
4144: ns[node_count++] = blk_size;
4145: PetscFree(cols1);
4146: i = j;
4147: }
4148: /* If not enough inodes found,, do not use inode version of the routines */
4149: if (!m || node_count > .8*m) {
4150: PetscFree(ns);
4152: a->inode.node_count = 0;
4153: a->inode.size = NULL;
4154: a->inode.use = PETSC_FALSE;
4156: PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4157: } else {
4158: A->ops->mult = 0;
4159: A->ops->sor = 0;
4160: A->ops->multadd = 0;
4161: A->ops->getrowij = 0;
4162: A->ops->restorerowij = 0;
4163: A->ops->getcolumnij = 0;
4164: A->ops->restorecolumnij = 0;
4165: A->ops->coloringpatch = 0;
4166: A->ops->multdiagonalblock = 0;
4167: A->ops->solve = MatSolve_SeqAIJ_Inode;
4168: a->inode.node_count = node_count;
4169: a->inode.size = ns;
4171: PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4172: }
4173: a->inode.checked = PETSC_TRUE;
4174: return(0);
4175: }
4179: PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4180: {
4181: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4184: a->inode.ibdiagvalid = PETSC_FALSE;
4185: return(0);
4186: }
4188: /*
4189: This is really ugly. if inodes are used this replaces the
4190: permutations with ones that correspond to rows/cols of the matrix
4191: rather then inode blocks
4192: */
4195: PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4196: {
4200: PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));
4201: return(0);
4202: }
4206: PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4207: {
4208: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4210: PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4211: const PetscInt *ridx,*cidx;
4212: PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx;
4213: PetscInt nslim_col,*ns_col;
4214: IS ris = *rperm,cis = *cperm;
4217: if (!a->inode.size) return(0); /* no inodes so return */
4218: if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */
4220: Mat_CreateColInode(A,&nslim_col,&ns_col);
4221: PetscMalloc((((nslim_row>nslim_col) ? nslim_row : nslim_col)+1)*sizeof(PetscInt),&tns);
4222: PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);
4224: ISGetIndices(ris,&ridx);
4225: ISGetIndices(cis,&cidx);
4227: /* Form the inode structure for the rows of permuted matric using inv perm*/
4228: for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4230: /* Construct the permutations for rows*/
4231: for (i=0,row = 0; i<nslim_row; ++i) {
4232: indx = ridx[i];
4233: start_val = tns[indx];
4234: end_val = tns[indx + 1];
4235: for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4236: }
4238: /* Form the inode structure for the columns of permuted matrix using inv perm*/
4239: for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4241: /* Construct permutations for columns */
4242: for (i=0,col=0; i<nslim_col; ++i) {
4243: indx = cidx[i];
4244: start_val = tns[indx];
4245: end_val = tns[indx + 1];
4246: for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4247: }
4249: ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);
4250: ISSetPermutation(*rperm);
4251: ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);
4252: ISSetPermutation(*cperm);
4254: ISRestoreIndices(ris,&ridx);
4255: ISRestoreIndices(cis,&cidx);
4257: PetscFree(ns_col);
4258: PetscFree2(permr,permc);
4259: ISDestroy(&cis);
4260: ISDestroy(&ris);
4261: PetscFree(tns);
4262: return(0);
4263: }
4267: /*@C
4268: MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4270: Not Collective
4272: Input Parameter:
4273: . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4275: Output Parameter:
4276: + node_count - no of inodes present in the matrix.
4277: . sizes - an array of size node_count,with sizes of each inode.
4278: - limit - the max size used to generate the inodes.
4280: Level: advanced
4282: Notes: This routine returns some internal storage information
4283: of the matrix, it is intended to be used by advanced users.
4284: It should be called after the matrix is assembled.
4285: The contents of the sizes[] array should not be changed.
4286: NULL may be passed for information not requested.
4288: .keywords: matrix, seqaij, get, inode
4290: .seealso: MatGetInfo()
4291: @*/
4292: PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4293: {
4294: PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4297: if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4298: PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);
4299: if (f) {
4300: (*f)(A,node_count,sizes,limit);
4301: }
4302: return(0);
4303: }
4307: PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4308: {
4309: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4312: if (node_count) *node_count = a->inode.node_count;
4313: if (sizes) *sizes = a->inode.size;
4314: if (limit) *limit = a->inode.limit;
4315: return(0);
4316: }