Actual source code: baijfact2.c
petsc-3.3-p7 2013-05-11
2: /*
3: Factorization code for BAIJ format.
4: */
6: #include <../src/mat/impls/baij/seq/baij.h>
7: #include <../src/mat/blockinvert.h>
8: #include <petscbt.h>
9: #include <../src/mat/utils/freespace.h>
13: PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14: {
15: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
16: PetscErrorCode ierr;
17: const PetscInt *adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
18: PetscInt i,n = a->mbs,j;
19: PetscInt nz;
20: PetscScalar *x,*tmp,s1;
21: const MatScalar *aa = a->a,*v;
22: const PetscScalar *b;
25: VecGetArrayRead(bb,&b);
26: VecGetArray(xx,&x);
27: tmp = a->solve_work;
30: /* copy the b into temp work space according to permutation */
31: for (i=0; i<n; i++) tmp[i] = b[i];
33: /* forward solve the U^T */
34: for (i=0; i<n; i++) {
35: v = aa + adiag[i+1] + 1;
36: vi = aj + adiag[i+1] + 1;
37: nz = adiag[i] - adiag[i+1] - 1;
38: s1 = tmp[i];
39: s1 *= v[nz]; /* multiply by inverse of diagonal entry */
40: for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
41: tmp[i] = s1;
42: }
44: /* backward solve the L^T */
45: for (i=n-1; i>=0; i--){
46: v = aa + ai[i];
47: vi = aj + ai[i];
48: nz = ai[i+1] - ai[i];
49: s1 = tmp[i];
50: for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
51: }
53: /* copy tmp into x according to permutation */
54: for (i=0; i<n; i++) x[i] = tmp[i];
56: VecRestoreArrayRead(bb,&b);
57: VecRestoreArray(xx,&x);
59: PetscLogFlops(2.0*a->nz-A->cmap->n);
60: return(0);
61: }
65: PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
66: {
67: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
68: PetscErrorCode ierr;
69: PetscInt i,nz;
70: const PetscInt *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
71: const MatScalar *aa=a->a,*v;
72: PetscScalar s1,*x;
75: VecCopy(bb,xx);
76: VecGetArray(xx,&x);
77:
78: /* forward solve the U^T */
79: for (i=0; i<n; i++) {
81: v = aa + diag[i];
82: /* multiply by the inverse of the block diagonal */
83: s1 = (*v++)*x[i];
84: vi = aj + diag[i] + 1;
85: nz = ai[i+1] - diag[i] - 1;
86: while (nz--) {
87: x[*vi++] -= (*v++)*s1;
88: }
89: x[i] = s1;
90: }
91: /* backward solve the L^T */
92: for (i=n-1; i>=0; i--){
93: v = aa + diag[i] - 1;
94: vi = aj + diag[i] - 1;
95: nz = diag[i] - ai[i];
96: s1 = x[i];
97: while (nz--) {
98: x[*vi--] -= (*v--)*s1;
99: }
100: }
101: VecRestoreArray(xx,&x);
102: PetscLogFlops(2.0*(a->nz) - A->cmap->n);
103: return(0);
104: }
108: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
109: {
110: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
111: PetscErrorCode ierr;
112: PetscInt i,nz,idx,idt,oidx;
113: const PetscInt *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
114: const MatScalar *aa=a->a,*v;
115: PetscScalar s1,s2,x1,x2,*x;
118: VecCopy(bb,xx);
119: VecGetArray(xx,&x);
121: /* forward solve the U^T */
122: idx = 0;
123: for (i=0; i<n; i++) {
125: v = aa + 4*diag[i];
126: /* multiply by the inverse of the block diagonal */
127: x1 = x[idx]; x2 = x[1+idx];
128: s1 = v[0]*x1 + v[1]*x2;
129: s2 = v[2]*x1 + v[3]*x2;
130: v += 4;
132: vi = aj + diag[i] + 1;
133: nz = ai[i+1] - diag[i] - 1;
134: while (nz--) {
135: oidx = 2*(*vi++);
136: x[oidx] -= v[0]*s1 + v[1]*s2;
137: x[oidx+1] -= v[2]*s1 + v[3]*s2;
138: v += 4;
139: }
140: x[idx] = s1;x[1+idx] = s2;
141: idx += 2;
142: }
143: /* backward solve the L^T */
144: for (i=n-1; i>=0; i--){
145: v = aa + 4*diag[i] - 4;
146: vi = aj + diag[i] - 1;
147: nz = diag[i] - ai[i];
148: idt = 2*i;
149: s1 = x[idt]; s2 = x[1+idt];
150: while (nz--) {
151: idx = 2*(*vi--);
152: x[idx] -= v[0]*s1 + v[1]*s2;
153: x[idx+1] -= v[2]*s1 + v[3]*s2;
154: v -= 4;
155: }
156: }
157: VecRestoreArray(xx,&x);
158: PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);
159: return(0);
160: }
164: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
165: {
166: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
167: PetscErrorCode ierr;
168: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
169: PetscInt nz,idx,idt,j,i,oidx;
170: const PetscInt bs=A->rmap->bs,bs2=a->bs2;
171: const MatScalar *aa=a->a,*v;
172: PetscScalar s1,s2,x1,x2,*x;
175: VecCopy(bb,xx);
176: VecGetArray(xx,&x);
178: /* forward solve the U^T */
179: idx = 0;
180: for (i=0; i<n; i++) {
181: v = aa + bs2*diag[i];
182: /* multiply by the inverse of the block diagonal */
183: x1 = x[idx]; x2 = x[1+idx];
184: s1 = v[0]*x1 + v[1]*x2;
185: s2 = v[2]*x1 + v[3]*x2;
186: v -= bs2;
188: vi = aj + diag[i] - 1;
189: nz = diag[i] - diag[i+1] - 1;
190: for(j=0;j>-nz;j--){
191: oidx = bs*vi[j];
192: x[oidx] -= v[0]*s1 + v[1]*s2;
193: x[oidx+1] -= v[2]*s1 + v[3]*s2;
194: v -= bs2;
195: }
196: x[idx] = s1;x[1+idx] = s2;
197: idx += bs;
198: }
199: /* backward solve the L^T */
200: for (i=n-1; i>=0; i--){
201: v = aa + bs2*ai[i];
202: vi = aj + ai[i];
203: nz = ai[i+1] - ai[i];
204: idt = bs*i;
205: s1 = x[idt]; s2 = x[1+idt];
206: for(j=0;j<nz;j++){
207: idx = bs*vi[j];
208: x[idx] -= v[0]*s1 + v[1]*s2;
209: x[idx+1] -= v[2]*s1 + v[3]*s2;
210: v += bs2;
211: }
212: }
213: VecRestoreArray(xx,&x);
214: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
215: return(0);
216: }
220: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
221: {
222: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
223: PetscErrorCode ierr;
224: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
225: PetscInt i,nz,idx,idt,oidx;
226: const MatScalar *aa=a->a,*v;
227: PetscScalar s1,s2,s3,x1,x2,x3,*x;
230: VecCopy(bb,xx);
231: VecGetArray(xx,&x);
233: /* forward solve the U^T */
234: idx = 0;
235: for (i=0; i<n; i++) {
237: v = aa + 9*diag[i];
238: /* multiply by the inverse of the block diagonal */
239: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
240: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
241: s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
242: s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
243: v += 9;
245: vi = aj + diag[i] + 1;
246: nz = ai[i+1] - diag[i] - 1;
247: while (nz--) {
248: oidx = 3*(*vi++);
249: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
250: x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
251: x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
252: v += 9;
253: }
254: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;
255: idx += 3;
256: }
257: /* backward solve the L^T */
258: for (i=n-1; i>=0; i--){
259: v = aa + 9*diag[i] - 9;
260: vi = aj + diag[i] - 1;
261: nz = diag[i] - ai[i];
262: idt = 3*i;
263: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];
264: while (nz--) {
265: idx = 3*(*vi--);
266: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
267: x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
268: x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
269: v -= 9;
270: }
271: }
272: VecRestoreArray(xx,&x);
273: PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);
274: return(0);
275: }
279: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
280: {
281: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
282: PetscErrorCode ierr;
283: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
284: PetscInt nz,idx,idt,j,i,oidx;
285: const PetscInt bs=A->rmap->bs,bs2=a->bs2;
286: const MatScalar *aa=a->a,*v;
287: PetscScalar s1,s2,s3,x1,x2,x3,*x;
290: VecCopy(bb,xx);
291: VecGetArray(xx,&x);
293: /* forward solve the U^T */
294: idx = 0;
295: for (i=0; i<n; i++) {
296: v = aa + bs2*diag[i];
297: /* multiply by the inverse of the block diagonal */
298: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
299: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
300: s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
301: s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
302: v -= bs2;
304: vi = aj + diag[i] - 1;
305: nz = diag[i] - diag[i+1] - 1;
306: for(j=0;j>-nz;j--){
307: oidx = bs*vi[j];
308: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
309: x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
310: x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
311: v -= bs2;
312: }
313: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;
314: idx += bs;
315: }
316: /* backward solve the L^T */
317: for (i=n-1; i>=0; i--){
318: v = aa + bs2*ai[i];
319: vi = aj + ai[i];
320: nz = ai[i+1] - ai[i];
321: idt = bs*i;
322: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];
323: for(j=0;j<nz;j++){
324: idx = bs*vi[j];
325: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
326: x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
327: x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
328: v += bs2;
329: }
330: }
331: VecRestoreArray(xx,&x);
332: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
333: return(0);
334: }
338: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
339: {
340: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
341: PetscErrorCode ierr;
342: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
343: PetscInt i,nz,idx,idt,oidx;
344: const MatScalar *aa=a->a,*v;
345: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x;
348: VecCopy(bb,xx);
349: VecGetArray(xx,&x);
351: /* forward solve the U^T */
352: idx = 0;
353: for (i=0; i<n; i++) {
355: v = aa + 16*diag[i];
356: /* multiply by the inverse of the block diagonal */
357: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
358: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
359: s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
360: s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
361: s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
362: v += 16;
364: vi = aj + diag[i] + 1;
365: nz = ai[i+1] - diag[i] - 1;
366: while (nz--) {
367: oidx = 4*(*vi++);
368: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
369: x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
370: x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
371: x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
372: v += 16;
373: }
374: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
375: idx += 4;
376: }
377: /* backward solve the L^T */
378: for (i=n-1; i>=0; i--){
379: v = aa + 16*diag[i] - 16;
380: vi = aj + diag[i] - 1;
381: nz = diag[i] - ai[i];
382: idt = 4*i;
383: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
384: while (nz--) {
385: idx = 4*(*vi--);
386: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
387: x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
388: x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
389: x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
390: v -= 16;
391: }
392: }
393: VecRestoreArray(xx,&x);
394: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
395: return(0);
396: }
400: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
401: {
402: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
403: PetscErrorCode ierr;
404: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
405: PetscInt nz,idx,idt,j,i,oidx;
406: const PetscInt bs=A->rmap->bs,bs2=a->bs2;
407: const MatScalar *aa=a->a,*v;
408: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x;
411: VecCopy(bb,xx);
412: VecGetArray(xx,&x);
414: /* forward solve the U^T */
415: idx = 0;
416: for (i=0; i<n; i++) {
417: v = aa + bs2*diag[i];
418: /* multiply by the inverse of the block diagonal */
419: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
420: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
421: s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
422: s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
423: s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
424: v -= bs2;
426: vi = aj + diag[i] - 1;
427: nz = diag[i] - diag[i+1] - 1;
428: for(j=0;j>-nz;j--){
429: oidx = bs*vi[j];
430: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
431: x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
432: x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
433: x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
434: v -= bs2;
435: }
436: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4;
437: idx += bs;
438: }
439: /* backward solve the L^T */
440: for (i=n-1; i>=0; i--){
441: v = aa + bs2*ai[i];
442: vi = aj + ai[i];
443: nz = ai[i+1] - ai[i];
444: idt = bs*i;
445: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt];
446: for(j=0;j<nz;j++){
447: idx = bs*vi[j];
448: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
449: x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
450: x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
451: x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
452: v += bs2;
453: }
454: }
455: VecRestoreArray(xx,&x);
456: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
457: return(0);
458: }
462: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
463: {
464: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
465: PetscErrorCode ierr;
466: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
467: PetscInt i,nz,idx,idt,oidx;
468: const MatScalar *aa=a->a,*v;
469: PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
472: VecCopy(bb,xx);
473: VecGetArray(xx,&x);
475: /* forward solve the U^T */
476: idx = 0;
477: for (i=0; i<n; i++) {
479: v = aa + 25*diag[i];
480: /* multiply by the inverse of the block diagonal */
481: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
482: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
483: s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
484: s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
485: s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
486: s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
487: v += 25;
489: vi = aj + diag[i] + 1;
490: nz = ai[i+1] - diag[i] - 1;
491: while (nz--) {
492: oidx = 5*(*vi++);
493: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
494: x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
495: x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
496: x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
497: x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
498: v += 25;
499: }
500: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
501: idx += 5;
502: }
503: /* backward solve the L^T */
504: for (i=n-1; i>=0; i--){
505: v = aa + 25*diag[i] - 25;
506: vi = aj + diag[i] - 1;
507: nz = diag[i] - ai[i];
508: idt = 5*i;
509: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
510: while (nz--) {
511: idx = 5*(*vi--);
512: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
513: x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
514: x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
515: x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
516: x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
517: v -= 25;
518: }
519: }
520: VecRestoreArray(xx,&x);
521: PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
522: return(0);
523: }
527: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
528: {
529: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
531: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
532: PetscInt nz,idx,idt,j,i,oidx;
533: const PetscInt bs=A->rmap->bs,bs2=a->bs2;
534: const MatScalar *aa=a->a,*v;
535: PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
538: VecCopy(bb,xx);
539: VecGetArray(xx,&x);
541: /* forward solve the U^T */
542: idx = 0;
543: for (i=0; i<n; i++) {
544: v = aa + bs2*diag[i];
545: /* multiply by the inverse of the block diagonal */
546: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
547: x5 = x[4+idx];
548: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
549: s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
550: s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
551: s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
552: s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
553: v -= bs2;
555: vi = aj + diag[i] - 1;
556: nz = diag[i] - diag[i+1] - 1;
557: for(j=0;j>-nz;j--){
558: oidx = bs*vi[j];
559: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
560: x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
561: x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
562: x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
563: x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
564: v -= bs2;
565: }
566: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5;
567: idx += bs;
568: }
569: /* backward solve the L^T */
570: for (i=n-1; i>=0; i--){
571: v = aa + bs2*ai[i];
572: vi = aj + ai[i];
573: nz = ai[i+1] - ai[i];
574: idt = bs*i;
575: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt];
576: for(j=0;j<nz;j++){
577: idx = bs*vi[j];
578: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
579: x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
580: x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
581: x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
582: x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
583: v += bs2;
584: }
585: }
586: VecRestoreArray(xx,&x);
587: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
588: return(0);
589: }
593: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
594: {
595: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
596: PetscErrorCode ierr;
597: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
598: PetscInt i,nz,idx,idt,oidx;
599: const MatScalar *aa=a->a,*v;
600: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
603: VecCopy(bb,xx);
604: VecGetArray(xx,&x);
606: /* forward solve the U^T */
607: idx = 0;
608: for (i=0; i<n; i++) {
610: v = aa + 36*diag[i];
611: /* multiply by the inverse of the block diagonal */
612: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
613: x6 = x[5+idx];
614: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
615: s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
616: s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
617: s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
618: s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
619: s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
620: v += 36;
622: vi = aj + diag[i] + 1;
623: nz = ai[i+1] - diag[i] - 1;
624: while (nz--) {
625: oidx = 6*(*vi++);
626: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
627: x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
628: x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
629: x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
630: x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
631: x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
632: v += 36;
633: }
634: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
635: x[5+idx] = s6;
636: idx += 6;
637: }
638: /* backward solve the L^T */
639: for (i=n-1; i>=0; i--){
640: v = aa + 36*diag[i] - 36;
641: vi = aj + diag[i] - 1;
642: nz = diag[i] - ai[i];
643: idt = 6*i;
644: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
645: s6 = x[5+idt];
646: while (nz--) {
647: idx = 6*(*vi--);
648: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
649: x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
650: x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
651: x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
652: x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
653: x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
654: v -= 36;
655: }
656: }
657: VecRestoreArray(xx,&x);
658: PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
659: return(0);
660: }
664: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
665: {
666: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
667: PetscErrorCode ierr;
668: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
669: PetscInt nz,idx,idt,j,i,oidx;
670: const PetscInt bs=A->rmap->bs,bs2=a->bs2;
671: const MatScalar *aa=a->a,*v;
672: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
675: VecCopy(bb,xx);
676: VecGetArray(xx,&x);
678: /* forward solve the U^T */
679: idx = 0;
680: for (i=0; i<n; i++) {
681: v = aa + bs2*diag[i];
682: /* multiply by the inverse of the block diagonal */
683: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
684: x5 = x[4+idx]; x6 = x[5+idx];
685: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
686: s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
687: s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
688: s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
689: s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
690: s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
691: v -= bs2;
693: vi = aj + diag[i] - 1;
694: nz = diag[i] - diag[i+1] - 1;
695: for(j=0;j>-nz;j--){
696: oidx = bs*vi[j];
697: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
698: x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
699: x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
700: x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
701: x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
702: x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
703: v -= bs2;
704: }
705: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5;
706: x[5+idx] = s6;
707: idx += bs;
708: }
709: /* backward solve the L^T */
710: for (i=n-1; i>=0; i--){
711: v = aa + bs2*ai[i];
712: vi = aj + ai[i];
713: nz = ai[i+1] - ai[i];
714: idt = bs*i;
715: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt];
716: s6 = x[5+idt];
717: for(j=0;j<nz;j++){
718: idx = bs*vi[j];
719: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
720: x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
721: x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
722: x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
723: x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
724: x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
725: v += bs2;
726: }
727: }
728: VecRestoreArray(xx,&x);
729: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
730: return(0);
731: }
735: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
736: {
737: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
738: PetscErrorCode ierr;
739: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
740: PetscInt i,nz,idx,idt,oidx;
741: const MatScalar *aa=a->a,*v;
742: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
745: VecCopy(bb,xx);
746: VecGetArray(xx,&x);
748: /* forward solve the U^T */
749: idx = 0;
750: for (i=0; i<n; i++) {
752: v = aa + 49*diag[i];
753: /* multiply by the inverse of the block diagonal */
754: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
755: x6 = x[5+idx]; x7 = x[6+idx];
756: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
757: s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
758: s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
759: s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
760: s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
761: s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
762: s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
763: v += 49;
765: vi = aj + diag[i] + 1;
766: nz = ai[i+1] - diag[i] - 1;
767: while (nz--) {
768: oidx = 7*(*vi++);
769: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
770: x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
771: x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
772: x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
773: x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
774: x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
775: x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
776: v += 49;
777: }
778: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
779: x[5+idx] = s6;x[6+idx] = s7;
780: idx += 7;
781: }
782: /* backward solve the L^T */
783: for (i=n-1; i>=0; i--){
784: v = aa + 49*diag[i] - 49;
785: vi = aj + diag[i] - 1;
786: nz = diag[i] - ai[i];
787: idt = 7*i;
788: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
789: s6 = x[5+idt];s7 = x[6+idt];
790: while (nz--) {
791: idx = 7*(*vi--);
792: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
793: x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
794: x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
795: x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
796: x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
797: x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
798: x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
799: v -= 49;
800: }
801: }
802: VecRestoreArray(xx,&x);
803: PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
804: return(0);
805: }
808: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
809: {
810: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
811: PetscErrorCode ierr;
812: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
813: PetscInt nz,idx,idt,j,i,oidx;
814: const PetscInt bs=A->rmap->bs,bs2=a->bs2;
815: const MatScalar *aa=a->a,*v;
816: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
819: VecCopy(bb,xx);
820: VecGetArray(xx,&x);
822: /* forward solve the U^T */
823: idx = 0;
824: for (i=0; i<n; i++) {
825: v = aa + bs2*diag[i];
826: /* multiply by the inverse of the block diagonal */
827: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
828: x5 = x[4+idx]; x6 = x[5+idx]; x7 = x[6+idx];
829: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
830: s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
831: s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
832: s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
833: s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
834: s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
835: s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
836: v -= bs2;
837: vi = aj + diag[i] - 1;
838: nz = diag[i] - diag[i+1] - 1;
839: for(j=0;j>-nz;j--){
840: oidx = bs*vi[j];
841: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
842: x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
843: x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
844: x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
845: x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
846: x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
847: x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
848: v -= bs2;
849: }
850: x[idx] = s1; x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5;
851: x[5+idx] = s6; x[6+idx] = s7;
852: idx += bs;
853: }
854: /* backward solve the L^T */
855: for (i=n-1; i>=0; i--){
856: v = aa + bs2*ai[i];
857: vi = aj + ai[i];
858: nz = ai[i+1] - ai[i];
859: idt = bs*i;
860: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt];
861: s6 = x[5+idt]; s7 = x[6+idt];
862: for(j=0;j<nz;j++){
863: idx = bs*vi[j];
864: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
865: x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
866: x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
867: x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
868: x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
869: x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
870: x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
871: v += bs2;
872: }
873: }
874: VecRestoreArray(xx,&x);
875: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
876: return(0);
877: }
879: /*---------------------------------------------------------------------------------------------*/
882: PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
883: {
884: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
885: IS iscol = a->col,isrow = a->row;
886: PetscErrorCode ierr;
887: const PetscInt *rout,*cout,*r,*c,*adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
888: PetscInt i,n = a->mbs,j;
889: PetscInt nz;
890: PetscScalar *x,*tmp,s1;
891: const MatScalar *aa = a->a,*v;
892: const PetscScalar *b;
895: VecGetArrayRead(bb,&b);
896: VecGetArray(xx,&x);
897: tmp = a->solve_work;
899: ISGetIndices(isrow,&rout); r = rout;
900: ISGetIndices(iscol,&cout); c = cout;
902: /* copy the b into temp work space according to permutation */
903: for (i=0; i<n; i++) tmp[i] = b[c[i]];
905: /* forward solve the U^T */
906: for (i=0; i<n; i++) {
907: v = aa + adiag[i+1] + 1;
908: vi = aj + adiag[i+1] + 1;
909: nz = adiag[i] - adiag[i+1] - 1;
910: s1 = tmp[i];
911: s1 *= v[nz]; /* multiply by inverse of diagonal entry */
912: for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
913: tmp[i] = s1;
914: }
916: /* backward solve the L^T */
917: for (i=n-1; i>=0; i--){
918: v = aa + ai[i];
919: vi = aj + ai[i];
920: nz = ai[i+1] - ai[i];
921: s1 = tmp[i];
922: for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
923: }
925: /* copy tmp into x according to permutation */
926: for (i=0; i<n; i++) x[r[i]] = tmp[i];
928: ISRestoreIndices(isrow,&rout);
929: ISRestoreIndices(iscol,&cout);
930: VecRestoreArrayRead(bb,&b);
931: VecRestoreArray(xx,&x);
933: PetscLogFlops(2.0*a->nz-A->cmap->n);
934: return(0);
935: }
939: PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
940: {
941: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
942: IS iscol=a->col,isrow=a->row;
943: PetscErrorCode ierr;
944: const PetscInt *r,*c,*rout,*cout;
945: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
946: PetscInt i,nz;
947: const MatScalar *aa=a->a,*v;
948: PetscScalar s1,*x,*t;
949: const PetscScalar *b;
952: VecGetArrayRead(bb,&b);
953: VecGetArray(xx,&x);
954: t = a->solve_work;
956: ISGetIndices(isrow,&rout); r = rout;
957: ISGetIndices(iscol,&cout); c = cout;
959: /* copy the b into temp work space according to permutation */
960: for (i=0; i<n; i++) {
961: t[i] = b[c[i]];
962: }
964: /* forward solve the U^T */
965: for (i=0; i<n; i++) {
967: v = aa + diag[i];
968: /* multiply by the inverse of the block diagonal */
969: s1 = (*v++)*t[i];
970: vi = aj + diag[i] + 1;
971: nz = ai[i+1] - diag[i] - 1;
972: while (nz--) {
973: t[*vi++] -= (*v++)*s1;
974: }
975: t[i] = s1;
976: }
977: /* backward solve the L^T */
978: for (i=n-1; i>=0; i--){
979: v = aa + diag[i] - 1;
980: vi = aj + diag[i] - 1;
981: nz = diag[i] - ai[i];
982: s1 = t[i];
983: while (nz--) {
984: t[*vi--] -= (*v--)*s1;
985: }
986: }
988: /* copy t into x according to permutation */
989: for (i=0; i<n; i++) {
990: x[r[i]] = t[i];
991: }
993: ISRestoreIndices(isrow,&rout);
994: ISRestoreIndices(iscol,&cout);
995: VecRestoreArrayRead(bb,&b);
996: VecRestoreArray(xx,&x);
997: PetscLogFlops(2.0*(a->nz) - A->cmap->n);
998: return(0);
999: }
1003: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1004: {
1005: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1006: IS iscol=a->col,isrow=a->row;
1007: PetscErrorCode ierr;
1008: const PetscInt *r,*c,*rout,*cout;
1009: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1010: PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
1011: const MatScalar *aa=a->a,*v;
1012: PetscScalar s1,s2,x1,x2,*x,*t;
1013: const PetscScalar *b;
1016: VecGetArrayRead(bb,&b);
1017: VecGetArray(xx,&x);
1018: t = a->solve_work;
1020: ISGetIndices(isrow,&rout); r = rout;
1021: ISGetIndices(iscol,&cout); c = cout;
1023: /* copy the b into temp work space according to permutation */
1024: ii = 0;
1025: for (i=0; i<n; i++) {
1026: ic = 2*c[i];
1027: t[ii] = b[ic];
1028: t[ii+1] = b[ic+1];
1029: ii += 2;
1030: }
1032: /* forward solve the U^T */
1033: idx = 0;
1034: for (i=0; i<n; i++) {
1036: v = aa + 4*diag[i];
1037: /* multiply by the inverse of the block diagonal */
1038: x1 = t[idx]; x2 = t[1+idx];
1039: s1 = v[0]*x1 + v[1]*x2;
1040: s2 = v[2]*x1 + v[3]*x2;
1041: v += 4;
1043: vi = aj + diag[i] + 1;
1044: nz = ai[i+1] - diag[i] - 1;
1045: while (nz--) {
1046: oidx = 2*(*vi++);
1047: t[oidx] -= v[0]*s1 + v[1]*s2;
1048: t[oidx+1] -= v[2]*s1 + v[3]*s2;
1049: v += 4;
1050: }
1051: t[idx] = s1;t[1+idx] = s2;
1052: idx += 2;
1053: }
1054: /* backward solve the L^T */
1055: for (i=n-1; i>=0; i--){
1056: v = aa + 4*diag[i] - 4;
1057: vi = aj + diag[i] - 1;
1058: nz = diag[i] - ai[i];
1059: idt = 2*i;
1060: s1 = t[idt]; s2 = t[1+idt];
1061: while (nz--) {
1062: idx = 2*(*vi--);
1063: t[idx] -= v[0]*s1 + v[1]*s2;
1064: t[idx+1] -= v[2]*s1 + v[3]*s2;
1065: v -= 4;
1066: }
1067: }
1069: /* copy t into x according to permutation */
1070: ii = 0;
1071: for (i=0; i<n; i++) {
1072: ir = 2*r[i];
1073: x[ir] = t[ii];
1074: x[ir+1] = t[ii+1];
1075: ii += 2;
1076: }
1078: ISRestoreIndices(isrow,&rout);
1079: ISRestoreIndices(iscol,&cout);
1080: VecRestoreArrayRead(bb,&b);
1081: VecRestoreArray(xx,&x);
1082: PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
1083: return(0);
1084: }
1088: PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
1089: {
1090: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1091: PetscErrorCode ierr;
1092: IS iscol=a->col,isrow=a->row;
1093: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1094: const PetscInt *r,*c,*rout,*cout;
1095: PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
1096: const PetscInt bs=A->rmap->bs,bs2=a->bs2;
1097: const MatScalar *aa=a->a,*v;
1098: PetscScalar s1,s2,x1,x2,*x,*t;
1099: const PetscScalar *b;
1102: VecGetArrayRead(bb,&b);
1103: VecGetArray(xx,&x);
1104: t = a->solve_work;
1106: ISGetIndices(isrow,&rout); r = rout;
1107: ISGetIndices(iscol,&cout); c = cout;
1109: /* copy b into temp work space according to permutation */
1110: for(i=0;i<n;i++){
1111: ii = bs*i; ic = bs*c[i];
1112: t[ii] = b[ic]; t[ii+1] = b[ic+1];
1113: }
1115: /* forward solve the U^T */
1116: idx = 0;
1117: for (i=0; i<n; i++) {
1118: v = aa + bs2*diag[i];
1119: /* multiply by the inverse of the block diagonal */
1120: x1 = t[idx]; x2 = t[1+idx];
1121: s1 = v[0]*x1 + v[1]*x2;
1122: s2 = v[2]*x1 + v[3]*x2;
1123: v -= bs2;
1125: vi = aj + diag[i] - 1;
1126: nz = diag[i] - diag[i+1] - 1;
1127: for(j=0;j>-nz;j--){
1128: oidx = bs*vi[j];
1129: t[oidx] -= v[0]*s1 + v[1]*s2;
1130: t[oidx+1] -= v[2]*s1 + v[3]*s2;
1131: v -= bs2;
1132: }
1133: t[idx] = s1;t[1+idx] = s2;
1134: idx += bs;
1135: }
1136: /* backward solve the L^T */
1137: for (i=n-1; i>=0; i--){
1138: v = aa + bs2*ai[i];
1139: vi = aj + ai[i];
1140: nz = ai[i+1] - ai[i];
1141: idt = bs*i;
1142: s1 = t[idt]; s2 = t[1+idt];
1143: for(j=0;j<nz;j++){
1144: idx = bs*vi[j];
1145: t[idx] -= v[0]*s1 + v[1]*s2;
1146: t[idx+1] -= v[2]*s1 + v[3]*s2;
1147: v += bs2;
1148: }
1149: }
1151: /* copy t into x according to permutation */
1152: for(i=0;i<n;i++){
1153: ii = bs*i; ir = bs*r[i];
1154: x[ir] = t[ii]; x[ir+1] = t[ii+1];
1155: }
1157: ISRestoreIndices(isrow,&rout);
1158: ISRestoreIndices(iscol,&cout);
1159: VecRestoreArrayRead(bb,&b);
1160: VecRestoreArray(xx,&x);
1161: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1162: return(0);
1163: }
1167: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1168: {
1169: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1170: IS iscol=a->col,isrow=a->row;
1171: PetscErrorCode ierr;
1172: const PetscInt *r,*c,*rout,*cout;
1173: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1174: PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
1175: const MatScalar *aa=a->a,*v;
1176: PetscScalar s1,s2,s3,x1,x2,x3,*x,*t;
1177: const PetscScalar *b;
1180: VecGetArrayRead(bb,&b);
1181: VecGetArray(xx,&x);
1182: t = a->solve_work;
1184: ISGetIndices(isrow,&rout); r = rout;
1185: ISGetIndices(iscol,&cout); c = cout;
1187: /* copy the b into temp work space according to permutation */
1188: ii = 0;
1189: for (i=0; i<n; i++) {
1190: ic = 3*c[i];
1191: t[ii] = b[ic];
1192: t[ii+1] = b[ic+1];
1193: t[ii+2] = b[ic+2];
1194: ii += 3;
1195: }
1197: /* forward solve the U^T */
1198: idx = 0;
1199: for (i=0; i<n; i++) {
1201: v = aa + 9*diag[i];
1202: /* multiply by the inverse of the block diagonal */
1203: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1204: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
1205: s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
1206: s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
1207: v += 9;
1209: vi = aj + diag[i] + 1;
1210: nz = ai[i+1] - diag[i] - 1;
1211: while (nz--) {
1212: oidx = 3*(*vi++);
1213: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
1214: t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
1215: t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1216: v += 9;
1217: }
1218: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;
1219: idx += 3;
1220: }
1221: /* backward solve the L^T */
1222: for (i=n-1; i>=0; i--){
1223: v = aa + 9*diag[i] - 9;
1224: vi = aj + diag[i] - 1;
1225: nz = diag[i] - ai[i];
1226: idt = 3*i;
1227: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
1228: while (nz--) {
1229: idx = 3*(*vi--);
1230: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
1231: t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
1232: t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1233: v -= 9;
1234: }
1235: }
1237: /* copy t into x according to permutation */
1238: ii = 0;
1239: for (i=0; i<n; i++) {
1240: ir = 3*r[i];
1241: x[ir] = t[ii];
1242: x[ir+1] = t[ii+1];
1243: x[ir+2] = t[ii+2];
1244: ii += 3;
1245: }
1247: ISRestoreIndices(isrow,&rout);
1248: ISRestoreIndices(iscol,&cout);
1249: VecRestoreArrayRead(bb,&b);
1250: VecRestoreArray(xx,&x);
1251: PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
1252: return(0);
1253: }
1257: PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1258: {
1259: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1260: PetscErrorCode ierr;
1261: IS iscol=a->col,isrow=a->row;
1262: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1263: const PetscInt *r,*c,*rout,*cout;
1264: PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
1265: const PetscInt bs=A->rmap->bs,bs2=a->bs2;
1266: const MatScalar *aa=a->a,*v;
1267: PetscScalar s1,s2,s3,x1,x2,x3,*x,*t;
1268: const PetscScalar *b;
1271: VecGetArrayRead(bb,&b);
1272: VecGetArray(xx,&x);
1273: t = a->solve_work;
1275: ISGetIndices(isrow,&rout); r = rout;
1276: ISGetIndices(iscol,&cout); c = cout;
1278: /* copy b into temp work space according to permutation */
1279: for(i=0;i<n;i++){
1280: ii = bs*i; ic = bs*c[i];
1281: t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2];
1282: }
1284: /* forward solve the U^T */
1285: idx = 0;
1286: for (i=0; i<n; i++) {
1287: v = aa + bs2*diag[i];
1288: /* multiply by the inverse of the block diagonal */
1289: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1290: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
1291: s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
1292: s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
1293: v -= bs2;
1295: vi = aj + diag[i] - 1;
1296: nz = diag[i] - diag[i+1] - 1;
1297: for(j=0;j>-nz;j--){
1298: oidx = bs*vi[j];
1299: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
1300: t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
1301: t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1302: v -= bs2;
1303: }
1304: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;
1305: idx += bs;
1306: }
1307: /* backward solve the L^T */
1308: for (i=n-1; i>=0; i--){
1309: v = aa + bs2*ai[i];
1310: vi = aj + ai[i];
1311: nz = ai[i+1] - ai[i];
1312: idt = bs*i;
1313: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
1314: for(j=0;j<nz;j++){
1315: idx = bs*vi[j];
1316: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
1317: t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
1318: t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1319: v += bs2;
1320: }
1321: }
1323: /* copy t into x according to permutation */
1324: for(i=0;i<n;i++){
1325: ii = bs*i; ir = bs*r[i];
1326: x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];
1327: }
1329: ISRestoreIndices(isrow,&rout);
1330: ISRestoreIndices(iscol,&cout);
1331: VecRestoreArrayRead(bb,&b);
1332: VecRestoreArray(xx,&x);
1333: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1334: return(0);
1335: }
1339: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
1340: {
1341: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1342: IS iscol=a->col,isrow=a->row;
1343: PetscErrorCode ierr;
1344: const PetscInt *r,*c,*rout,*cout;
1345: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1346: PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
1347: const MatScalar *aa=a->a,*v;
1348: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1349: const PetscScalar *b;
1352: VecGetArrayRead(bb,&b);
1353: VecGetArray(xx,&x);
1354: t = a->solve_work;
1356: ISGetIndices(isrow,&rout); r = rout;
1357: ISGetIndices(iscol,&cout); c = cout;
1359: /* copy the b into temp work space according to permutation */
1360: ii = 0;
1361: for (i=0; i<n; i++) {
1362: ic = 4*c[i];
1363: t[ii] = b[ic];
1364: t[ii+1] = b[ic+1];
1365: t[ii+2] = b[ic+2];
1366: t[ii+3] = b[ic+3];
1367: ii += 4;
1368: }
1370: /* forward solve the U^T */
1371: idx = 0;
1372: for (i=0; i<n; i++) {
1374: v = aa + 16*diag[i];
1375: /* multiply by the inverse of the block diagonal */
1376: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx];
1377: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
1378: s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
1379: s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
1380: s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1381: v += 16;
1383: vi = aj + diag[i] + 1;
1384: nz = ai[i+1] - diag[i] - 1;
1385: while (nz--) {
1386: oidx = 4*(*vi++);
1387: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
1388: t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
1389: t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1390: t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1391: v += 16;
1392: }
1393: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
1394: idx += 4;
1395: }
1396: /* backward solve the L^T */
1397: for (i=n-1; i>=0; i--){
1398: v = aa + 16*diag[i] - 16;
1399: vi = aj + diag[i] - 1;
1400: nz = diag[i] - ai[i];
1401: idt = 4*i;
1402: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
1403: while (nz--) {
1404: idx = 4*(*vi--);
1405: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
1406: t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
1407: t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1408: t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1409: v -= 16;
1410: }
1411: }
1413: /* copy t into x according to permutation */
1414: ii = 0;
1415: for (i=0; i<n; i++) {
1416: ir = 4*r[i];
1417: x[ir] = t[ii];
1418: x[ir+1] = t[ii+1];
1419: x[ir+2] = t[ii+2];
1420: x[ir+3] = t[ii+3];
1421: ii += 4;
1422: }
1424: ISRestoreIndices(isrow,&rout);
1425: ISRestoreIndices(iscol,&cout);
1426: VecRestoreArrayRead(bb,&b);
1427: VecRestoreArray(xx,&x);
1428: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
1429: return(0);
1430: }
1434: PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1435: {
1436: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1437: PetscErrorCode ierr;
1438: IS iscol=a->col,isrow=a->row;
1439: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1440: const PetscInt *r,*c,*rout,*cout;
1441: PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
1442: const PetscInt bs=A->rmap->bs,bs2=a->bs2;
1443: const MatScalar *aa=a->a,*v;
1444: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1445: const PetscScalar *b;
1448: VecGetArrayRead(bb,&b);
1449: VecGetArray(xx,&x);
1450: t = a->solve_work;
1452: ISGetIndices(isrow,&rout); r = rout;
1453: ISGetIndices(iscol,&cout); c = cout;
1455: /* copy b into temp work space according to permutation */
1456: for(i=0;i<n;i++){
1457: ii = bs*i; ic = bs*c[i];
1458: t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1459: }
1461: /* forward solve the U^T */
1462: idx = 0;
1463: for (i=0; i<n; i++) {
1464: v = aa + bs2*diag[i];
1465: /* multiply by the inverse of the block diagonal */
1466: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx];
1467: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
1468: s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
1469: s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
1470: s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1471: v -= bs2;
1473: vi = aj + diag[i] - 1;
1474: nz = diag[i] - diag[i+1] - 1;
1475: for(j=0;j>-nz;j--){
1476: oidx = bs*vi[j];
1477: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
1478: t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
1479: t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1480: t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1481: v -= bs2;
1482: }
1483: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4;
1484: idx += bs;
1485: }
1486: /* backward solve the L^T */
1487: for (i=n-1; i>=0; i--){
1488: v = aa + bs2*ai[i];
1489: vi = aj + ai[i];
1490: nz = ai[i+1] - ai[i];
1491: idt = bs*i;
1492: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt];
1493: for(j=0;j<nz;j++){
1494: idx = bs*vi[j];
1495: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
1496: t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
1497: t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1498: t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1499: v += bs2;
1500: }
1501: }
1503: /* copy t into x according to permutation */
1504: for(i=0;i<n;i++){
1505: ii = bs*i; ir = bs*r[i];
1506: x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3];
1507: }
1509: ISRestoreIndices(isrow,&rout);
1510: ISRestoreIndices(iscol,&cout);
1511: VecRestoreArrayRead(bb,&b);
1512: VecRestoreArray(xx,&x);
1513: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1514: return(0);
1515: }
1519: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
1520: {
1521: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1522: IS iscol=a->col,isrow=a->row;
1523: PetscErrorCode ierr;
1524: const PetscInt *r,*c,*rout,*cout;
1525: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1526: PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
1527: const MatScalar *aa=a->a,*v;
1528: PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1529: const PetscScalar *b;
1532: VecGetArrayRead(bb,&b);
1533: VecGetArray(xx,&x);
1534: t = a->solve_work;
1536: ISGetIndices(isrow,&rout); r = rout;
1537: ISGetIndices(iscol,&cout); c = cout;
1539: /* copy the b into temp work space according to permutation */
1540: ii = 0;
1541: for (i=0; i<n; i++) {
1542: ic = 5*c[i];
1543: t[ii] = b[ic];
1544: t[ii+1] = b[ic+1];
1545: t[ii+2] = b[ic+2];
1546: t[ii+3] = b[ic+3];
1547: t[ii+4] = b[ic+4];
1548: ii += 5;
1549: }
1551: /* forward solve the U^T */
1552: idx = 0;
1553: for (i=0; i<n; i++) {
1555: v = aa + 25*diag[i];
1556: /* multiply by the inverse of the block diagonal */
1557: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1558: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
1559: s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
1560: s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1561: s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1562: s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1563: v += 25;
1565: vi = aj + diag[i] + 1;
1566: nz = ai[i+1] - diag[i] - 1;
1567: while (nz--) {
1568: oidx = 5*(*vi++);
1569: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
1570: t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
1571: t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1572: t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1573: t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1574: v += 25;
1575: }
1576: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1577: idx += 5;
1578: }
1579: /* backward solve the L^T */
1580: for (i=n-1; i>=0; i--){
1581: v = aa + 25*diag[i] - 25;
1582: vi = aj + diag[i] - 1;
1583: nz = diag[i] - ai[i];
1584: idt = 5*i;
1585: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1586: while (nz--) {
1587: idx = 5*(*vi--);
1588: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
1589: t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
1590: t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1591: t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1592: t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1593: v -= 25;
1594: }
1595: }
1597: /* copy t into x according to permutation */
1598: ii = 0;
1599: for (i=0; i<n; i++) {
1600: ir = 5*r[i];
1601: x[ir] = t[ii];
1602: x[ir+1] = t[ii+1];
1603: x[ir+2] = t[ii+2];
1604: x[ir+3] = t[ii+3];
1605: x[ir+4] = t[ii+4];
1606: ii += 5;
1607: }
1609: ISRestoreIndices(isrow,&rout);
1610: ISRestoreIndices(iscol,&cout);
1611: VecRestoreArrayRead(bb,&b);
1612: VecRestoreArray(xx,&x);
1613: PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
1614: return(0);
1615: }
1619: PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1620: {
1621: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1622: PetscErrorCode ierr;
1623: IS iscol=a->col,isrow=a->row;
1624: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1625: const PetscInt *r,*c,*rout,*cout;
1626: PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
1627: const PetscInt bs=A->rmap->bs,bs2=a->bs2;
1628: const MatScalar *aa=a->a,*v;
1629: PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1630: const PetscScalar *b;
1633: VecGetArrayRead(bb,&b);
1634: VecGetArray(xx,&x);
1635: t = a->solve_work;
1637: ISGetIndices(isrow,&rout); r = rout;
1638: ISGetIndices(iscol,&cout); c = cout;
1640: /* copy b into temp work space according to permutation */
1641: for(i=0;i<n;i++){
1642: ii = bs*i; ic = bs*c[i];
1643: t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1644: t[ii+4] = b[ic+4];
1645: }
1647: /* forward solve the U^T */
1648: idx = 0;
1649: for (i=0; i<n; i++) {
1650: v = aa + bs2*diag[i];
1651: /* multiply by the inverse of the block diagonal */
1652: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1653: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
1654: s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
1655: s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1656: s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1657: s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1658: v -= bs2;
1660: vi = aj + diag[i] - 1;
1661: nz = diag[i] - diag[i+1] - 1;
1662: for(j=0;j>-nz;j--){
1663: oidx = bs*vi[j];
1664: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
1665: t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
1666: t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1667: t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1668: t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1669: v -= bs2;
1670: }
1671: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5;
1672: idx += bs;
1673: }
1674: /* backward solve the L^T */
1675: for (i=n-1; i>=0; i--){
1676: v = aa + bs2*ai[i];
1677: vi = aj + ai[i];
1678: nz = ai[i+1] - ai[i];
1679: idt = bs*i;
1680: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt];
1681: for(j=0;j<nz;j++){
1682: idx = bs*vi[j];
1683: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
1684: t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
1685: t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1686: t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1687: t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1688: v += bs2;
1689: }
1690: }
1692: /* copy t into x according to permutation */
1693: for(i=0;i<n;i++){
1694: ii = bs*i; ir = bs*r[i];
1695: x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3];
1696: x[ir+4] = t[ii+4];
1697: }
1699: ISRestoreIndices(isrow,&rout);
1700: ISRestoreIndices(iscol,&cout);
1701: VecRestoreArrayRead(bb,&b);
1702: VecRestoreArray(xx,&x);
1703: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1704: return(0);
1705: }
1709: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
1710: {
1711: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1712: IS iscol=a->col,isrow=a->row;
1713: PetscErrorCode ierr;
1714: const PetscInt *r,*c,*rout,*cout;
1715: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1716: PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
1717: const MatScalar *aa=a->a,*v;
1718: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1719: const PetscScalar *b;
1722: VecGetArrayRead(bb,&b);
1723: VecGetArray(xx,&x);
1724: t = a->solve_work;
1726: ISGetIndices(isrow,&rout); r = rout;
1727: ISGetIndices(iscol,&cout); c = cout;
1729: /* copy the b into temp work space according to permutation */
1730: ii = 0;
1731: for (i=0; i<n; i++) {
1732: ic = 6*c[i];
1733: t[ii] = b[ic];
1734: t[ii+1] = b[ic+1];
1735: t[ii+2] = b[ic+2];
1736: t[ii+3] = b[ic+3];
1737: t[ii+4] = b[ic+4];
1738: t[ii+5] = b[ic+5];
1739: ii += 6;
1740: }
1742: /* forward solve the U^T */
1743: idx = 0;
1744: for (i=0; i<n; i++) {
1746: v = aa + 36*diag[i];
1747: /* multiply by the inverse of the block diagonal */
1748: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1749: x6 = t[5+idx];
1750: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
1751: s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
1752: s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1753: s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1754: s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1755: s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1756: v += 36;
1758: vi = aj + diag[i] + 1;
1759: nz = ai[i+1] - diag[i] - 1;
1760: while (nz--) {
1761: oidx = 6*(*vi++);
1762: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
1763: t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
1764: t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1765: t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1766: t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1767: t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1768: v += 36;
1769: }
1770: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1771: t[5+idx] = s6;
1772: idx += 6;
1773: }
1774: /* backward solve the L^T */
1775: for (i=n-1; i>=0; i--){
1776: v = aa + 36*diag[i] - 36;
1777: vi = aj + diag[i] - 1;
1778: nz = diag[i] - ai[i];
1779: idt = 6*i;
1780: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1781: s6 = t[5+idt];
1782: while (nz--) {
1783: idx = 6*(*vi--);
1784: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
1785: t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
1786: t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1787: t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1788: t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1789: t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1790: v -= 36;
1791: }
1792: }
1794: /* copy t into x according to permutation */
1795: ii = 0;
1796: for (i=0; i<n; i++) {
1797: ir = 6*r[i];
1798: x[ir] = t[ii];
1799: x[ir+1] = t[ii+1];
1800: x[ir+2] = t[ii+2];
1801: x[ir+3] = t[ii+3];
1802: x[ir+4] = t[ii+4];
1803: x[ir+5] = t[ii+5];
1804: ii += 6;
1805: }
1807: ISRestoreIndices(isrow,&rout);
1808: ISRestoreIndices(iscol,&cout);
1809: VecRestoreArrayRead(bb,&b);
1810: VecRestoreArray(xx,&x);
1811: PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
1812: return(0);
1813: }
1817: PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1818: {
1819: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1820: PetscErrorCode ierr;
1821: IS iscol=a->col,isrow=a->row;
1822: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1823: const PetscInt *r,*c,*rout,*cout;
1824: PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
1825: const PetscInt bs=A->rmap->bs,bs2=a->bs2;
1826: const MatScalar *aa=a->a,*v;
1827: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1828: const PetscScalar *b;
1831: VecGetArrayRead(bb,&b);
1832: VecGetArray(xx,&x);
1833: t = a->solve_work;
1835: ISGetIndices(isrow,&rout); r = rout;
1836: ISGetIndices(iscol,&cout); c = cout;
1838: /* copy b into temp work space according to permutation */
1839: for(i=0;i<n;i++){
1840: ii = bs*i; ic = bs*c[i];
1841: t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1842: t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5];
1843: }
1845: /* forward solve the U^T */
1846: idx = 0;
1847: for (i=0; i<n; i++) {
1848: v = aa + bs2*diag[i];
1849: /* multiply by the inverse of the block diagonal */
1850: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1851: x6 = t[5+idx];
1852: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
1853: s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
1854: s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1855: s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1856: s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1857: s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1858: v -= bs2;
1860: vi = aj + diag[i] - 1;
1861: nz = diag[i] - diag[i+1] - 1;
1862: for(j=0;j>-nz;j--){
1863: oidx = bs*vi[j];
1864: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
1865: t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
1866: t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1867: t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1868: t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1869: t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1870: v -= bs2;
1871: }
1872: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5;
1873: t[5+idx] = s6;
1874: idx += bs;
1875: }
1876: /* backward solve the L^T */
1877: for (i=n-1; i>=0; i--){
1878: v = aa + bs2*ai[i];
1879: vi = aj + ai[i];
1880: nz = ai[i+1] - ai[i];
1881: idt = bs*i;
1882: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt];
1883: s6 = t[5+idt];
1884: for(j=0;j<nz;j++){
1885: idx = bs*vi[j];
1886: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
1887: t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
1888: t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1889: t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1890: t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1891: t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1892: v += bs2;
1893: }
1894: }
1896: /* copy t into x according to permutation */
1897: for(i=0;i<n;i++){
1898: ii = bs*i; ir = bs*r[i];
1899: x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3];
1900: x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5];
1901: }
1903: ISRestoreIndices(isrow,&rout);
1904: ISRestoreIndices(iscol,&cout);
1905: VecRestoreArrayRead(bb,&b);
1906: VecRestoreArray(xx,&x);
1907: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1908: return(0);
1909: }
1913: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
1914: {
1915: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1916: IS iscol=a->col,isrow=a->row;
1917: PetscErrorCode ierr;
1918: const PetscInt *r,*c,*rout,*cout;
1919: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1920: PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
1921: const MatScalar *aa=a->a,*v;
1922: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1923: const PetscScalar *b;
1926: VecGetArrayRead(bb,&b);
1927: VecGetArray(xx,&x);
1928: t = a->solve_work;
1930: ISGetIndices(isrow,&rout); r = rout;
1931: ISGetIndices(iscol,&cout); c = cout;
1933: /* copy the b into temp work space according to permutation */
1934: ii = 0;
1935: for (i=0; i<n; i++) {
1936: ic = 7*c[i];
1937: t[ii] = b[ic];
1938: t[ii+1] = b[ic+1];
1939: t[ii+2] = b[ic+2];
1940: t[ii+3] = b[ic+3];
1941: t[ii+4] = b[ic+4];
1942: t[ii+5] = b[ic+5];
1943: t[ii+6] = b[ic+6];
1944: ii += 7;
1945: }
1947: /* forward solve the U^T */
1948: idx = 0;
1949: for (i=0; i<n; i++) {
1951: v = aa + 49*diag[i];
1952: /* multiply by the inverse of the block diagonal */
1953: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1954: x6 = t[5+idx]; x7 = t[6+idx];
1955: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
1956: s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1957: s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1958: s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1959: s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1960: s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1961: s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1962: v += 49;
1964: vi = aj + diag[i] + 1;
1965: nz = ai[i+1] - diag[i] - 1;
1966: while (nz--) {
1967: oidx = 7*(*vi++);
1968: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
1969: t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1970: t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1971: t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1972: t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1973: t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1974: t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1975: v += 49;
1976: }
1977: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1978: t[5+idx] = s6;t[6+idx] = s7;
1979: idx += 7;
1980: }
1981: /* backward solve the L^T */
1982: for (i=n-1; i>=0; i--){
1983: v = aa + 49*diag[i] - 49;
1984: vi = aj + diag[i] - 1;
1985: nz = diag[i] - ai[i];
1986: idt = 7*i;
1987: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1988: s6 = t[5+idt];s7 = t[6+idt];
1989: while (nz--) {
1990: idx = 7*(*vi--);
1991: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
1992: t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1993: t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1994: t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1995: t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1996: t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1997: t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1998: v -= 49;
1999: }
2000: }
2002: /* copy t into x according to permutation */
2003: ii = 0;
2004: for (i=0; i<n; i++) {
2005: ir = 7*r[i];
2006: x[ir] = t[ii];
2007: x[ir+1] = t[ii+1];
2008: x[ir+2] = t[ii+2];
2009: x[ir+3] = t[ii+3];
2010: x[ir+4] = t[ii+4];
2011: x[ir+5] = t[ii+5];
2012: x[ir+6] = t[ii+6];
2013: ii += 7;
2014: }
2016: ISRestoreIndices(isrow,&rout);
2017: ISRestoreIndices(iscol,&cout);
2018: VecRestoreArrayRead(bb,&b);
2019: VecRestoreArray(xx,&x);
2020: PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
2021: return(0);
2022: }
2025: PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
2026: {
2027: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2028: PetscErrorCode ierr;
2029: IS iscol=a->col,isrow=a->row;
2030: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
2031: const PetscInt *r,*c,*rout,*cout;
2032: PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
2033: const PetscInt bs=A->rmap->bs,bs2=a->bs2;
2034: const MatScalar *aa=a->a,*v;
2035: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2036: const PetscScalar *b;
2039: VecGetArrayRead(bb,&b);
2040: VecGetArray(xx,&x);
2041: t = a->solve_work;
2043: ISGetIndices(isrow,&rout); r = rout;
2044: ISGetIndices(iscol,&cout); c = cout;
2046: /* copy b into temp work space according to permutation */
2047: for(i=0;i<n;i++){
2048: ii = bs*i; ic = bs*c[i];
2049: t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
2050: t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; t[ii+6] = b[ic+6];
2051: }
2053: /* forward solve the U^T */
2054: idx = 0;
2055: for (i=0; i<n; i++) {
2056: v = aa + bs2*diag[i];
2057: /* multiply by the inverse of the block diagonal */
2058: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2059: x6 = t[5+idx]; x7 = t[6+idx];
2060: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
2061: s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
2062: s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
2063: s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
2064: s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
2065: s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
2066: s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
2067: v -= bs2;
2069: vi = aj + diag[i] - 1;
2070: nz = diag[i] - diag[i+1] - 1;
2071: for(j=0;j>-nz;j--){
2072: oidx = bs*vi[j];
2073: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
2074: t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2075: t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2076: t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2077: t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2078: t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2079: t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2080: v -= bs2;
2081: }
2082: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5;
2083: t[5+idx] = s6; t[6+idx] = s7;
2084: idx += bs;
2085: }
2086: /* backward solve the L^T */
2087: for (i=n-1; i>=0; i--){
2088: v = aa + bs2*ai[i];
2089: vi = aj + ai[i];
2090: nz = ai[i+1] - ai[i];
2091: idt = bs*i;
2092: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt];
2093: s6 = t[5+idt]; s7 = t[6+idt];
2094: for(j=0;j<nz;j++){
2095: idx = bs*vi[j];
2096: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
2097: t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2098: t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2099: t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2100: t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2101: t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2102: t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2103: v += bs2;
2104: }
2105: }
2107: /* copy t into x according to permutation */
2108: for(i=0;i<n;i++){
2109: ii = bs*i; ir = bs*r[i];
2110: x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3];
2111: x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; x[ir+6] = t[ii+6];
2112: }
2114: ISRestoreIndices(isrow,&rout);
2115: ISRestoreIndices(iscol,&cout);
2116: VecRestoreArrayRead(bb,&b);
2117: VecRestoreArray(xx,&x);
2118: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
2119: return(0);
2120: }
2122: /* ----------------------------------------------------------- */
2125: PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2126: {
2127: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2128: IS iscol=a->col,isrow=a->row;
2129: PetscErrorCode ierr;
2130: const PetscInt *r,*c,*rout,*cout;
2131: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi;
2132: PetscInt i,nz;
2133: const PetscInt bs=A->rmap->bs,bs2=a->bs2;
2134: const MatScalar *aa=a->a,*v;
2135: PetscScalar *x,*s,*t,*ls;
2136: const PetscScalar *b;
2139: VecGetArrayRead(bb,&b);
2140: VecGetArray(xx,&x);
2141: t = a->solve_work;
2143: ISGetIndices(isrow,&rout); r = rout;
2144: ISGetIndices(iscol,&cout); c = cout + (n-1);
2146: /* forward solve the lower triangular */
2147: PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));
2148: for (i=1; i<n; i++) {
2149: v = aa + bs2*ai[i];
2150: vi = aj + ai[i];
2151: nz = a->diag[i] - ai[i];
2152: s = t + bs*i;
2153: PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));
2154: while (nz--) {
2155: PetscKernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
2156: v += bs2;
2157: }
2158: }
2159: /* backward solve the upper triangular */
2160: ls = a->solve_work + A->cmap->n;
2161: for (i=n-1; i>=0; i--){
2162: v = aa + bs2*(a->diag[i] + 1);
2163: vi = aj + a->diag[i] + 1;
2164: nz = ai[i+1] - a->diag[i] - 1;
2165: PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
2166: while (nz--) {
2167: PetscKernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
2168: v += bs2;
2169: }
2170: PetscKernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2171: PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));
2172: }
2174: ISRestoreIndices(isrow,&rout);
2175: ISRestoreIndices(iscol,&cout);
2176: VecRestoreArrayRead(bb,&b);
2177: VecRestoreArray(xx,&x);
2178: PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
2179: return(0);
2180: }
2182: /* ----------------------------------------------------------- */
2185: PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2186: {
2187: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2188: IS iscol=a->col,isrow=a->row;
2189: PetscErrorCode ierr;
2190: const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
2191: PetscInt i,nz,j;
2192: const PetscInt n=a->mbs,bs=A->rmap->bs,bs2=a->bs2;
2193: const MatScalar *aa=a->a,*v;
2194: PetscScalar *x,*t,*ls;
2195: const PetscScalar *b;
2197: VecGetArrayRead(bb,&b);
2198: VecGetArray(xx,&x);
2199: t = a->solve_work;
2201: ISGetIndices(isrow,&rout); r = rout;
2202: ISGetIndices(iscol,&cout); c = cout;
2204: /* copy the b into temp work space according to permutation */
2205: for (i=0; i<n; i++) {
2206: for (j=0; j<bs; j++) {
2207: t[i*bs+j] = b[c[i]*bs+j];
2208: }
2209: }
2212: /* forward solve the upper triangular transpose */
2213: ls = a->solve_work + A->cmap->n;
2214: for (i=0; i<n; i++){
2215: PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
2216: PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2217: v = aa + bs2*(a->diag[i] + 1);
2218: vi = aj + a->diag[i] + 1;
2219: nz = ai[i+1] - a->diag[i] - 1;
2220: while (nz--) {
2221: PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2222: v += bs2;
2223: }
2224: }
2226: /* backward solve the lower triangular transpose */
2227: for (i=n-1; i>=0; i--) {
2228: v = aa + bs2*ai[i];
2229: vi = aj + ai[i];
2230: nz = a->diag[i] - ai[i];
2231: while (nz--) {
2232: PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2233: v += bs2;
2234: }
2235: }
2237: /* copy t into x according to permutation */
2238: for (i=0; i<n; i++) {
2239: for (j=0; j<bs; j++) {
2240: x[bs*r[i]+j] = t[bs*i+j];
2241: }
2242: }
2244: ISRestoreIndices(isrow,&rout);
2245: ISRestoreIndices(iscol,&cout);
2246: VecRestoreArrayRead(bb,&b);
2247: VecRestoreArray(xx,&x);
2248: PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
2249: return(0);
2250: }
2254: PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
2255: {
2256: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2257: IS iscol=a->col,isrow=a->row;
2258: PetscErrorCode ierr;
2259: const PetscInt *r,*c,*rout,*cout;
2260: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag;
2261: PetscInt i,j,nz;
2262: const PetscInt bs=A->rmap->bs,bs2=a->bs2;
2263: const MatScalar *aa=a->a,*v;
2264: PetscScalar *x,*t,*ls;
2265: const PetscScalar *b;
2268: VecGetArrayRead(bb,&b);
2269: VecGetArray(xx,&x);
2270: t = a->solve_work;
2272: ISGetIndices(isrow,&rout); r = rout;
2273: ISGetIndices(iscol,&cout); c = cout;
2275: /* copy the b into temp work space according to permutation */
2276: for (i=0; i<n; i++) {
2277: for (j=0; j<bs; j++) {
2278: t[i*bs+j] = b[c[i]*bs+j];
2279: }
2280: }
2283: /* forward solve the upper triangular transpose */
2284: ls = a->solve_work + A->cmap->n;
2285: for (i=0; i<n; i++){
2286: PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
2287: PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs);
2288: v = aa + bs2*(diag[i] - 1);
2289: vi = aj + diag[i] - 1;
2290: nz = diag[i] - diag[i+1] - 1;
2291: for(j=0;j>-nz;j--){
2292: PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2293: v -= bs2;
2294: }
2295: }
2297: /* backward solve the lower triangular transpose */
2298: for (i=n-1; i>=0; i--) {
2299: v = aa + bs2*ai[i];
2300: vi = aj + ai[i];
2301: nz = ai[i+1] - ai[i];
2302: for(j=0;j<nz;j++){
2303: PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2304: v += bs2;
2305: }
2306: }
2308: /* copy t into x according to permutation */
2309: for (i=0; i<n; i++) {
2310: for (j=0; j<bs; j++) {
2311: x[bs*r[i]+j] = t[bs*i+j];
2312: }
2313: }
2315: ISRestoreIndices(isrow,&rout);
2316: ISRestoreIndices(iscol,&cout);
2317: VecRestoreArrayRead(bb,&b);
2318: VecRestoreArray(xx,&x);
2319: PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
2320: return(0);
2321: }
2323: /* bs = 15 for PFLOTRAN. Block operations are done by accessing all the columns of the block at once */
2327: PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx)
2328: {
2329: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2330: PetscErrorCode ierr;
2331: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
2332: PetscInt i,nz,idx,idt,m;
2333: const MatScalar *aa=a->a,*v;
2334: PetscScalar s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15;
2335: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
2336: PetscScalar *x;
2337: const PetscScalar *b;
2340: VecGetArrayRead(bb,&b);
2341: VecGetArray(xx,&x);
2343: /* forward solve the lower triangular */
2344: idx = 0;
2345: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx]; x[4] = b[4+idx];
2346: x[5] = b[5+idx]; x[6] = b[6+idx]; x[7] = b[7+idx]; x[8] = b[8+idx]; x[9] = b[9+idx];
2347: x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx];
2349: for (i=1; i<n; i++) {
2350: v = aa + bs2*ai[i];
2351: vi = aj + ai[i];
2352: nz = ai[i+1] - ai[i];
2353: idt = bs*i;
2354: s1 = b[idt]; s2 = b[1+idt]; s3 = b[2+idt]; s4 = b[3+idt]; s5 = b[4+idt];
2355: s6 = b[5+idt]; s7 = b[6+idt]; s8 = b[7+idt]; s9 = b[8+idt]; s10 = b[9+idt];
2356: s11 = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt];
2357: for(m=0;m<nz;m++){
2358: idx = bs*vi[m];
2359: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2360: x6 = x[5+idx]; x7 = x[6+idx]; x8 = x[7+idx]; x9 = x[8+idx]; x10 = x[9+idx];
2361: x11 = x[10+idx]; x12 = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
2363:
2364: s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2365: s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2366: s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2367: s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2368: s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2369: s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2370: s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2371: s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2372: s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2373: s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2374: s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2375: s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2376: s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2377: s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2378: s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
2379:
2380: v += bs2;
2381: }
2382: x[idt] = s1; x[1+idt] = s2; x[2+idt] = s3; x[3+idt] = s4; x[4+idt] = s5;
2383: x[5+idt] = s6; x[6+idt] = s7; x[7+idt] = s8; x[8+idt] = s9; x[9+idt] = s10;
2384: x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15;
2385:
2386: }
2387: /* backward solve the upper triangular */
2388: for (i=n-1; i>=0; i--){
2389: v = aa + bs2*(adiag[i+1]+1);
2390: vi = aj + adiag[i+1]+1;
2391: nz = adiag[i] - adiag[i+1] - 1;
2392: idt = bs*i;
2393: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt];
2394: s6 = x[5+idt]; s7 = x[6+idt]; s8 = x[7+idt]; s9 = x[8+idt]; s10 = x[9+idt];
2395: s11 = x[10+idt]; s12 = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt];
2396:
2397: for(m=0;m<nz;m++){
2398: idx = bs*vi[m];
2399: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2400: x6 = x[5+idx]; x7 = x[6+idx]; x8 = x[7+idx]; x9 = x[8+idx]; x10 = x[9+idx];
2401: x11 = x[10+idx]; x12 = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
2403: s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2404: s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2405: s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2406: s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2407: s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2408: s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2409: s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2410: s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2411: s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2412: s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2413: s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2414: s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2415: s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2416: s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2417: s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
2419: v += bs2;
2420: }
2422: x[idt] = v[0]*s1 + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7 + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15;
2423: x[1+idt] = v[1]*s1 + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7 + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15;
2424: x[2+idt] = v[2]*s1 + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7 + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15;
2425: x[3+idt] = v[3]*s1 + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7 + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15;
2426: x[4+idt] = v[4]*s1 + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7 + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15;
2427: x[5+idt] = v[5]*s1 + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7 + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15;
2428: x[6+idt] = v[6]*s1 + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7 + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15;
2429: x[7+idt] = v[7]*s1 + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7 + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15;
2430: x[8+idt] = v[8]*s1 + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7 + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15;
2431: x[9+idt] = v[9]*s1 + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7 + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15;
2432: x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15;
2433: x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15;
2434: x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15;
2435: x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15;
2436: x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15;
2438: }
2440: VecRestoreArrayRead(bb,&b);
2441: VecRestoreArray(xx,&x);
2442: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
2443: return(0);
2444: }
2446: /* bs = 15 for PFLOTRAN. Block operations are done by accessing one column at a time */
2447: /* Default MatSolve for block size 15 */
2451: PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx)
2452: {
2453: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2454: PetscErrorCode ierr;
2455: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
2456: PetscInt i,k,nz,idx,idt,m;
2457: const MatScalar *aa=a->a,*v;
2458: PetscScalar s[15];
2459: PetscScalar *x,xv;
2460: const PetscScalar *b;
2463: VecGetArrayRead(bb,&b);
2464: VecGetArray(xx,&x);
2466: /* forward solve the lower triangular */
2467: for (i=0; i<n; i++) {
2468: v = aa + bs2*ai[i];
2469: vi = aj + ai[i];
2470: nz = ai[i+1] - ai[i];
2471: idt = bs*i;
2472: x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt];
2473: x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt];
2474: x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; x[14+idt] = b[14+idt];
2475: for(m=0;m<nz;m++){
2476: idx = bs*vi[m];
2477: for(k=0;k<15;k++){
2478: xv = x[k + idx];
2479: x[idt] -= v[0]*xv;
2480: x[1+idt] -= v[1]*xv;
2481: x[2+idt] -= v[2]*xv;
2482: x[3+idt] -= v[3]*xv;
2483: x[4+idt] -= v[4]*xv;
2484: x[5+idt] -= v[5]*xv;
2485: x[6+idt] -= v[6]*xv;
2486: x[7+idt] -= v[7]*xv;
2487: x[8+idt] -= v[8]*xv;
2488: x[9+idt] -= v[9]*xv;
2489: x[10+idt] -= v[10]*xv;
2490: x[11+idt] -= v[11]*xv;
2491: x[12+idt] -= v[12]*xv;
2492: x[13+idt] -= v[13]*xv;
2493: x[14+idt] -= v[14]*xv;
2494: v += 15;
2495: }
2496: }
2497: }
2498: /* backward solve the upper triangular */
2499: for (i=n-1; i>=0; i--){
2500: v = aa + bs2*(adiag[i+1]+1);
2501: vi = aj + adiag[i+1]+1;
2502: nz = adiag[i] - adiag[i+1] - 1;
2503: idt = bs*i;
2504: s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt];
2505: s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt];
2506: s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt];
2507:
2508: for(m=0;m<nz;m++){
2509: idx = bs*vi[m];
2510: for(k=0;k<15;k++){
2511: xv = x[k + idx];
2512: s[0] -= v[0]*xv;
2513: s[1] -= v[1]*xv;
2514: s[2] -= v[2]*xv;
2515: s[3] -= v[3]*xv;
2516: s[4] -= v[4]*xv;
2517: s[5] -= v[5]*xv;
2518: s[6] -= v[6]*xv;
2519: s[7] -= v[7]*xv;
2520: s[8] -= v[8]*xv;
2521: s[9] -= v[9]*xv;
2522: s[10] -= v[10]*xv;
2523: s[11] -= v[11]*xv;
2524: s[12] -= v[12]*xv;
2525: s[13] -= v[13]*xv;
2526: s[14] -= v[14]*xv;
2527: v += 15;
2528: }
2529: }
2530: PetscMemzero(x+idt,bs*sizeof(MatScalar));
2531: for(k=0;k<15;k++){
2532: x[idt] += v[0]*s[k];
2533: x[1+idt] += v[1]*s[k];
2534: x[2+idt] += v[2]*s[k];
2535: x[3+idt] += v[3]*s[k];
2536: x[4+idt] += v[4]*s[k];
2537: x[5+idt] += v[5]*s[k];
2538: x[6+idt] += v[6]*s[k];
2539: x[7+idt] += v[7]*s[k];
2540: x[8+idt] += v[8]*s[k];
2541: x[9+idt] += v[9]*s[k];
2542: x[10+idt] += v[10]*s[k];
2543: x[11+idt] += v[11]*s[k];
2544: x[12+idt] += v[12]*s[k];
2545: x[13+idt] += v[13]*s[k];
2546: x[14+idt] += v[14]*s[k];
2547: v += 15;
2548: }
2549: }
2550: VecRestoreArrayRead(bb,&b);
2551: VecRestoreArray(xx,&x);
2552: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
2553: return(0);
2554: }
2559: PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
2560: {
2561: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2562: IS iscol=a->col,isrow=a->row;
2563: PetscErrorCode ierr;
2564: const PetscInt *r,*c,*ai=a->i,*aj=a->j;
2565: const PetscInt *rout,*cout,*diag = a->diag,*vi,n=a->mbs;
2566: PetscInt i,nz,idx,idt,idc;
2567: const MatScalar *aa=a->a,*v;
2568: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2569: const PetscScalar *b;
2572: VecGetArrayRead(bb,&b);
2573: VecGetArray(xx,&x);
2574: t = a->solve_work;
2576: ISGetIndices(isrow,&rout); r = rout;
2577: ISGetIndices(iscol,&cout); c = cout + (n-1);
2579: /* forward solve the lower triangular */
2580: idx = 7*(*r++);
2581: t[0] = b[idx]; t[1] = b[1+idx];
2582: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2583: t[5] = b[5+idx]; t[6] = b[6+idx];
2585: for (i=1; i<n; i++) {
2586: v = aa + 49*ai[i];
2587: vi = aj + ai[i];
2588: nz = diag[i] - ai[i];
2589: idx = 7*(*r++);
2590: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2591: s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2592: while (nz--) {
2593: idx = 7*(*vi++);
2594: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
2595: x4 = t[3+idx];x5 = t[4+idx];
2596: x6 = t[5+idx];x7 = t[6+idx];
2597: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2598: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2599: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2600: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2601: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2602: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2603: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2604: v += 49;
2605: }
2606: idx = 7*i;
2607: t[idx] = s1;t[1+idx] = s2;
2608: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2609: t[5+idx] = s6;t[6+idx] = s7;
2610: }
2611: /* backward solve the upper triangular */
2612: for (i=n-1; i>=0; i--){
2613: v = aa + 49*diag[i] + 49;
2614: vi = aj + diag[i] + 1;
2615: nz = ai[i+1] - diag[i] - 1;
2616: idt = 7*i;
2617: s1 = t[idt]; s2 = t[1+idt];
2618: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2619: s6 = t[5+idt];s7 = t[6+idt];
2620: while (nz--) {
2621: idx = 7*(*vi++);
2622: x1 = t[idx]; x2 = t[1+idx];
2623: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2624: x6 = t[5+idx]; x7 = t[6+idx];
2625: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2626: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2627: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2628: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2629: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2630: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2631: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2632: v += 49;
2633: }
2634: idc = 7*(*c--);
2635: v = aa + 49*diag[i];
2636: x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+
2637: v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2638: x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2639: v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2640: x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2641: v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2642: x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2643: v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2644: x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2645: v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2646: x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2647: v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2648: x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2649: v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2650: }
2652: ISRestoreIndices(isrow,&rout);
2653: ISRestoreIndices(iscol,&cout);
2654: VecRestoreArrayRead(bb,&b);
2655: VecRestoreArray(xx,&x);
2656: PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
2657: return(0);
2658: }
2662: PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
2663: {
2664: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2665: IS iscol=a->col,isrow=a->row;
2666: PetscErrorCode ierr;
2667: const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag;
2668: const PetscInt n=a->mbs,*rout,*cout,*vi;
2669: PetscInt i,nz,idx,idt,idc,m;
2670: const MatScalar *aa=a->a,*v;
2671: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2672: const PetscScalar *b;
2675: VecGetArrayRead(bb,&b);
2676: VecGetArray(xx,&x);
2677: t = a->solve_work;
2679: ISGetIndices(isrow,&rout); r = rout;
2680: ISGetIndices(iscol,&cout); c = cout;
2682: /* forward solve the lower triangular */
2683: idx = 7*r[0];
2684: t[0] = b[idx]; t[1] = b[1+idx];
2685: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2686: t[5] = b[5+idx]; t[6] = b[6+idx];
2688: for (i=1; i<n; i++) {
2689: v = aa + 49*ai[i];
2690: vi = aj + ai[i];
2691: nz = ai[i+1] - ai[i];
2692: idx = 7*r[i];
2693: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2694: s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2695: for(m=0;m<nz;m++){
2696: idx = 7*vi[m];
2697: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
2698: x4 = t[3+idx];x5 = t[4+idx];
2699: x6 = t[5+idx];x7 = t[6+idx];
2700: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2701: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2702: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2703: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2704: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2705: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2706: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2707: v += 49;
2708: }
2709: idx = 7*i;
2710: t[idx] = s1;t[1+idx] = s2;
2711: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2712: t[5+idx] = s6;t[6+idx] = s7;
2713: }
2714: /* backward solve the upper triangular */
2715: for (i=n-1; i>=0; i--){
2716: v = aa + 49*(adiag[i+1]+1);
2717: vi = aj + adiag[i+1]+1;
2718: nz = adiag[i] - adiag[i+1] - 1;
2719: idt = 7*i;
2720: s1 = t[idt]; s2 = t[1+idt];
2721: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2722: s6 = t[5+idt];s7 = t[6+idt];
2723: for(m=0;m<nz;m++){
2724: idx = 7*vi[m];
2725: x1 = t[idx]; x2 = t[1+idx];
2726: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2727: x6 = t[5+idx]; x7 = t[6+idx];
2728: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2729: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2730: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2731: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2732: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2733: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2734: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2735: v += 49;
2736: }
2737: idc = 7*c[i];
2738: x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+
2739: v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2740: x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2741: v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2742: x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2743: v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2744: x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2745: v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2746: x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2747: v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2748: x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2749: v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2750: x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2751: v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2752: }
2754: ISRestoreIndices(isrow,&rout);
2755: ISRestoreIndices(iscol,&cout);
2756: VecRestoreArrayRead(bb,&b);
2757: VecRestoreArray(xx,&x);
2758: PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
2759: return(0);
2760: }
/*
   MatSolve_SeqBAIJ_7_NaturalOrdering_inplace - Block forward substitution
   followed by block backward substitution for a factored SeqBAIJ matrix with
   7x7 blocks, using no row or column permutation.

   Input:
     A  - matrix; A->data (Mat_SeqBAIJ) supplies the block-row offsets a->i,
          block-column indices a->j, per-row positions a->diag and the
          49-entry blocks a->a; a->mbs is the number of block rows
     bb - right-hand side (read only)
   Output:
     xx - result; it also holds the intermediate forward-solve values

   A block is applied as sum over c of v[r+7*c]*x_c, i.e. its 49 entries are
   read column by column.

   Returns 0.
*/
2764: PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2765: {
2766: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2767: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2768: PetscErrorCode ierr;
2769: PetscInt i,nz,idx,idt,jdx;
2770: const MatScalar *aa=a->a,*v;
2771: PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2772: const PetscScalar *b;
2775: VecGetArrayRead(bb,&b);
2776: VecGetArray(xx,&x);
2777: /* forward solve the lower triangular */
/* block row 0 is copied from b unchanged */
2778: idx = 0;
2779: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx];
2780: x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
2781: x[6] = b[6+idx];
2782: for (i=1; i<n; i++) {
2783: v = aa + 49*ai[i];
2784: vi = aj + ai[i];
/* only the blocks at positions ai[i] .. diag[i]-1 of row i are used here */
2785: nz = diag[i] - ai[i];
2786: idx = 7*i;
2787: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2788: s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2789: s7 = b[6+idx];
2790: while (nz--) {
/* subtract (block) * (already computed x block at block column *vi) */
2791: jdx = 7*(*vi++);
2792: x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx];
2793: x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
2794: x7 = x[6+jdx];
2795: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2796: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2797: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2798: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2799: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2800: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2801: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2802: v += 49;
2803: }
2804: x[idx] = s1;
2805: x[1+idx] = s2;
2806: x[2+idx] = s3;
2807: x[3+idx] = s4;
2808: x[4+idx] = s5;
2809: x[5+idx] = s6;
2810: x[6+idx] = s7;
2811: }
2812: /* backward solve the upper triangular */
2813: for (i=n-1; i>=0; i--){
/* blocks at positions diag[i]+1 .. ai[i+1]-1 of row i */
2814: v = aa + 49*diag[i] + 49;
2815: vi = aj + diag[i] + 1;
2816: nz = ai[i+1] - diag[i] - 1;
2817: idt = 7*i;
2818: s1 = x[idt]; s2 = x[1+idt];
2819: s3 = x[2+idt]; s4 = x[3+idt];
2820: s5 = x[4+idt]; s6 = x[5+idt];
2821: s7 = x[6+idt];
2822: while (nz--) {
2823: idx = 7*(*vi++);
2824: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
2825: x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
2826: x7 = x[6+idx];
2827: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2828: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2829: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2830: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2831: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2832: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2833: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2834: v += 49;
2835: }
/* The block at position diag[i] is multiplied into s (no division is done);
   presumably the factorization stored the inverse of the diagonal block
   there -- confirm against the factorization routine. */
2836: v = aa + 49*diag[i];
2837: x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4
2838: + v[28]*s5 + v[35]*s6 + v[42]*s7;
2839: x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4
2840: + v[29]*s5 + v[36]*s6 + v[43]*s7;
2841: x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4
2842: + v[30]*s5 + v[37]*s6 + v[44]*s7;
2843: x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4
2844: + v[31]*s5 + v[38]*s6 + v[45]*s7;
2845: x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4
2846: + v[32]*s5 + v[39]*s6 + v[46]*s7;
2847: x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4
2848: + v[33]*s5 + v[40]*s6 + v[47]*s7;
2849: x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4
2850: + v[34]*s5 + v[41]*s6 + v[48]*s7;
2851: }
2853: VecRestoreArrayRead(bb,&b);
2854: VecRestoreArray(xx,&x);
/* 7x7 blocks: 49 entries per block and 7 rows per block row (this was logged
   with the block-size-6 constants 36 and 6) */
2855: PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
2856: return(0);
2857: }
/*
   MatSolve_SeqBAIJ_7_NaturalOrdering - Block forward substitution followed by
   block backward substitution for a factored SeqBAIJ matrix with 7x7 blocks,
   using no row or column permutation.

   Input:
     A  - matrix; A->data (Mat_SeqBAIJ) supplies a->i, a->j, a->diag (adiag),
          the blocks a->a, the block-row count a->mbs and a->bs2;
          A->rmap->bs is used as the stride between block rows of b and x
     bb - right-hand side (read only)
   Output:
     xx - result; it also holds the intermediate forward-solve values

   The strides bs and bs2 are read from the matrix, but the offsets inside a
   block (v[0] .. v[48]) are hard-coded for 7x7 blocks read column by column.

   Returns 0.
*/
2861: PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
2862: {
2863: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2864: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
2865: PetscErrorCode ierr;
2866: PetscInt i,k,nz,idx,jdx,idt;
2867: const PetscInt bs = A->rmap->bs,bs2 = a->bs2;
2868: const MatScalar *aa=a->a,*v;
2869: PetscScalar *x;
2870: const PetscScalar *b;
2871: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2874: VecGetArrayRead(bb,&b);
2875: VecGetArray(xx,&x);
2876: /* forward solve the lower triangular */
/* block row 0 is copied from b unchanged */
2877: idx = 0;
2878: x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2879: x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
2880: for (i=1; i<n; i++) {
2881: v = aa + bs2*ai[i];
2882: vi = aj + ai[i];
/* every block in ai[i] .. ai[i+1]-1 is used by the forward sweep */
2883: nz = ai[i+1] - ai[i];
2884: idx = bs*i;
2885: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2886: s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2887: for(k=0;k<nz;k++) {
2888: jdx = bs*vi[k];
2889: x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2890: x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
2891: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2892: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2893: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2894: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2895: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2896: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2897: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2898: v += bs2;
2899: }
2901: x[idx] = s1;
2902: x[1+idx] = s2;
2903: x[2+idx] = s3;
2904: x[3+idx] = s4;
2905: x[4+idx] = s5;
2906: x[5+idx] = s6;
2907: x[6+idx] = s7;
2908: }
2909:
2910: /* backward solve the upper triangular */
2911: for (i=n-1; i>=0; i--){
/* blocks at positions adiag[i+1]+1 .. adiag[i]-1; adiag[i+1] is read for
   i = n-1 as well, so adiag is indexed up to n */
2912: v = aa + bs2*(adiag[i+1]+1);
2913: vi = aj + adiag[i+1]+1;
2914: nz = adiag[i] - adiag[i+1]-1;
2915: idt = bs*i;
2916: s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2917: s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
2918: for(k=0;k<nz;k++) {
2919: idx = bs*vi[k];
2920: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2921: x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
2922: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2923: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2924: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2925: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2926: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2927: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2928: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2929: v += bs2;
2930: }
/* v was advanced nz blocks from adiag[i+1]+1, so it now points at the block
   at position adiag[i] */
2931: /* x = inv_diagonal*x */
2932: x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7;
2933: x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7;
2934: x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7;
2935: x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7;
2936: x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7;
2937: x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7;
2938: x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7;
2939: }
2941: VecRestoreArrayRead(bb,&b);
2942: VecRestoreArray(xx,&x);
2943: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
2944: return(0);
2945: }
/*
   MatSolve_SeqBAIJ_6_inplace - Block forward substitution followed by block
   backward substitution for a factored SeqBAIJ matrix with 6x6 blocks, with
   the right-hand side read through the row index set a->row and the result
   written through the column index set a->col.

   Input:
     A  - matrix; A->data (Mat_SeqBAIJ) supplies a->i, a->j, a->diag, the
          36-entry blocks a->a, the index sets a->row / a->col, the work
          array a->solve_work and the block-row count a->mbs
     bb - right-hand side (read only)
   Output:
     xx - result

   The work array t holds block i of the intermediate and final values at
   offset 6*i; blocks are read column by column (v[r+6*c]).

   Returns 0.
*/
2949: PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
2950: {
2951: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2952: IS iscol=a->col,isrow=a->row;
2953: PetscErrorCode ierr;
2954: const PetscInt *r,*c,*rout,*cout;
2955: const PetscInt *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2956: PetscInt i,nz,idx,idt,idc;
2957: const MatScalar *aa=a->a,*v;
2958: PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2959: const PetscScalar *b;
2962: VecGetArrayRead(bb,&b);
2963: VecGetArray(xx,&x);
2964: t = a->solve_work;
/* r is walked forward with *r++; c starts at the last entry and is walked
   backward with *c-- to match the descending backward sweep */
2966: ISGetIndices(isrow,&rout); r = rout;
2967: ISGetIndices(iscol,&cout); c = cout + (n-1);
2969: /* forward solve the lower triangular */
/* block 0 of t is block r[0] of b, copied unchanged */
2970: idx = 6*(*r++);
2971: t[0] = b[idx]; t[1] = b[1+idx];
2972: t[2] = b[2+idx]; t[3] = b[3+idx];
2973: t[4] = b[4+idx]; t[5] = b[5+idx];
2974: for (i=1; i<n; i++) {
2975: v = aa + 36*ai[i];
2976: vi = aj + ai[i];
/* only the blocks at positions ai[i] .. diag[i]-1 of row i are used here */
2977: nz = diag[i] - ai[i];
2978: idx = 6*(*r++);
2979: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2980: s5 = b[4+idx]; s6 = b[5+idx];
2981: while (nz--) {
2982: idx = 6*(*vi++);
2983: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2984: x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
2985: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2986: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2987: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2988: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2989: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2990: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2991: v += 36;
2992: }
2993: idx = 6*i;
2994: t[idx] = s1;t[1+idx] = s2;
2995: t[2+idx] = s3;t[3+idx] = s4;
2996: t[4+idx] = s5;t[5+idx] = s6;
2997: }
2998: /* backward solve the upper triangular */
2999: for (i=n-1; i>=0; i--){
/* blocks at positions diag[i]+1 .. ai[i+1]-1 of row i */
3000: v = aa + 36*diag[i] + 36;
3001: vi = aj + diag[i] + 1;
3002: nz = ai[i+1] - diag[i] - 1;
3003: idt = 6*i;
3004: s1 = t[idt]; s2 = t[1+idt];
3005: s3 = t[2+idt];s4 = t[3+idt];
3006: s5 = t[4+idt];s6 = t[5+idt];
3007: while (nz--) {
3008: idx = 6*(*vi++);
3009: x1 = t[idx]; x2 = t[1+idx];
3010: x3 = t[2+idx]; x4 = t[3+idx];
3011: x5 = t[4+idx]; x6 = t[5+idx];
3012: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3013: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3014: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3015: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3016: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3017: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3018: v += 36;
3019: }
/* The block at position diag[i] is multiplied into s (no division is done);
   presumably it holds the inverse of the diagonal block -- confirm.  The
   product is stored both in t (block i) and in x (block c[i]). */
3020: idc = 6*(*c--);
3021: v = aa + 36*diag[i];
3022: x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+
3023: v[18]*s4+v[24]*s5+v[30]*s6;
3024: x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
3025: v[19]*s4+v[25]*s5+v[31]*s6;
3026: x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
3027: v[20]*s4+v[26]*s5+v[32]*s6;
3028: x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
3029: v[21]*s4+v[27]*s5+v[33]*s6;
3030: x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
3031: v[22]*s4+v[28]*s5+v[34]*s6;
3032: x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
3033: v[23]*s4+v[29]*s5+v[35]*s6;
3034: }
3036: ISRestoreIndices(isrow,&rout);
3037: ISRestoreIndices(iscol,&cout);
3038: VecRestoreArrayRead(bb,&b);
3039: VecRestoreArray(xx,&x);
3040: PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
3041: return(0);
3042: }
/*
   MatSolve_SeqBAIJ_6 - Block forward substitution followed by block backward
   substitution for a factored SeqBAIJ matrix with 6x6 blocks, with the
   right-hand side read through the row index set a->row and the result
   written through the column index set a->col.

   Input:
     A  - matrix; A->data (Mat_SeqBAIJ) supplies a->i, a->j, a->diag (adiag),
          the 36-entry blocks a->a, the index sets a->row / a->col, the work
          array a->solve_work and the block-row count a->mbs
     bb - right-hand side (read only)
   Output:
     xx - result

   The forward sweep uses every block in ai[i] .. ai[i+1]-1; the backward
   sweep locates its blocks through adiag[i+1] and adiag[i].  Blocks are read
   column by column (v[r+6*c]).

   Returns 0.
*/
3046: PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
3047: {
3048: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
3049: IS iscol=a->col,isrow=a->row;
3050: PetscErrorCode ierr;
3051: const PetscInt *r,*c,*rout,*cout;
3052: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3053: PetscInt i,nz,idx,idt,idc,m;
3054: const MatScalar *aa=a->a,*v;
3055: PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
3056: const PetscScalar *b;
3059: VecGetArrayRead(bb,&b);
3060: VecGetArray(xx,&x);
3061: t = a->solve_work;
3063: ISGetIndices(isrow,&rout); r = rout;
3064: ISGetIndices(iscol,&cout); c = cout;
3066: /* forward solve the lower triangular */
/* block 0 of t is block r[0] of b, copied unchanged */
3067: idx = 6*r[0];
3068: t[0] = b[idx]; t[1] = b[1+idx];
3069: t[2] = b[2+idx]; t[3] = b[3+idx];
3070: t[4] = b[4+idx]; t[5] = b[5+idx];
3071: for (i=1; i<n; i++) {
3072: v = aa + 36*ai[i];
3073: vi = aj + ai[i];
3074: nz = ai[i+1] - ai[i];
/* start from block r[i] of b */
3075: idx = 6*r[i];
3076: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3077: s5 = b[4+idx]; s6 = b[5+idx];
3078: for(m=0;m<nz;m++){
3079: idx = 6*vi[m];
3080: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3081: x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
3082: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3083: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3084: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3085: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3086: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3087: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3088: v += 36;
3089: }
3090: idx = 6*i;
3091: t[idx] = s1;t[1+idx] = s2;
3092: t[2+idx] = s3;t[3+idx] = s4;
3093: t[4+idx] = s5;t[5+idx] = s6;
3094: }
3095: /* backward solve the upper triangular */
3096: for (i=n-1; i>=0; i--){
/* blocks at positions adiag[i+1]+1 .. adiag[i]-1; adiag[i+1] is read for
   i = n-1 as well, so adiag is indexed up to n */
3097: v = aa + 36*(adiag[i+1]+1);
3098: vi = aj + adiag[i+1]+1;
3099: nz = adiag[i] - adiag[i+1] - 1;
3100: idt = 6*i;
3101: s1 = t[idt]; s2 = t[1+idt];
3102: s3 = t[2+idt];s4 = t[3+idt];
3103: s5 = t[4+idt];s6 = t[5+idt];
3104: for(m=0;m<nz;m++){
3105: idx = 6*vi[m];
3106: x1 = t[idx]; x2 = t[1+idx];
3107: x3 = t[2+idx]; x4 = t[3+idx];
3108: x5 = t[4+idx]; x6 = t[5+idx];
3109: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3110: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3111: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3112: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3113: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3114: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3115: v += 36;
3116: }
/* v now points at the block at position adiag[i]; it is multiplied into s
   (presumably the stored inverse of the diagonal block -- confirm).  The
   product goes to block i of t and block c[i] of x. */
3117: idc = 6*c[i];
3118: x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+
3119: v[18]*s4+v[24]*s5+v[30]*s6;
3120: x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
3121: v[19]*s4+v[25]*s5+v[31]*s6;
3122: x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
3123: v[20]*s4+v[26]*s5+v[32]*s6;
3124: x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
3125: v[21]*s4+v[27]*s5+v[33]*s6;
3126: x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
3127: v[22]*s4+v[28]*s5+v[34]*s6;
3128: x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
3129: v[23]*s4+v[29]*s5+v[35]*s6;
3130: }
3132: ISRestoreIndices(isrow,&rout);
3133: ISRestoreIndices(iscol,&cout);
3134: VecRestoreArrayRead(bb,&b);
3135: VecRestoreArray(xx,&x);
3136: PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
3137: return(0);
3138: }
/*
   MatSolve_SeqBAIJ_6_NaturalOrdering_inplace - Block forward substitution
   followed by block backward substitution for a factored SeqBAIJ matrix with
   6x6 blocks, using no row or column permutation.

   Input:
     A  - matrix; A->data (Mat_SeqBAIJ) supplies a->i, a->j, a->diag, the
          36-entry blocks a->a and the block-row count a->mbs
     bb - right-hand side (read only)
   Output:
     xx - result; it also holds the intermediate forward-solve values

   Blocks are read column by column (v[r+6*c]).

   Returns 0.
*/
3142: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3143: {
3144: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3145: PetscInt i,nz,idx,idt,jdx;
3146: PetscErrorCode ierr;
3147: const PetscInt *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
3148: const MatScalar *aa=a->a,*v;
3149: PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3150: const PetscScalar *b;
3153: VecGetArrayRead(bb,&b);
3154: VecGetArray(xx,&x);
3155: /* forward solve the lower triangular */
/* block row 0 is copied from b unchanged */
3156: idx = 0;
3157: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx];
3158: x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
3159: for (i=1; i<n; i++) {
3160: v = aa + 36*ai[i];
3161: vi = aj + ai[i];
/* only the blocks at positions ai[i] .. diag[i]-1 of row i are used here */
3162: nz = diag[i] - ai[i];
3163: idx = 6*i;
3164: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
3165: s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
3166: while (nz--) {
3167: jdx = 6*(*vi++);
3168: x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx];
3169: x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
3170: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3171: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3172: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3173: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3174: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3175: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3176: v += 36;
3177: }
3178: x[idx] = s1;
3179: x[1+idx] = s2;
3180: x[2+idx] = s3;
3181: x[3+idx] = s4;
3182: x[4+idx] = s5;
3183: x[5+idx] = s6;
3184: }
3185: /* backward solve the upper triangular */
3186: for (i=n-1; i>=0; i--){
/* blocks at positions diag[i]+1 .. ai[i+1]-1 of row i */
3187: v = aa + 36*diag[i] + 36;
3188: vi = aj + diag[i] + 1;
3189: nz = ai[i+1] - diag[i] - 1;
3190: idt = 6*i;
3191: s1 = x[idt]; s2 = x[1+idt];
3192: s3 = x[2+idt]; s4 = x[3+idt];
3193: s5 = x[4+idt]; s6 = x[5+idt];
3194: while (nz--) {
3195: idx = 6*(*vi++);
3196: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
3197: x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
3198: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3199: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3200: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3201: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3202: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3203: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3204: v += 36;
3205: }
/* The block at position diag[i] is multiplied into s (no division is done);
   presumably it holds the inverse of the diagonal block -- confirm. */
3206: v = aa + 36*diag[i];
3207: x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3208: x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3209: x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3210: x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3211: x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3212: x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3213: }
3215: VecRestoreArrayRead(bb,&b);
3216: VecRestoreArray(xx,&x);
3217: PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
3218: return(0);
3219: }
/*
   MatSolve_SeqBAIJ_6_NaturalOrdering - Block forward substitution followed by
   block backward substitution for a factored SeqBAIJ matrix with 6x6 blocks,
   using no row or column permutation.

   Input:
     A  - matrix; A->data (Mat_SeqBAIJ) supplies a->i, a->j, a->diag (adiag),
          the blocks a->a, the block-row count a->mbs and a->bs2;
          A->rmap->bs is used as the stride between block rows of b and x
     bb - right-hand side (read only)
   Output:
     xx - result; it also holds the intermediate forward-solve values

   The strides bs and bs2 are read from the matrix, but the offsets inside a
   block (v[0] .. v[35]) are hard-coded for 6x6 blocks read column by column.

   Returns 0.
*/
3223: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
3224: {
3225: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3226: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3227: PetscErrorCode ierr;
3228: PetscInt i,k,nz,idx,jdx,idt;
3229: const PetscInt bs = A->rmap->bs,bs2 = a->bs2;
3230: const MatScalar *aa=a->a,*v;
3231: PetscScalar *x;
3232: const PetscScalar *b;
3233: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3236: VecGetArrayRead(bb,&b);
3237: VecGetArray(xx,&x);
3238: /* forward solve the lower triangular */
/* block row 0 is copied from b unchanged */
3239: idx = 0;
3240: x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3241: x[4] = b[4+idx];x[5] = b[5+idx];
3242: for (i=1; i<n; i++) {
3243: v = aa + bs2*ai[i];
3244: vi = aj + ai[i];
/* every block in ai[i] .. ai[i+1]-1 is used by the forward sweep */
3245: nz = ai[i+1] - ai[i];
3246: idx = bs*i;
3247: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3248: s5 = b[4+idx];s6 = b[5+idx];
3249: for(k=0;k<nz;k++){
3250: jdx = bs*vi[k];
3251: x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3252: x5 = x[4+jdx]; x6 = x[5+jdx];
3253: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3254: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;;
3255: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3256: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3257: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3258: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3259: v += bs2;
3260: }
3262: x[idx] = s1;
3263: x[1+idx] = s2;
3264: x[2+idx] = s3;
3265: x[3+idx] = s4;
3266: x[4+idx] = s5;
3267: x[5+idx] = s6;
3268: }
3269:
3270: /* backward solve the upper triangular */
3271: for (i=n-1; i>=0; i--){
/* blocks at positions adiag[i+1]+1 .. adiag[i]-1; adiag[i+1] is read for
   i = n-1 as well, so adiag is indexed up to n */
3272: v = aa + bs2*(adiag[i+1]+1);
3273: vi = aj + adiag[i+1]+1;
3274: nz = adiag[i] - adiag[i+1]-1;
3275: idt = bs*i;
3276: s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3277: s5 = x[4+idt];s6 = x[5+idt];
3278: for(k=0;k<nz;k++){
3279: idx = bs*vi[k];
3280: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3281: x5 = x[4+idx];x6 = x[5+idx];
3282: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3283: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;;
3284: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3285: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3286: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3287: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3288: v += bs2;
3289: }
/* v was advanced nz blocks from adiag[i+1]+1, so it now points at the block
   at position adiag[i] */
3290: /* x = inv_diagonal*x */
3291: x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3292: x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3293: x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3294: x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3295: x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3296: x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3297: }
3299: VecRestoreArrayRead(bb,&b);
3300: VecRestoreArray(xx,&x);
3301: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
3302: return(0);
3303: }
/*
   MatSolve_SeqBAIJ_5_inplace - Block forward substitution followed by block
   backward substitution for a factored SeqBAIJ matrix with 5x5 blocks, with
   the right-hand side read through the row index set a->row and the result
   written through the column index set a->col.

   Input:
     A  - matrix; A->data (Mat_SeqBAIJ) supplies a->i, a->j, a->diag, the
          25-entry blocks a->a, the index sets a->row / a->col, the work
          array a->solve_work and the block-row count a->mbs
     bb - right-hand side (read only)
   Output:
     xx - result

   The work array t holds block i of the intermediate and final values at
   offset 5*i; blocks are read column by column (v[r+5*c]).

   Returns 0.
*/
3307: PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
3308: {
3309: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
3310: IS iscol=a->col,isrow=a->row;
3311: PetscErrorCode ierr;
3312: const PetscInt *r,*c,*rout,*cout,*diag = a->diag;
3313: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3314: PetscInt i,nz,idx,idt,idc;
3315: const MatScalar *aa=a->a,*v;
3316: PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3317: const PetscScalar *b;
3320: VecGetArrayRead(bb,&b);
3321: VecGetArray(xx,&x);
3322: t = a->solve_work;
/* r is walked forward with *r++; c starts at the last entry and is walked
   backward with *c-- to match the descending backward sweep */
3324: ISGetIndices(isrow,&rout); r = rout;
3325: ISGetIndices(iscol,&cout); c = cout + (n-1);
3327: /* forward solve the lower triangular */
/* block 0 of t is block r[0] of b, copied unchanged */
3328: idx = 5*(*r++);
3329: t[0] = b[idx]; t[1] = b[1+idx];
3330: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3331: for (i=1; i<n; i++) {
3332: v = aa + 25*ai[i];
3333: vi = aj + ai[i];
/* only the blocks at positions ai[i] .. diag[i]-1 of row i are used here */
3334: nz = diag[i] - ai[i];
3335: idx = 5*(*r++);
3336: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3337: s5 = b[4+idx];
3338: while (nz--) {
3339: idx = 5*(*vi++);
3340: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
3341: x4 = t[3+idx];x5 = t[4+idx];
3342: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3343: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3344: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3345: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3346: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3347: v += 25;
3348: }
3349: idx = 5*i;
3350: t[idx] = s1;t[1+idx] = s2;
3351: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3352: }
3353: /* backward solve the upper triangular */
3354: for (i=n-1; i>=0; i--){
/* blocks at positions diag[i]+1 .. ai[i+1]-1 of row i */
3355: v = aa + 25*diag[i] + 25;
3356: vi = aj + diag[i] + 1;
3357: nz = ai[i+1] - diag[i] - 1;
3358: idt = 5*i;
3359: s1 = t[idt]; s2 = t[1+idt];
3360: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3361: while (nz--) {
3362: idx = 5*(*vi++);
3363: x1 = t[idx]; x2 = t[1+idx];
3364: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3365: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3366: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3367: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3368: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3369: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3370: v += 25;
3371: }
/* The block at position diag[i] is multiplied into s (no division is done);
   presumably it holds the inverse of the diagonal block -- confirm.  The
   product is stored both in t (block i) and in x (block c[i]). */
3372: idc = 5*(*c--);
3373: v = aa + 25*diag[i];
3374: x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+
3375: v[15]*s4+v[20]*s5;
3376: x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3377: v[16]*s4+v[21]*s5;
3378: x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3379: v[17]*s4+v[22]*s5;
3380: x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3381: v[18]*s4+v[23]*s5;
3382: x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3383: v[19]*s4+v[24]*s5;
3384: }
3386: ISRestoreIndices(isrow,&rout);
3387: ISRestoreIndices(iscol,&cout);
3388: VecRestoreArrayRead(bb,&b);
3389: VecRestoreArray(xx,&x);
3390: PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
3391: return(0);
3392: }
/*
   MatSolve_SeqBAIJ_5 - Block forward substitution followed by block backward
   substitution for a factored SeqBAIJ matrix with 5x5 blocks, with the
   right-hand side read through the row index set a->row and the result
   written through the column index set a->col.

   Input:
     A  - matrix; A->data (Mat_SeqBAIJ) supplies a->i, a->j, a->diag (adiag),
          the 25-entry blocks a->a, the index sets a->row / a->col, the work
          array a->solve_work and the block-row count a->mbs
     bb - right-hand side (read only)
   Output:
     xx - result

   The forward sweep uses every block in ai[i] .. ai[i+1]-1; the backward
   sweep locates its blocks through adiag[i+1] and adiag[i].  Blocks are read
   column by column (v[r+5*c]).

   Returns 0.
*/
3396: PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
3397: {
3398: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
3399: IS iscol=a->col,isrow=a->row;
3400: PetscErrorCode ierr;
3401: const PetscInt *r,*c,*rout,*cout;
3402: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3403: PetscInt i,nz,idx,idt,idc,m;
3404: const MatScalar *aa=a->a,*v;
3405: PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3406: const PetscScalar *b;
3409: VecGetArrayRead(bb,&b);
3410: VecGetArray(xx,&x);
3411: t = a->solve_work;
3413: ISGetIndices(isrow,&rout); r = rout;
3414: ISGetIndices(iscol,&cout); c = cout;
3416: /* forward solve the lower triangular */
/* block 0 of t is block r[0] of b, copied unchanged */
3417: idx = 5*r[0];
3418: t[0] = b[idx]; t[1] = b[1+idx];
3419: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3420: for (i=1; i<n; i++) {
3421: v = aa + 25*ai[i];
3422: vi = aj + ai[i];
3423: nz = ai[i+1] - ai[i];
/* start from block r[i] of b */
3424: idx = 5*r[i];
3425: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3426: s5 = b[4+idx];
3427: for(m=0;m<nz;m++){
3428: idx = 5*vi[m];
3429: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
3430: x4 = t[3+idx];x5 = t[4+idx];
3431: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3432: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3433: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3434: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3435: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3436: v += 25;
3437: }
3438: idx = 5*i;
3439: t[idx] = s1;t[1+idx] = s2;
3440: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3441: }
3442: /* backward solve the upper triangular */
3443: for (i=n-1; i>=0; i--){
/* blocks at positions adiag[i+1]+1 .. adiag[i]-1; adiag[i+1] is read for
   i = n-1 as well, so adiag is indexed up to n */
3444: v = aa + 25*(adiag[i+1]+1);
3445: vi = aj + adiag[i+1]+1;
3446: nz = adiag[i] - adiag[i+1] - 1;
3447: idt = 5*i;
3448: s1 = t[idt]; s2 = t[1+idt];
3449: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3450: for(m=0;m<nz;m++){
3451: idx = 5*vi[m];
3452: x1 = t[idx]; x2 = t[1+idx];
3453: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3454: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3455: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3456: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3457: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3458: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3459: v += 25;
3460: }
/* v now points at the block at position adiag[i]; it is multiplied into s
   (presumably the stored inverse of the diagonal block -- confirm).  The
   product goes to block i of t and block c[i] of x. */
3461: idc = 5*c[i];
3462: x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+
3463: v[15]*s4+v[20]*s5;
3464: x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3465: v[16]*s4+v[21]*s5;
3466: x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3467: v[17]*s4+v[22]*s5;
3468: x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3469: v[18]*s4+v[23]*s5;
3470: x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3471: v[19]*s4+v[24]*s5;
3472: }
3474: ISRestoreIndices(isrow,&rout);
3475: ISRestoreIndices(iscol,&cout);
3476: VecRestoreArrayRead(bb,&b);
3477: VecRestoreArray(xx,&x);
3478: PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
3479: return(0);
3480: }
/*
   MatSolve_SeqBAIJ_5_NaturalOrdering_inplace - Block forward substitution
   followed by block backward substitution for a factored SeqBAIJ matrix with
   5x5 blocks, using no row or column permutation.

   Input:
     A  - matrix; A->data (Mat_SeqBAIJ) supplies a->i, a->j, a->diag, the
          25-entry blocks a->a and the block-row count a->mbs
     bb - right-hand side (read only)
   Output:
     xx - result; it also holds the intermediate forward-solve values

   Blocks are read column by column (v[r+5*c]).

   Returns 0.
*/
3484: PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3485: {
3486: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3487: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3488: PetscInt i,nz,idx,idt,jdx;
3489: PetscErrorCode ierr;
3490: const MatScalar *aa=a->a,*v;
3491: PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3492: const PetscScalar *b;
3495: VecGetArrayRead(bb,&b);
3496: VecGetArray(xx,&x);
3497: /* forward solve the lower triangular */
/* block row 0 is copied from b unchanged */
3498: idx = 0;
3499: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3500: for (i=1; i<n; i++) {
3501: v = aa + 25*ai[i];
3502: vi = aj + ai[i];
/* only the blocks at positions ai[i] .. diag[i]-1 of row i are used here */
3503: nz = diag[i] - ai[i];
3504: idx = 5*i;
3505: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3506: while (nz--) {
3507: jdx = 5*(*vi++);
3508: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3509: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3510: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3511: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3512: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3513: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3514: v += 25;
3515: }
3516: x[idx] = s1;
3517: x[1+idx] = s2;
3518: x[2+idx] = s3;
3519: x[3+idx] = s4;
3520: x[4+idx] = s5;
3521: }
3522: /* backward solve the upper triangular */
3523: for (i=n-1; i>=0; i--){
/* blocks at positions diag[i]+1 .. ai[i+1]-1 of row i */
3524: v = aa + 25*diag[i] + 25;
3525: vi = aj + diag[i] + 1;
3526: nz = ai[i+1] - diag[i] - 1;
3527: idt = 5*i;
3528: s1 = x[idt]; s2 = x[1+idt];
3529: s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3530: while (nz--) {
3531: idx = 5*(*vi++);
3532: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3533: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3534: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3535: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3536: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3537: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3538: v += 25;
3539: }
/* The block at position diag[i] is multiplied into s (no division is done);
   presumably it holds the inverse of the diagonal block -- confirm. */
3540: v = aa + 25*diag[i];
3541: x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5;
3542: x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5;
3543: x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5;
3544: x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5;
3545: x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5;
3546: }
3548: VecRestoreArrayRead(bb,&b);
3549: VecRestoreArray(xx,&x);
3550: PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
3551: return(0);
3552: }
/*
   MatSolve_SeqBAIJ_5_NaturalOrdering - Block forward substitution followed by
   block backward substitution for a factored SeqBAIJ matrix with 5x5 blocks,
   using no row or column permutation.

   Input:
     A  - matrix; A->data (Mat_SeqBAIJ) supplies a->i, a->j, a->diag (adiag),
          the 25-entry blocks a->a and the block-row count a->mbs
     bb - right-hand side (read only)
   Output:
     xx - result; it also holds the intermediate forward-solve values

   The forward sweep uses every block in ai[i] .. ai[i+1]-1; the backward
   sweep locates its blocks through adiag[i+1] and adiag[i].  Blocks are read
   column by column (v[r+5*c]).

   Returns 0.
*/
3556: PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
3557: {
3558: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3559: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3560: PetscInt i,k,nz,idx,idt,jdx;
3561: PetscErrorCode ierr;
3562: const MatScalar *aa=a->a,*v;
3563: PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3564: const PetscScalar *b;
3567: VecGetArrayRead(bb,&b);
3568: VecGetArray(xx,&x);
3569: /* forward solve the lower triangular */
/* block row 0 is copied from b unchanged */
3570: idx = 0;
3571: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3572: for (i=1; i<n; i++) {
3573: v = aa + 25*ai[i];
3574: vi = aj + ai[i];
/* every block in ai[i] .. ai[i+1]-1 is used by the forward sweep */
3575: nz = ai[i+1] - ai[i];
3576: idx = 5*i;
3577: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3578: for(k=0;k<nz;k++) {
3579: jdx = 5*vi[k];
3580: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3581: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3582: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3583: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3584: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3585: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3586: v += 25;
3587: }
3588: x[idx] = s1;
3589: x[1+idx] = s2;
3590: x[2+idx] = s3;
3591: x[3+idx] = s4;
3592: x[4+idx] = s5;
3593: }
3595: /* backward solve the upper triangular */
3596: for (i=n-1; i>=0; i--){
/* blocks at positions adiag[i+1]+1 .. adiag[i]-1; adiag[i+1] is read for
   i = n-1 as well, so adiag is indexed up to n */
3597: v = aa + 25*(adiag[i+1]+1);
3598: vi = aj + adiag[i+1]+1;
3599: nz = adiag[i] - adiag[i+1]-1;
3600: idt = 5*i;
3601: s1 = x[idt]; s2 = x[1+idt];
3602: s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3603: for(k=0;k<nz;k++){
3604: idx = 5*vi[k];
3605: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3606: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3607: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3608: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3609: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3610: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3611: v += 25;
3612: }
/* v was advanced nz blocks from adiag[i+1]+1, so it now points at the block
   at position adiag[i] */
3613: /* x = inv_diagonal*x */
3614: x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5;
3615: x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5;
3616: x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5;
3617: x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5;
3618: x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5;
3619: }
3621: VecRestoreArrayRead(bb,&b);
3622: VecRestoreArray(xx,&x);
3623: PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
3624: return(0);
3625: }
3629: PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
3630: {
3631: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3632: IS iscol=a->col,isrow=a->row;
3633: PetscErrorCode ierr;
3634: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3635: PetscInt i,nz,idx,idt,idc;
3636: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3637: const MatScalar *aa=a->a,*v;
3638: PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3639: const PetscScalar *b;
3642: VecGetArrayRead(bb,&b);
3643: VecGetArray(xx,&x);
3644: t = a->solve_work;
3646: ISGetIndices(isrow,&rout); r = rout;
3647: ISGetIndices(iscol,&cout); c = cout + (n-1);
3649: /* forward solve the lower triangular */
3650: idx = 4*(*r++);
3651: t[0] = b[idx]; t[1] = b[1+idx];
3652: t[2] = b[2+idx]; t[3] = b[3+idx];
3653: for (i=1; i<n; i++) {
3654: v = aa + 16*ai[i];
3655: vi = aj + ai[i];
3656: nz = diag[i] - ai[i];
3657: idx = 4*(*r++);
3658: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3659: while (nz--) {
3660: idx = 4*(*vi++);
3661: x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3662: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3663: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3664: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3665: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3666: v += 16;
3667: }
3668: idx = 4*i;
3669: t[idx] = s1;t[1+idx] = s2;
3670: t[2+idx] = s3;t[3+idx] = s4;
3671: }
3672: /* backward solve the upper triangular */
3673: for (i=n-1; i>=0; i--){
3674: v = aa + 16*diag[i] + 16;
3675: vi = aj + diag[i] + 1;
3676: nz = ai[i+1] - diag[i] - 1;
3677: idt = 4*i;
3678: s1 = t[idt]; s2 = t[1+idt];
3679: s3 = t[2+idt];s4 = t[3+idt];
3680: while (nz--) {
3681: idx = 4*(*vi++);
3682: x1 = t[idx]; x2 = t[1+idx];
3683: x3 = t[2+idx]; x4 = t[3+idx];
3684: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3685: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3686: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3687: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3688: v += 16;
3689: }
3690: idc = 4*(*c--);
3691: v = aa + 16*diag[i];
3692: x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3693: x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3694: x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3695: x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3696: }
3698: ISRestoreIndices(isrow,&rout);
3699: ISRestoreIndices(iscol,&cout);
3700: VecRestoreArrayRead(bb,&b);
3701: VecRestoreArray(xx,&x);
3702: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
3703: return(0);
3704: }
3708: PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
3709: {
3710: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3711: IS iscol=a->col,isrow=a->row;
3712: PetscErrorCode ierr;
3713: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3714: PetscInt i,nz,idx,idt,idc,m;
3715: const PetscInt *r,*c,*rout,*cout;
3716: const MatScalar *aa=a->a,*v;
3717: PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3718: const PetscScalar *b;
3721: VecGetArrayRead(bb,&b);
3722: VecGetArray(xx,&x);
3723: t = a->solve_work;
3725: ISGetIndices(isrow,&rout); r = rout;
3726: ISGetIndices(iscol,&cout); c = cout;
3728: /* forward solve the lower triangular */
3729: idx = 4*r[0];
3730: t[0] = b[idx]; t[1] = b[1+idx];
3731: t[2] = b[2+idx]; t[3] = b[3+idx];
3732: for (i=1; i<n; i++) {
3733: v = aa + 16*ai[i];
3734: vi = aj + ai[i];
3735: nz = ai[i+1] - ai[i];
3736: idx = 4*r[i];
3737: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3738: for(m=0;m<nz;m++){
3739: idx = 4*vi[m];
3740: x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3741: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3742: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3743: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3744: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3745: v += 16;
3746: }
3747: idx = 4*i;
3748: t[idx] = s1;t[1+idx] = s2;
3749: t[2+idx] = s3;t[3+idx] = s4;
3750: }
3751: /* backward solve the upper triangular */
3752: for (i=n-1; i>=0; i--){
3753: v = aa + 16*(adiag[i+1]+1);
3754: vi = aj + adiag[i+1]+1;
3755: nz = adiag[i] - adiag[i+1] - 1;
3756: idt = 4*i;
3757: s1 = t[idt]; s2 = t[1+idt];
3758: s3 = t[2+idt];s4 = t[3+idt];
3759: for(m=0;m<nz;m++){
3760: idx = 4*vi[m];
3761: x1 = t[idx]; x2 = t[1+idx];
3762: x3 = t[2+idx]; x4 = t[3+idx];
3763: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3764: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3765: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3766: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3767: v += 16;
3768: }
3769: idc = 4*c[i];
3770: x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3771: x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3772: x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3773: x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3774: }
3776: ISRestoreIndices(isrow,&rout);
3777: ISRestoreIndices(iscol,&cout);
3778: VecRestoreArrayRead(bb,&b);
3779: VecRestoreArray(xx,&x);
3780: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
3781: return(0);
3782: }
3786: PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
3787: {
3788: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3789: IS iscol=a->col,isrow=a->row;
3790: PetscErrorCode ierr;
3791: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3792: PetscInt i,nz,idx,idt,idc;
3793: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3794: const MatScalar *aa=a->a,*v;
3795: MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t;
3796: PetscScalar *x;
3797: const PetscScalar *b;
3800: VecGetArrayRead(bb,&b);
3801: VecGetArray(xx,&x);
3802: t = (MatScalar *)a->solve_work;
3804: ISGetIndices(isrow,&rout); r = rout;
3805: ISGetIndices(iscol,&cout); c = cout + (n-1);
3807: /* forward solve the lower triangular */
3808: idx = 4*(*r++);
3809: t[0] = (MatScalar)b[idx];
3810: t[1] = (MatScalar)b[1+idx];
3811: t[2] = (MatScalar)b[2+idx];
3812: t[3] = (MatScalar)b[3+idx];
3813: for (i=1; i<n; i++) {
3814: v = aa + 16*ai[i];
3815: vi = aj + ai[i];
3816: nz = diag[i] - ai[i];
3817: idx = 4*(*r++);
3818: s1 = (MatScalar)b[idx];
3819: s2 = (MatScalar)b[1+idx];
3820: s3 = (MatScalar)b[2+idx];
3821: s4 = (MatScalar)b[3+idx];
3822: while (nz--) {
3823: idx = 4*(*vi++);
3824: x1 = t[idx];
3825: x2 = t[1+idx];
3826: x3 = t[2+idx];
3827: x4 = t[3+idx];
3828: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3829: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3830: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3831: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3832: v += 16;
3833: }
3834: idx = 4*i;
3835: t[idx] = s1;
3836: t[1+idx] = s2;
3837: t[2+idx] = s3;
3838: t[3+idx] = s4;
3839: }
3840: /* backward solve the upper triangular */
3841: for (i=n-1; i>=0; i--){
3842: v = aa + 16*diag[i] + 16;
3843: vi = aj + diag[i] + 1;
3844: nz = ai[i+1] - diag[i] - 1;
3845: idt = 4*i;
3846: s1 = t[idt];
3847: s2 = t[1+idt];
3848: s3 = t[2+idt];
3849: s4 = t[3+idt];
3850: while (nz--) {
3851: idx = 4*(*vi++);
3852: x1 = t[idx];
3853: x2 = t[1+idx];
3854: x3 = t[2+idx];
3855: x4 = t[3+idx];
3856: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3857: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3858: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3859: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3860: v += 16;
3861: }
3862: idc = 4*(*c--);
3863: v = aa + 16*diag[i];
3864: t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3865: t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3866: t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3867: t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3868: x[idc] = (PetscScalar)t[idt];
3869: x[1+idc] = (PetscScalar)t[1+idt];
3870: x[2+idc] = (PetscScalar)t[2+idt];
3871: x[3+idc] = (PetscScalar)t[3+idt];
3872: }
3874: ISRestoreIndices(isrow,&rout);
3875: ISRestoreIndices(iscol,&cout);
3876: VecRestoreArrayRead(bb,&b);
3877: VecRestoreArray(xx,&x);
3878: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
3879: return(0);
3880: }
3882: #if defined (PETSC_HAVE_SSE)
3884: #include PETSC_HAVE_SSE
3888: PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
3889: {
3890: /*
3891: Note: This code uses demotion of double
3892: to float when performing the mixed-mode computation.
3893: This may not be numerically reasonable for all applications.
3894: */
3895: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3896: IS iscol=a->col,isrow=a->row;
3898: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
3899: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3900: MatScalar *aa=a->a,*v;
3901: PetscScalar *x,*b,*t;
3903: /* Make space in temp stack for 16 Byte Aligned arrays */
3904: float ssealignedspace[11],*tmps,*tmpx;
3905: unsigned long offset;
3906:
3908: SSE_SCOPE_BEGIN;
3910: offset = (unsigned long)ssealignedspace % 16;
3911: if (offset) offset = (16 - offset)/4;
3912: tmps = &ssealignedspace[offset];
3913: tmpx = &ssealignedspace[offset+4];
3914: PREFETCH_NTA(aa+16*ai[1]);
3916: VecGetArray(bb,&b);
3917: VecGetArray(xx,&x);
3918: t = a->solve_work;
3920: ISGetIndices(isrow,&rout); r = rout;
3921: ISGetIndices(iscol,&cout); c = cout + (n-1);
3923: /* forward solve the lower triangular */
3924: idx = 4*(*r++);
3925: t[0] = b[idx]; t[1] = b[1+idx];
3926: t[2] = b[2+idx]; t[3] = b[3+idx];
3927: v = aa + 16*ai[1];
3929: for (i=1; i<n;) {
3930: PREFETCH_NTA(&v[8]);
3931: vi = aj + ai[i];
3932: nz = diag[i] - ai[i];
3933: idx = 4*(*r++);
3935: /* Demote sum from double to float */
3936: CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
3937: LOAD_PS(tmps,XMM7);
3939: while (nz--) {
3940: PREFETCH_NTA(&v[16]);
3941: idx = 4*(*vi++);
3942:
3943: /* Demote solution (so far) from double to float */
3944: CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
3946: /* 4x4 Matrix-Vector product with negative accumulation: */
3947: SSE_INLINE_BEGIN_2(tmpx,v)
3948: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3950: /* First Column */
3951: SSE_COPY_PS(XMM0,XMM6)
3952: SSE_SHUFFLE(XMM0,XMM0,0x00)
3953: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3954: SSE_SUB_PS(XMM7,XMM0)
3955:
3956: /* Second Column */
3957: SSE_COPY_PS(XMM1,XMM6)
3958: SSE_SHUFFLE(XMM1,XMM1,0x55)
3959: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3960: SSE_SUB_PS(XMM7,XMM1)
3961:
3962: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3963:
3964: /* Third Column */
3965: SSE_COPY_PS(XMM2,XMM6)
3966: SSE_SHUFFLE(XMM2,XMM2,0xAA)
3967: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3968: SSE_SUB_PS(XMM7,XMM2)
3970: /* Fourth Column */
3971: SSE_COPY_PS(XMM3,XMM6)
3972: SSE_SHUFFLE(XMM3,XMM3,0xFF)
3973: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3974: SSE_SUB_PS(XMM7,XMM3)
3975: SSE_INLINE_END_2
3976:
3977: v += 16;
3978: }
3979: idx = 4*i;
3980: v = aa + 16*ai[++i];
3981: PREFETCH_NTA(v);
3982: STORE_PS(tmps,XMM7);
3984: /* Promote result from float to double */
3985: CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
3986: }
3987: /* backward solve the upper triangular */
3988: idt = 4*(n-1);
3989: ai16 = 16*diag[n-1];
3990: v = aa + ai16 + 16;
3991: for (i=n-1; i>=0;){
3992: PREFETCH_NTA(&v[8]);
3993: vi = aj + diag[i] + 1;
3994: nz = ai[i+1] - diag[i] - 1;
3995:
3996: /* Demote accumulator from double to float */
3997: CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
3998: LOAD_PS(tmps,XMM7);
4000: while (nz--) {
4001: PREFETCH_NTA(&v[16]);
4002: idx = 4*(*vi++);
4004: /* Demote solution (so far) from double to float */
4005: CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
4007: /* 4x4 Matrix-Vector Product with negative accumulation: */
4008: SSE_INLINE_BEGIN_2(tmpx,v)
4009: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4011: /* First Column */
4012: SSE_COPY_PS(XMM0,XMM6)
4013: SSE_SHUFFLE(XMM0,XMM0,0x00)
4014: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4015: SSE_SUB_PS(XMM7,XMM0)
4017: /* Second Column */
4018: SSE_COPY_PS(XMM1,XMM6)
4019: SSE_SHUFFLE(XMM1,XMM1,0x55)
4020: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4021: SSE_SUB_PS(XMM7,XMM1)
4023: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4024:
4025: /* Third Column */
4026: SSE_COPY_PS(XMM2,XMM6)
4027: SSE_SHUFFLE(XMM2,XMM2,0xAA)
4028: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4029: SSE_SUB_PS(XMM7,XMM2)
4031: /* Fourth Column */
4032: SSE_COPY_PS(XMM3,XMM6)
4033: SSE_SHUFFLE(XMM3,XMM3,0xFF)
4034: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4035: SSE_SUB_PS(XMM7,XMM3)
4036: SSE_INLINE_END_2
4037: v += 16;
4038: }
4039: v = aa + ai16;
4040: ai16 = 16*diag[--i];
4041: PREFETCH_NTA(aa+ai16+16);
4042: /*
4043: Scale the result by the diagonal 4x4 block,
4044: which was inverted as part of the factorization
4045: */
4046: SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
4047: /* First Column */
4048: SSE_COPY_PS(XMM0,XMM7)
4049: SSE_SHUFFLE(XMM0,XMM0,0x00)
4050: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4052: /* Second Column */
4053: SSE_COPY_PS(XMM1,XMM7)
4054: SSE_SHUFFLE(XMM1,XMM1,0x55)
4055: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4056: SSE_ADD_PS(XMM0,XMM1)
4058: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4059:
4060: /* Third Column */
4061: SSE_COPY_PS(XMM2,XMM7)
4062: SSE_SHUFFLE(XMM2,XMM2,0xAA)
4063: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4064: SSE_ADD_PS(XMM0,XMM2)
4066: /* Fourth Column */
4067: SSE_COPY_PS(XMM3,XMM7)
4068: SSE_SHUFFLE(XMM3,XMM3,0xFF)
4069: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4070: SSE_ADD_PS(XMM0,XMM3)
4071:
4072: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4073: SSE_INLINE_END_3
4075: /* Promote solution from float to double */
4076: CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
4078: /* Apply reordering to t and stream into x. */
4079: /* This way, x doesn't pollute the cache. */
4080: /* Be careful with size: 2 doubles = 4 floats! */
4081: idc = 4*(*c--);
4082: SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
4083: /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */
4084: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
4085: SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
4086: /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
4087: SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
4088: SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
4089: SSE_INLINE_END_2
4090: v = aa + ai16 + 16;
4091: idt -= 4;
4092: }
4094: ISRestoreIndices(isrow,&rout);
4095: ISRestoreIndices(iscol,&cout);
4096: VecRestoreArray(bb,&b);
4097: VecRestoreArray(xx,&x);
4098: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
4099: SSE_SCOPE_END;
4100: return(0);
4101: }
4103: #endif
4106: /*
4107: Special case where the matrix was ILU(0) factored in the natural
4108: ordering. This eliminates the need for the column and row permutation.
4109: */
4112: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4113: {
4114: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
4115: PetscInt n=a->mbs;
4116: const PetscInt *ai=a->i,*aj=a->j;
4117: PetscErrorCode ierr;
4118: const PetscInt *diag = a->diag;
4119: const MatScalar *aa=a->a;
4120: PetscScalar *x;
4121: const PetscScalar *b;
4124: VecGetArrayRead(bb,&b);
4125: VecGetArray(xx,&x);
4127: #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
4128: {
4129: static PetscScalar w[2000]; /* very BAD need to fix */
4130: fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
4131: }
4132: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
4133: {
4134: static PetscScalar w[2000]; /* very BAD need to fix */
4135: fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
4136: }
4137: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
4138: fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
4139: #else
4140: {
4141: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
4142: const MatScalar *v;
4143: PetscInt jdx,idt,idx,nz,i,ai16;
4144: const PetscInt *vi;
4146: /* forward solve the lower triangular */
4147: idx = 0;
4148: x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
4149: for (i=1; i<n; i++) {
4150: v = aa + 16*ai[i];
4151: vi = aj + ai[i];
4152: nz = diag[i] - ai[i];
4153: idx += 4;
4154: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4155: while (nz--) {
4156: jdx = 4*(*vi++);
4157: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
4158: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4159: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4160: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4161: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4162: v += 16;
4163: }
4164: x[idx] = s1;
4165: x[1+idx] = s2;
4166: x[2+idx] = s3;
4167: x[3+idx] = s4;
4168: }
4169: /* backward solve the upper triangular */
4170: idt = 4*(n-1);
4171: for (i=n-1; i>=0; i--){
4172: ai16 = 16*diag[i];
4173: v = aa + ai16 + 16;
4174: vi = aj + diag[i] + 1;
4175: nz = ai[i+1] - diag[i] - 1;
4176: s1 = x[idt]; s2 = x[1+idt];
4177: s3 = x[2+idt];s4 = x[3+idt];
4178: while (nz--) {
4179: idx = 4*(*vi++);
4180: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx];
4181: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4182: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4183: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4184: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4185: v += 16;
4186: }
4187: v = aa + ai16;
4188: x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
4189: x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;
4190: x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4191: x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4192: idt -= 4;
4193: }
4194: }
4195: #endif
4197: VecRestoreArrayRead(bb,&b);
4198: VecRestoreArray(xx,&x);
4199: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
4200: return(0);
4201: }
4205: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
4206: {
4207: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
4208: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4209: PetscInt i,k,nz,idx,jdx,idt;
4210: PetscErrorCode ierr;
4211: const PetscInt bs = A->rmap->bs,bs2 = a->bs2;
4212: const MatScalar *aa=a->a,*v;
4213: PetscScalar *x;
4214: const PetscScalar *b;
4215: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
4218: VecGetArrayRead(bb,&b);
4219: VecGetArray(xx,&x);
4220: /* forward solve the lower triangular */
4221: idx = 0;
4222: x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
4223: for (i=1; i<n; i++) {
4224: v = aa + bs2*ai[i];
4225: vi = aj + ai[i];
4226: nz = ai[i+1] - ai[i];
4227: idx = bs*i;
4228: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4229: for(k=0;k<nz;k++) {
4230: jdx = bs*vi[k];
4231: x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
4232: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4233: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4234: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4235: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4236:
4237: v += bs2;
4238: }
4240: x[idx] = s1;
4241: x[1+idx] = s2;
4242: x[2+idx] = s3;
4243: x[3+idx] = s4;
4244: }
4245:
4246: /* backward solve the upper triangular */
4247: for (i=n-1; i>=0; i--){
4248: v = aa + bs2*(adiag[i+1]+1);
4249: vi = aj + adiag[i+1]+1;
4250: nz = adiag[i] - adiag[i+1]-1;
4251: idt = bs*i;
4252: s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
4253:
4254: for(k=0;k<nz;k++){
4255: idx = bs*vi[k];
4256: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
4257: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4258: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4259: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4260: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4262: v += bs2;
4263: }
4264: /* x = inv_diagonal*x */
4265: x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
4266: x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
4267: x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4268: x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4270: }
4272: VecRestoreArrayRead(bb,&b);
4273: VecRestoreArray(xx,&x);
4274: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
4275: return(0);
4276: }
4280: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
4281: {
4282: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
4283: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;
4284: PetscErrorCode ierr;
4285: const MatScalar *aa=a->a;
4286: const PetscScalar *b;
4287: PetscScalar *x;
4290: VecGetArrayRead(bb,&b);
4291: VecGetArray(xx,&x);
4293: {
4294: MatScalar s1,s2,s3,s4,x1,x2,x3,x4;
4295: const MatScalar *v;
4296: MatScalar *t=(MatScalar *)x;
4297: PetscInt jdx,idt,idx,nz,i,ai16;
4298: const PetscInt *vi;
4300: /* forward solve the lower triangular */
4301: idx = 0;
4302: t[0] = (MatScalar)b[0];
4303: t[1] = (MatScalar)b[1];
4304: t[2] = (MatScalar)b[2];
4305: t[3] = (MatScalar)b[3];
4306: for (i=1; i<n; i++) {
4307: v = aa + 16*ai[i];
4308: vi = aj + ai[i];
4309: nz = diag[i] - ai[i];
4310: idx += 4;
4311: s1 = (MatScalar)b[idx];
4312: s2 = (MatScalar)b[1+idx];
4313: s3 = (MatScalar)b[2+idx];
4314: s4 = (MatScalar)b[3+idx];
4315: while (nz--) {
4316: jdx = 4*(*vi++);
4317: x1 = t[jdx];
4318: x2 = t[1+jdx];
4319: x3 = t[2+jdx];
4320: x4 = t[3+jdx];
4321: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4322: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4323: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4324: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4325: v += 16;
4326: }
4327: t[idx] = s1;
4328: t[1+idx] = s2;
4329: t[2+idx] = s3;
4330: t[3+idx] = s4;
4331: }
4332: /* backward solve the upper triangular */
4333: idt = 4*(n-1);
4334: for (i=n-1; i>=0; i--){
4335: ai16 = 16*diag[i];
4336: v = aa + ai16 + 16;
4337: vi = aj + diag[i] + 1;
4338: nz = ai[i+1] - diag[i] - 1;
4339: s1 = t[idt];
4340: s2 = t[1+idt];
4341: s3 = t[2+idt];
4342: s4 = t[3+idt];
4343: while (nz--) {
4344: idx = 4*(*vi++);
4345: x1 = (MatScalar)x[idx];
4346: x2 = (MatScalar)x[1+idx];
4347: x3 = (MatScalar)x[2+idx];
4348: x4 = (MatScalar)x[3+idx];
4349: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4350: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4351: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4352: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4353: v += 16;
4354: }
4355: v = aa + ai16;
4356: x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4);
4357: x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4);
4358: x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
4359: x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
4360: idt -= 4;
4361: }
4362: }
4364: VecRestoreArrayRead(bb,&b);
4365: VecRestoreArray(xx,&x);
4366: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
4367: return(0);
4368: }
4370: #if defined (PETSC_HAVE_SSE)
4372: #include PETSC_HAVE_SSE
4375: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
4376: {
4377: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
4378: unsigned short *aj=(unsigned short *)a->j;
4380: int *ai=a->i,n=a->mbs,*diag = a->diag;
4381: MatScalar *aa=a->a;
4382: PetscScalar *x,*b;
4385: SSE_SCOPE_BEGIN;
4386: /*
4387: Note: This code currently uses demotion of double
4388: to float when performing the mixed-mode computation.
4389: This may not be numerically reasonable for all applications.
4390: */
4391: PREFETCH_NTA(aa+16*ai[1]);
4393: VecGetArray(bb,&b);
4394: VecGetArray(xx,&x);
4395: {
4396: /* x will first be computed in single precision then promoted inplace to double */
4397: MatScalar *v,*t=(MatScalar *)x;
4398: int nz,i,idt,ai16;
4399: unsigned int jdx,idx;
4400: unsigned short *vi;
4401: /* Forward solve the lower triangular factor. */
4403: /* First block is the identity. */
4404: idx = 0;
4405: CONVERT_DOUBLE4_FLOAT4(t,b);
4406: v = aa + 16*((unsigned int)ai[1]);
4408: for (i=1; i<n;) {
4409: PREFETCH_NTA(&v[8]);
4410: vi = aj + ai[i];
4411: nz = diag[i] - ai[i];
4412: idx += 4;
4414: /* Demote RHS from double to float. */
4415: CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4416: LOAD_PS(&t[idx],XMM7);
4418: while (nz--) {
4419: PREFETCH_NTA(&v[16]);
4420: jdx = 4*((unsigned int)(*vi++));
4421:
4422: /* 4x4 Matrix-Vector product with negative accumulation: */
4423: SSE_INLINE_BEGIN_2(&t[jdx],v)
4424: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4426: /* First Column */
4427: SSE_COPY_PS(XMM0,XMM6)
4428: SSE_SHUFFLE(XMM0,XMM0,0x00)
4429: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4430: SSE_SUB_PS(XMM7,XMM0)
4432: /* Second Column */
4433: SSE_COPY_PS(XMM1,XMM6)
4434: SSE_SHUFFLE(XMM1,XMM1,0x55)
4435: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4436: SSE_SUB_PS(XMM7,XMM1)
4438: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4439:
4440: /* Third Column */
4441: SSE_COPY_PS(XMM2,XMM6)
4442: SSE_SHUFFLE(XMM2,XMM2,0xAA)
4443: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4444: SSE_SUB_PS(XMM7,XMM2)
4446: /* Fourth Column */
4447: SSE_COPY_PS(XMM3,XMM6)
4448: SSE_SHUFFLE(XMM3,XMM3,0xFF)
4449: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4450: SSE_SUB_PS(XMM7,XMM3)
4451: SSE_INLINE_END_2
4452:
4453: v += 16;
4454: }
4455: v = aa + 16*ai[++i];
4456: PREFETCH_NTA(v);
4457: STORE_PS(&t[idx],XMM7);
4458: }
4460: /* Backward solve the upper triangular factor.*/
4462: idt = 4*(n-1);
4463: ai16 = 16*diag[n-1];
4464: v = aa + ai16 + 16;
4465: for (i=n-1; i>=0;){
4466: PREFETCH_NTA(&v[8]);
4467: vi = aj + diag[i] + 1;
4468: nz = ai[i+1] - diag[i] - 1;
4469:
4470: LOAD_PS(&t[idt],XMM7);
4472: while (nz--) {
4473: PREFETCH_NTA(&v[16]);
4474: idx = 4*((unsigned int)(*vi++));
4476: /* 4x4 Matrix-Vector Product with negative accumulation: */
4477: SSE_INLINE_BEGIN_2(&t[idx],v)
4478: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4480: /* First Column */
4481: SSE_COPY_PS(XMM0,XMM6)
4482: SSE_SHUFFLE(XMM0,XMM0,0x00)
4483: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4484: SSE_SUB_PS(XMM7,XMM0)
4486: /* Second Column */
4487: SSE_COPY_PS(XMM1,XMM6)
4488: SSE_SHUFFLE(XMM1,XMM1,0x55)
4489: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4490: SSE_SUB_PS(XMM7,XMM1)
4492: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4493:
4494: /* Third Column */
4495: SSE_COPY_PS(XMM2,XMM6)
4496: SSE_SHUFFLE(XMM2,XMM2,0xAA)
4497: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4498: SSE_SUB_PS(XMM7,XMM2)
4500: /* Fourth Column */
4501: SSE_COPY_PS(XMM3,XMM6)
4502: SSE_SHUFFLE(XMM3,XMM3,0xFF)
4503: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4504: SSE_SUB_PS(XMM7,XMM3)
4505: SSE_INLINE_END_2
4506: v += 16;
4507: }
4508: v = aa + ai16;
4509: ai16 = 16*diag[--i];
4510: PREFETCH_NTA(aa+ai16+16);
4511: /*
4512: Scale the result by the diagonal 4x4 block,
4513: which was inverted as part of the factorization
4514: */
4515: SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4516: /* First Column */
4517: SSE_COPY_PS(XMM0,XMM7)
4518: SSE_SHUFFLE(XMM0,XMM0,0x00)
4519: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4521: /* Second Column */
4522: SSE_COPY_PS(XMM1,XMM7)
4523: SSE_SHUFFLE(XMM1,XMM1,0x55)
4524: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4525: SSE_ADD_PS(XMM0,XMM1)
4527: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4528:
4529: /* Third Column */
4530: SSE_COPY_PS(XMM2,XMM7)
4531: SSE_SHUFFLE(XMM2,XMM2,0xAA)
4532: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4533: SSE_ADD_PS(XMM0,XMM2)
4535: /* Fourth Column */
4536: SSE_COPY_PS(XMM3,XMM7)
4537: SSE_SHUFFLE(XMM3,XMM3,0xFF)
4538: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4539: SSE_ADD_PS(XMM0,XMM3)
4541: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4542: SSE_INLINE_END_3
4544: v = aa + ai16 + 16;
4545: idt -= 4;
4546: }
4548: /* Convert t from single precision back to double precision (inplace)*/
4549: idt = 4*(n-1);
4550: for (i=n-1;i>=0;i--) {
4551: /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4552: /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4553: PetscScalar *xtemp=&x[idt];
4554: MatScalar *ttemp=&t[idt];
4555: xtemp[3] = (PetscScalar)ttemp[3];
4556: xtemp[2] = (PetscScalar)ttemp[2];
4557: xtemp[1] = (PetscScalar)ttemp[1];
4558: xtemp[0] = (PetscScalar)ttemp[0];
4559: idt -= 4;
4560: }
4562: } /* End of artificial scope. */
4563: VecRestoreArray(bb,&b);
4564: VecRestoreArray(xx,&x);
4565: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
4566: SSE_SCOPE_END;
4567: return(0);
4568: }
4572: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
4573: {
4574: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
4575: int *aj=a->j;
4577: int *ai=a->i,n=a->mbs,*diag = a->diag;
4578: MatScalar *aa=a->a;
4579: PetscScalar *x,*b;
4582: SSE_SCOPE_BEGIN;
4583: /*
4584: Note: This code currently uses demotion of double
4585: to float when performing the mixed-mode computation.
4586: This may not be numerically reasonable for all applications.
4587: */
4588: PREFETCH_NTA(aa+16*ai[1]);
4590: VecGetArray(bb,&b);
4591: VecGetArray(xx,&x);
4592: {
4593: /* x will first be computed in single precision then promoted inplace to double */
4594: MatScalar *v,*t=(MatScalar *)x;
4595: int nz,i,idt,ai16;
4596: int jdx,idx;
4597: int *vi;
4598: /* Forward solve the lower triangular factor. */
4600: /* First block is the identity. */
4601: idx = 0;
4602: CONVERT_DOUBLE4_FLOAT4(t,b);
4603: v = aa + 16*ai[1];
4605: for (i=1; i<n;) {
4606: PREFETCH_NTA(&v[8]);
4607: vi = aj + ai[i];
4608: nz = diag[i] - ai[i];
4609: idx += 4;
4611: /* Demote RHS from double to float. */
4612: CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4613: LOAD_PS(&t[idx],XMM7);
4615: while (nz--) {
4616: PREFETCH_NTA(&v[16]);
4617: jdx = 4*(*vi++);
4618: /* jdx = *vi++; */
4619:
4620: /* 4x4 Matrix-Vector product with negative accumulation: */
4621: SSE_INLINE_BEGIN_2(&t[jdx],v)
4622: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4624: /* First Column */
4625: SSE_COPY_PS(XMM0,XMM6)
4626: SSE_SHUFFLE(XMM0,XMM0,0x00)
4627: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4628: SSE_SUB_PS(XMM7,XMM0)
4630: /* Second Column */
4631: SSE_COPY_PS(XMM1,XMM6)
4632: SSE_SHUFFLE(XMM1,XMM1,0x55)
4633: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4634: SSE_SUB_PS(XMM7,XMM1)
4636: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4637:
4638: /* Third Column */
4639: SSE_COPY_PS(XMM2,XMM6)
4640: SSE_SHUFFLE(XMM2,XMM2,0xAA)
4641: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4642: SSE_SUB_PS(XMM7,XMM2)
4644: /* Fourth Column */
4645: SSE_COPY_PS(XMM3,XMM6)
4646: SSE_SHUFFLE(XMM3,XMM3,0xFF)
4647: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4648: SSE_SUB_PS(XMM7,XMM3)
4649: SSE_INLINE_END_2
4650:
4651: v += 16;
4652: }
4653: v = aa + 16*ai[++i];
4654: PREFETCH_NTA(v);
4655: STORE_PS(&t[idx],XMM7);
4656: }
4658: /* Backward solve the upper triangular factor.*/
4660: idt = 4*(n-1);
4661: ai16 = 16*diag[n-1];
4662: v = aa + ai16 + 16;
4663: for (i=n-1; i>=0;){
4664: PREFETCH_NTA(&v[8]);
4665: vi = aj + diag[i] + 1;
4666: nz = ai[i+1] - diag[i] - 1;
4667:
4668: LOAD_PS(&t[idt],XMM7);
4670: while (nz--) {
4671: PREFETCH_NTA(&v[16]);
4672: idx = 4*(*vi++);
4673: /* idx = *vi++; */
4675: /* 4x4 Matrix-Vector Product with negative accumulation: */
4676: SSE_INLINE_BEGIN_2(&t[idx],v)
4677: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4679: /* First Column */
4680: SSE_COPY_PS(XMM0,XMM6)
4681: SSE_SHUFFLE(XMM0,XMM0,0x00)
4682: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4683: SSE_SUB_PS(XMM7,XMM0)
4685: /* Second Column */
4686: SSE_COPY_PS(XMM1,XMM6)
4687: SSE_SHUFFLE(XMM1,XMM1,0x55)
4688: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4689: SSE_SUB_PS(XMM7,XMM1)
4691: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4692:
4693: /* Third Column */
4694: SSE_COPY_PS(XMM2,XMM6)
4695: SSE_SHUFFLE(XMM2,XMM2,0xAA)
4696: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4697: SSE_SUB_PS(XMM7,XMM2)
4699: /* Fourth Column */
4700: SSE_COPY_PS(XMM3,XMM6)
4701: SSE_SHUFFLE(XMM3,XMM3,0xFF)
4702: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4703: SSE_SUB_PS(XMM7,XMM3)
4704: SSE_INLINE_END_2
4705: v += 16;
4706: }
4707: v = aa + ai16;
4708: ai16 = 16*diag[--i];
4709: PREFETCH_NTA(aa+ai16+16);
4710: /*
4711: Scale the result by the diagonal 4x4 block,
4712: which was inverted as part of the factorization
4713: */
4714: SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4715: /* First Column */
4716: SSE_COPY_PS(XMM0,XMM7)
4717: SSE_SHUFFLE(XMM0,XMM0,0x00)
4718: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4720: /* Second Column */
4721: SSE_COPY_PS(XMM1,XMM7)
4722: SSE_SHUFFLE(XMM1,XMM1,0x55)
4723: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4724: SSE_ADD_PS(XMM0,XMM1)
4726: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4727:
4728: /* Third Column */
4729: SSE_COPY_PS(XMM2,XMM7)
4730: SSE_SHUFFLE(XMM2,XMM2,0xAA)
4731: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4732: SSE_ADD_PS(XMM0,XMM2)
4734: /* Fourth Column */
4735: SSE_COPY_PS(XMM3,XMM7)
4736: SSE_SHUFFLE(XMM3,XMM3,0xFF)
4737: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4738: SSE_ADD_PS(XMM0,XMM3)
4740: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4741: SSE_INLINE_END_3
4743: v = aa + ai16 + 16;
4744: idt -= 4;
4745: }
4747: /* Convert t from single precision back to double precision (inplace)*/
4748: idt = 4*(n-1);
4749: for (i=n-1;i>=0;i--) {
4750: /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4751: /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4752: PetscScalar *xtemp=&x[idt];
4753: MatScalar *ttemp=&t[idt];
4754: xtemp[3] = (PetscScalar)ttemp[3];
4755: xtemp[2] = (PetscScalar)ttemp[2];
4756: xtemp[1] = (PetscScalar)ttemp[1];
4757: xtemp[0] = (PetscScalar)ttemp[0];
4758: idt -= 4;
4759: }
4761: } /* End of artificial scope. */
4762: VecRestoreArray(bb,&b);
4763: VecRestoreArray(xx,&x);
4764: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
4765: SSE_SCOPE_END;
4766: return(0);
4767: }
4769: #endif
4773: PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
4774: {
4775: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
4776: IS iscol=a->col,isrow=a->row;
4777: PetscErrorCode ierr;
4778: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
4779: PetscInt i,nz,idx,idt,idc;
4780: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
4781: const MatScalar *aa=a->a,*v;
4782: PetscScalar *x,s1,s2,s3,x1,x2,x3,*t;
4783: const PetscScalar *b;
4786: VecGetArrayRead(bb,&b);
4787: VecGetArray(xx,&x);
4788: t = a->solve_work;
4790: ISGetIndices(isrow,&rout); r = rout;
4791: ISGetIndices(iscol,&cout); c = cout + (n-1);
4793: /* forward solve the lower triangular */
4794: idx = 3*(*r++);
4795: t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4796: for (i=1; i<n; i++) {
4797: v = aa + 9*ai[i];
4798: vi = aj + ai[i];
4799: nz = diag[i] - ai[i];
4800: idx = 3*(*r++);
4801: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4802: while (nz--) {
4803: idx = 3*(*vi++);
4804: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4805: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4806: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4807: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4808: v += 9;
4809: }
4810: idx = 3*i;
4811: t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4812: }
4813: /* backward solve the upper triangular */
4814: for (i=n-1; i>=0; i--){
4815: v = aa + 9*diag[i] + 9;
4816: vi = aj + diag[i] + 1;
4817: nz = ai[i+1] - diag[i] - 1;
4818: idt = 3*i;
4819: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4820: while (nz--) {
4821: idx = 3*(*vi++);
4822: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4823: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4824: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4825: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4826: v += 9;
4827: }
4828: idc = 3*(*c--);
4829: v = aa + 9*diag[i];
4830: x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
4831: x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4832: x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4833: }
4834: ISRestoreIndices(isrow,&rout);
4835: ISRestoreIndices(iscol,&cout);
4836: VecRestoreArrayRead(bb,&b);
4837: VecRestoreArray(xx,&x);
4838: PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
4839: return(0);
4840: }
4844: PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
4845: {
4846: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
4847: IS iscol=a->col,isrow=a->row;
4848: PetscErrorCode ierr;
4849: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4850: PetscInt i,nz,idx,idt,idc,m;
4851: const PetscInt *r,*c,*rout,*cout;
4852: const MatScalar *aa=a->a,*v;
4853: PetscScalar *x,s1,s2,s3,x1,x2,x3,*t;
4854: const PetscScalar *b;
4857: VecGetArrayRead(bb,&b);
4858: VecGetArray(xx,&x);
4859: t = a->solve_work;
4861: ISGetIndices(isrow,&rout); r = rout;
4862: ISGetIndices(iscol,&cout); c = cout;
4864: /* forward solve the lower triangular */
4865: idx = 3*r[0];
4866: t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4867: for (i=1; i<n; i++) {
4868: v = aa + 9*ai[i];
4869: vi = aj + ai[i];
4870: nz = ai[i+1] - ai[i];
4871: idx = 3*r[i];
4872: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4873: for(m=0;m<nz;m++){
4874: idx = 3*vi[m];
4875: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4876: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4877: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4878: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4879: v += 9;
4880: }
4881: idx = 3*i;
4882: t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4883: }
4884: /* backward solve the upper triangular */
4885: for (i=n-1; i>=0; i--){
4886: v = aa + 9*(adiag[i+1]+1);
4887: vi = aj + adiag[i+1]+1;
4888: nz = adiag[i] - adiag[i+1] - 1;
4889: idt = 3*i;
4890: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4891: for(m=0;m<nz;m++){
4892: idx = 3*vi[m];
4893: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4894: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4895: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4896: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4897: v += 9;
4898: }
4899: idc = 3*c[i];
4900: x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
4901: x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4902: x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4903: }
4904: ISRestoreIndices(isrow,&rout);
4905: ISRestoreIndices(iscol,&cout);
4906: VecRestoreArrayRead(bb,&b);
4907: VecRestoreArray(xx,&x);
4908: PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
4909: return(0);
4910: }
4912: /*
4913: Special case where the matrix was ILU(0) factored in the natural
4914: ordering. This eliminates the need for the column and row permutation.
4915: */
4918: PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4919: {
4920: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
4921: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j;
4922: PetscErrorCode ierr;
4923: const PetscInt *diag = a->diag,*vi;
4924: const MatScalar *aa=a->a,*v;
4925: PetscScalar *x,s1,s2,s3,x1,x2,x3;
4926: const PetscScalar *b;
4927: PetscInt jdx,idt,idx,nz,i;
4930: VecGetArrayRead(bb,&b);
4931: VecGetArray(xx,&x);
4933: /* forward solve the lower triangular */
4934: idx = 0;
4935: x[0] = b[0]; x[1] = b[1]; x[2] = b[2];
4936: for (i=1; i<n; i++) {
4937: v = aa + 9*ai[i];
4938: vi = aj + ai[i];
4939: nz = diag[i] - ai[i];
4940: idx += 3;
4941: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4942: while (nz--) {
4943: jdx = 3*(*vi++);
4944: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4945: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4946: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4947: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4948: v += 9;
4949: }
4950: x[idx] = s1;
4951: x[1+idx] = s2;
4952: x[2+idx] = s3;
4953: }
4954: /* backward solve the upper triangular */
4955: for (i=n-1; i>=0; i--){
4956: v = aa + 9*diag[i] + 9;
4957: vi = aj + diag[i] + 1;
4958: nz = ai[i+1] - diag[i] - 1;
4959: idt = 3*i;
4960: s1 = x[idt]; s2 = x[1+idt];
4961: s3 = x[2+idt];
4962: while (nz--) {
4963: idx = 3*(*vi++);
4964: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx];
4965: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4966: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4967: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4968: v += 9;
4969: }
4970: v = aa + 9*diag[i];
4971: x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
4972: x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4973: x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4974: }
4976: VecRestoreArrayRead(bb,&b);
4977: VecRestoreArray(xx,&x);
4978: PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
4979: return(0);
4980: }
4984: PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
4985: {
4986: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
4987: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4988: PetscErrorCode ierr;
4989: PetscInt i,k,nz,idx,jdx,idt;
4990: const PetscInt bs = A->rmap->bs,bs2 = a->bs2;
4991: const MatScalar *aa=a->a,*v;
4992: PetscScalar *x;
4993: const PetscScalar *b;
4994: PetscScalar s1,s2,s3,x1,x2,x3;
4997: VecGetArrayRead(bb,&b);
4998: VecGetArray(xx,&x);
4999: /* forward solve the lower triangular */
5000: idx = 0;
5001: x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
5002: for (i=1; i<n; i++) {
5003: v = aa + bs2*ai[i];
5004: vi = aj + ai[i];
5005: nz = ai[i+1] - ai[i];
5006: idx = bs*i;
5007: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];
5008: for(k=0;k<nz;k++){
5009: jdx = bs*vi[k];
5010: x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
5011: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
5012: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
5013: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
5014:
5015: v += bs2;
5016: }
5018: x[idx] = s1;
5019: x[1+idx] = s2;
5020: x[2+idx] = s3;
5021: }
5022:
5023: /* backward solve the upper triangular */
5024: for (i=n-1; i>=0; i--){
5025: v = aa + bs2*(adiag[i+1]+1);
5026: vi = aj + adiag[i+1]+1;
5027: nz = adiag[i] - adiag[i+1]-1;
5028: idt = bs*i;
5029: s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];
5030:
5031: for(k=0;k<nz;k++){
5032: idx = bs*vi[k];
5033: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
5034: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
5035: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
5036: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
5038: v += bs2;
5039: }
5040: /* x = inv_diagonal*x */
5041: x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
5042: x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
5043: x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
5045: }
5047: VecRestoreArrayRead(bb,&b);
5048: VecRestoreArray(xx,&x);
5049: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
5050: return(0);
5051: }
5055: PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
5056: {
5057: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
5058: IS iscol=a->col,isrow=a->row;
5059: PetscErrorCode ierr;
5060: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5061: PetscInt i,nz,idx,idt,idc;
5062: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
5063: const MatScalar *aa=a->a,*v;
5064: PetscScalar *x,s1,s2,x1,x2,*t;
5065: const PetscScalar *b;
5068: VecGetArrayRead(bb,&b);
5069: VecGetArray(xx,&x);
5070: t = a->solve_work;
5072: ISGetIndices(isrow,&rout); r = rout;
5073: ISGetIndices(iscol,&cout); c = cout + (n-1);
5075: /* forward solve the lower triangular */
5076: idx = 2*(*r++);
5077: t[0] = b[idx]; t[1] = b[1+idx];
5078: for (i=1; i<n; i++) {
5079: v = aa + 4*ai[i];
5080: vi = aj + ai[i];
5081: nz = diag[i] - ai[i];
5082: idx = 2*(*r++);
5083: s1 = b[idx]; s2 = b[1+idx];
5084: while (nz--) {
5085: idx = 2*(*vi++);
5086: x1 = t[idx]; x2 = t[1+idx];
5087: s1 -= v[0]*x1 + v[2]*x2;
5088: s2 -= v[1]*x1 + v[3]*x2;
5089: v += 4;
5090: }
5091: idx = 2*i;
5092: t[idx] = s1; t[1+idx] = s2;
5093: }
5094: /* backward solve the upper triangular */
5095: for (i=n-1; i>=0; i--){
5096: v = aa + 4*diag[i] + 4;
5097: vi = aj + diag[i] + 1;
5098: nz = ai[i+1] - diag[i] - 1;
5099: idt = 2*i;
5100: s1 = t[idt]; s2 = t[1+idt];
5101: while (nz--) {
5102: idx = 2*(*vi++);
5103: x1 = t[idx]; x2 = t[1+idx];
5104: s1 -= v[0]*x1 + v[2]*x2;
5105: s2 -= v[1]*x1 + v[3]*x2;
5106: v += 4;
5107: }
5108: idc = 2*(*c--);
5109: v = aa + 4*diag[i];
5110: x[idc] = t[idt] = v[0]*s1 + v[2]*s2;
5111: x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
5112: }
5113: ISRestoreIndices(isrow,&rout);
5114: ISRestoreIndices(iscol,&cout);
5115: VecRestoreArrayRead(bb,&b);
5116: VecRestoreArray(xx,&x);
5117: PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
5118: return(0);
5119: }
5123: PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
5124: {
5125: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
5126: IS iscol=a->col,isrow=a->row;
5127: PetscErrorCode ierr;
5128: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5129: PetscInt i,nz,idx,jdx,idt,idc,m;
5130: const PetscInt *r,*c,*rout,*cout;
5131: const MatScalar *aa=a->a,*v;
5132: PetscScalar *x,s1,s2,x1,x2,*t;
5133: const PetscScalar *b;
5136: VecGetArrayRead(bb,&b);
5137: VecGetArray(xx,&x);
5138: t = a->solve_work;
5140: ISGetIndices(isrow,&rout); r = rout;
5141: ISGetIndices(iscol,&cout); c = cout;
5143: /* forward solve the lower triangular */
5144: idx = 2*r[0];
5145: t[0] = b[idx]; t[1] = b[1+idx];
5146: for (i=1; i<n; i++) {
5147: v = aa + 4*ai[i];
5148: vi = aj + ai[i];
5149: nz = ai[i+1] - ai[i];
5150: idx = 2*r[i];
5151: s1 = b[idx]; s2 = b[1+idx];
5152: for(m=0;m<nz;m++){
5153: jdx = 2*vi[m];
5154: x1 = t[jdx]; x2 = t[1+jdx];
5155: s1 -= v[0]*x1 + v[2]*x2;
5156: s2 -= v[1]*x1 + v[3]*x2;
5157: v += 4;
5158: }
5159: idx = 2*i;
5160: t[idx] = s1; t[1+idx] = s2;
5161: }
5162: /* backward solve the upper triangular */
5163: for (i=n-1; i>=0; i--){
5164: v = aa + 4*(adiag[i+1]+1);
5165: vi = aj + adiag[i+1]+1;
5166: nz = adiag[i] - adiag[i+1] - 1;
5167: idt = 2*i;
5168: s1 = t[idt]; s2 = t[1+idt];
5169: for(m=0;m<nz;m++){
5170: idx = 2*vi[m];
5171: x1 = t[idx]; x2 = t[1+idx];
5172: s1 -= v[0]*x1 + v[2]*x2;
5173: s2 -= v[1]*x1 + v[3]*x2;
5174: v += 4;
5175: }
5176: idc = 2*c[i];
5177: x[idc] = t[idt] = v[0]*s1 + v[2]*s2;
5178: x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
5179: }
5180: ISRestoreIndices(isrow,&rout);
5181: ISRestoreIndices(iscol,&cout);
5182: VecRestoreArrayRead(bb,&b);
5183: VecRestoreArray(xx,&x);
5184: PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
5185: return(0);
5186: }
5188: /*
5189: Special case where the matrix was ILU(0) factored in the natural
5190: ordering. This eliminates the need for the column and row permutation.
5191: */
5194: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
5195: {
5196: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
5197: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5198: PetscErrorCode ierr;
5199: const MatScalar *aa=a->a,*v;
5200: PetscScalar *x,s1,s2,x1,x2;
5201: const PetscScalar *b;
5202: PetscInt jdx,idt,idx,nz,i;
5205: VecGetArrayRead(bb,&b);
5206: VecGetArray(xx,&x);
5208: /* forward solve the lower triangular */
5209: idx = 0;
5210: x[0] = b[0]; x[1] = b[1];
5211: for (i=1; i<n; i++) {
5212: v = aa + 4*ai[i];
5213: vi = aj + ai[i];
5214: nz = diag[i] - ai[i];
5215: idx += 2;
5216: s1 = b[idx];s2 = b[1+idx];
5217: while (nz--) {
5218: jdx = 2*(*vi++);
5219: x1 = x[jdx];x2 = x[1+jdx];
5220: s1 -= v[0]*x1 + v[2]*x2;
5221: s2 -= v[1]*x1 + v[3]*x2;
5222: v += 4;
5223: }
5224: x[idx] = s1;
5225: x[1+idx] = s2;
5226: }
5227: /* backward solve the upper triangular */
5228: for (i=n-1; i>=0; i--){
5229: v = aa + 4*diag[i] + 4;
5230: vi = aj + diag[i] + 1;
5231: nz = ai[i+1] - diag[i] - 1;
5232: idt = 2*i;
5233: s1 = x[idt]; s2 = x[1+idt];
5234: while (nz--) {
5235: idx = 2*(*vi++);
5236: x1 = x[idx]; x2 = x[1+idx];
5237: s1 -= v[0]*x1 + v[2]*x2;
5238: s2 -= v[1]*x1 + v[3]*x2;
5239: v += 4;
5240: }
5241: v = aa + 4*diag[i];
5242: x[idt] = v[0]*s1 + v[2]*s2;
5243: x[1+idt] = v[1]*s1 + v[3]*s2;
5244: }
5246: VecRestoreArrayRead(bb,&b);
5247: VecRestoreArray(xx,&x);
5248: PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
5249: return(0);
5250: }
5254: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
5255: {
5256: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
5257: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5258: PetscInt i,k,nz,idx,idt,jdx;
5259: PetscErrorCode ierr;
5260: const MatScalar *aa=a->a,*v;
5261: PetscScalar *x,s1,s2,x1,x2;
5262: const PetscScalar *b;
5263:
5265: VecGetArrayRead(bb,&b);
5266: VecGetArray(xx,&x);
5267: /* forward solve the lower triangular */
5268: idx = 0;
5269: x[0] = b[idx]; x[1] = b[1+idx];
5270: for (i=1; i<n; i++) {
5271: v = aa + 4*ai[i];
5272: vi = aj + ai[i];
5273: nz = ai[i+1] - ai[i];
5274: idx = 2*i;
5275: s1 = b[idx];s2 = b[1+idx];
5276: PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
5277: PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
5278: for(k=0;k<nz;k++){
5279: jdx = 2*vi[k];
5280: x1 = x[jdx];x2 = x[1+jdx];
5281: s1 -= v[0]*x1 + v[2]*x2;
5282: s2 -= v[1]*x1 + v[3]*x2;
5283: v += 4;
5284: }
5285: x[idx] = s1;
5286: x[1+idx] = s2;
5287: }
5288:
5289: /* backward solve the upper triangular */
5290: for (i=n-1; i>=0; i--){
5291: v = aa + 4*(adiag[i+1]+1);
5292: vi = aj + adiag[i+1]+1;
5293: nz = adiag[i] - adiag[i+1]-1;
5294: idt = 2*i;
5295: s1 = x[idt]; s2 = x[1+idt];
5296: PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
5297: PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
5298: for(k=0;k<nz;k++){
5299: idx = 2*vi[k];
5300: x1 = x[idx]; x2 = x[1+idx];
5301: s1 -= v[0]*x1 + v[2]*x2;
5302: s2 -= v[1]*x1 + v[3]*x2;
5303: v += 4;
5304: }
5305: /* x = inv_diagonal*x */
5306: x[idt] = v[0]*s1 + v[2]*s2;
5307: x[1+idt] = v[1]*s1 + v[3]*s2;
5308: }
5310: VecRestoreArrayRead(bb,&b);
5311: VecRestoreArray(xx,&x);
5312: PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
5313: return(0);
5314: }
5318: PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
5319: {
5320: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
5321: IS iscol=a->col,isrow=a->row;
5322: PetscErrorCode ierr;
5323: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5324: PetscInt i,nz;
5325: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
5326: const MatScalar *aa=a->a,*v;
5327: PetscScalar *x,s1,*t;
5328: const PetscScalar *b;
5331: if (!n) return(0);
5333: VecGetArrayRead(bb,&b);
5334: VecGetArray(xx,&x);
5335: t = a->solve_work;
5337: ISGetIndices(isrow,&rout); r = rout;
5338: ISGetIndices(iscol,&cout); c = cout + (n-1);
5340: /* forward solve the lower triangular */
5341: t[0] = b[*r++];
5342: for (i=1; i<n; i++) {
5343: v = aa + ai[i];
5344: vi = aj + ai[i];
5345: nz = diag[i] - ai[i];
5346: s1 = b[*r++];
5347: while (nz--) {
5348: s1 -= (*v++)*t[*vi++];
5349: }
5350: t[i] = s1;
5351: }
5352: /* backward solve the upper triangular */
5353: for (i=n-1; i>=0; i--){
5354: v = aa + diag[i] + 1;
5355: vi = aj + diag[i] + 1;
5356: nz = ai[i+1] - diag[i] - 1;
5357: s1 = t[i];
5358: while (nz--) {
5359: s1 -= (*v++)*t[*vi++];
5360: }
5361: x[*c--] = t[i] = aa[diag[i]]*s1;
5362: }
5364: ISRestoreIndices(isrow,&rout);
5365: ISRestoreIndices(iscol,&cout);
5366: VecRestoreArrayRead(bb,&b);
5367: VecRestoreArray(xx,&x);
5368: PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);
5369: return(0);
5370: }
5374: PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
5375: {
5376: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
5377: IS iscol = a->col,isrow = a->row;
5378: PetscErrorCode ierr;
5379: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz;
5380: const PetscInt *rout,*cout,*r,*c;
5381: PetscScalar *x,*tmp,sum;
5382: const PetscScalar *b;
5383: const MatScalar *aa = a->a,*v;
5386: if (!n) return(0);
5388: VecGetArrayRead(bb,&b);
5389: VecGetArray(xx,&x);
5390: tmp = a->solve_work;
5392: ISGetIndices(isrow,&rout); r = rout;
5393: ISGetIndices(iscol,&cout); c = cout;
5395: /* forward solve the lower triangular */
5396: tmp[0] = b[r[0]];
5397: v = aa;
5398: vi = aj;
5399: for (i=1; i<n; i++) {
5400: nz = ai[i+1] - ai[i];
5401: sum = b[r[i]];
5402: PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
5403: tmp[i] = sum;
5404: v += nz; vi += nz;
5405: }
5407: /* backward solve the upper triangular */
5408: for (i=n-1; i>=0; i--){
5409: v = aa + adiag[i+1]+1;
5410: vi = aj + adiag[i+1]+1;
5411: nz = adiag[i]-adiag[i+1]-1;
5412: sum = tmp[i];
5413: PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
5414: x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */
5415: }
5417: ISRestoreIndices(isrow,&rout);
5418: ISRestoreIndices(iscol,&cout);
5419: VecRestoreArrayRead(bb,&b);
5420: VecRestoreArray(xx,&x);
5421: PetscLogFlops(2*a->nz - A->cmap->n);
5422: return(0);
5423: }
5425: /*
5426: Special case where the matrix was ILU(0) factored in the natural
5427: ordering. This eliminates the need for the column and row permutation.
5428: */
5431: PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
5432: {
5433: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
5434: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5435: PetscErrorCode ierr;
5436: const MatScalar *aa=a->a,*v;
5437: PetscScalar *x;
5438: const PetscScalar *b;
5439: PetscScalar s1,x1;
5440: PetscInt jdx,idt,idx,nz,i;
5443: VecGetArrayRead(bb,&b);
5444: VecGetArray(xx,&x);
5446: /* forward solve the lower triangular */
5447: idx = 0;
5448: x[0] = b[0];
5449: for (i=1; i<n; i++) {
5450: v = aa + ai[i];
5451: vi = aj + ai[i];
5452: nz = diag[i] - ai[i];
5453: idx += 1;
5454: s1 = b[idx];
5455: while (nz--) {
5456: jdx = *vi++;
5457: x1 = x[jdx];
5458: s1 -= v[0]*x1;
5459: v += 1;
5460: }
5461: x[idx] = s1;
5462: }
5463: /* backward solve the upper triangular */
5464: for (i=n-1; i>=0; i--){
5465: v = aa + diag[i] + 1;
5466: vi = aj + diag[i] + 1;
5467: nz = ai[i+1] - diag[i] - 1;
5468: idt = i;
5469: s1 = x[idt];
5470: while (nz--) {
5471: idx = *vi++;
5472: x1 = x[idx];
5473: s1 -= v[0]*x1;
5474: v += 1;
5475: }
5476: v = aa + diag[i];
5477: x[idt] = v[0]*s1;
5478: }
5479: VecRestoreArrayRead(bb,&b);
5480: VecRestoreArray(xx,&x);
5481: PetscLogFlops(2.0*(a->nz) - A->cmap->n);
5482: return(0);
5483: }
5488: PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
5489: {
5490: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
5491: PetscErrorCode ierr;
5492: const PetscInt n = a->mbs,*ai = a->i,*aj = a->j,*adiag = a->diag,*vi;
5493: PetscScalar *x,sum;
5494: const PetscScalar *b;
5495: const MatScalar *aa = a->a,*v;
5496: PetscInt i,nz;
5499: if (!n) return(0);
5501: VecGetArrayRead(bb,&b);
5502: VecGetArray(xx,&x);
5504: /* forward solve the lower triangular */
5505: x[0] = b[0];
5506: v = aa;
5507: vi = aj;
5508: for (i=1; i<n; i++) {
5509: nz = ai[i+1] - ai[i];
5510: sum = b[i];
5511: PetscSparseDenseMinusDot(sum,x,v,vi,nz);
5512: v += nz;
5513: vi += nz;
5514: x[i] = sum;
5515: }
5516:
5517: /* backward solve the upper triangular */
5518: for (i=n-1; i>=0; i--){
5519: v = aa + adiag[i+1] + 1;
5520: vi = aj + adiag[i+1] + 1;
5521: nz = adiag[i] - adiag[i+1]-1;
5522: sum = x[i];
5523: PetscSparseDenseMinusDot(sum,x,v,vi,nz);
5524: x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */
5525: }
5526:
5527: PetscLogFlops(2.0*a->nz - A->cmap->n);
5528: VecRestoreArrayRead(bb,&b);
5529: VecRestoreArray(xx,&x);
5530: return(0);
5531: }
5533: /* ----------------------------------------------------------------*/
5534: extern PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscBool );
5538: /*
5539: This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes
5540: */
5541: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
5542: {
5543: Mat C=B;
5544: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
5545: PetscErrorCode ierr;
5546: PetscInt i,j,k,ipvt[15];
5547: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj;
5548: PetscInt nz,nzL,row;
5549: MatScalar *rtmp,*pc,*mwork,*pv,*vv,work[225];
5550: const MatScalar *v,*aa=a->a;
5551: PetscInt bs2 = a->bs2,bs=A->rmap->bs,flg;
5552: PetscInt sol_ver;
5556: PetscOptionsGetInt(((PetscObject)A)->prefix,"-sol_ver",&sol_ver,PETSC_NULL);
5558: /* generate work space needed by the factorization */
5559: PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);
5560: PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));
5562: for (i=0; i<n; i++){
5563: /* zero rtmp */
5564: /* L part */
5565: nz = bi[i+1] - bi[i];
5566: bjtmp = bj + bi[i];
5567: for (j=0; j<nz; j++){
5568: PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
5569: }
5571: /* U part */
5572: nz = bdiag[i] - bdiag[i+1];
5573: bjtmp = bj + bdiag[i+1]+1;
5574: for (j=0; j<nz; j++){
5575: PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
5576: }
5577:
5578: /* load in initial (unfactored row) */
5579: nz = ai[i+1] - ai[i];
5580: ajtmp = aj + ai[i];
5581: v = aa + bs2*ai[i];
5582: for (j=0; j<nz; j++) {
5583: PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));
5584: }
5586: /* elimination */
5587: bjtmp = bj + bi[i];
5588: nzL = bi[i+1] - bi[i];
5589: for(k=0;k < nzL;k++) {
5590: row = bjtmp[k];
5591: pc = rtmp + bs2*row;
5592: for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5593: if (flg) {
5594: pv = b->a + bs2*bdiag[row];
5595: PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork);
5596: /*PetscKernel_A_gets_A_times_B_15(pc,pv,mwork);*/
5597: pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5598: pv = b->a + bs2*(bdiag[row+1]+1);
5599: nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5600: for (j=0; j<nz; j++) {
5601: vv = rtmp + bs2*pj[j];
5602: PetscKernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv);
5603: /* PetscKernel_A_gets_A_minus_B_times_C_15(vv,pc,pv); */
5604: pv += bs2;
5605: }
5606: PetscLogFlops(2*bs2*bs*(nz+1)-bs2); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5607: }
5608: }
5610: /* finished row so stick it into b->a */
5611: /* L part */
5612: pv = b->a + bs2*bi[i] ;
5613: pj = b->j + bi[i] ;
5614: nz = bi[i+1] - bi[i];
5615: for (j=0; j<nz; j++) {
5616: PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
5617: }
5619: /* Mark diagonal and invert diagonal for simplier triangular solves */
5620: pv = b->a + bs2*bdiag[i];
5621: pj = b->j + bdiag[i];
5622: PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));
5623: /* PetscKernel_A_gets_inverse_A(bs,pv,pivots,work); */
5624: PetscKernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftamount);
5625:
5626: /* U part */
5627: pv = b->a + bs2*(bdiag[i+1]+1);
5628: pj = b->j + bdiag[i+1]+1;
5629: nz = bdiag[i] - bdiag[i+1] - 1;
5630: for (j=0; j<nz; j++){
5631: PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
5632: }
5633: }
5635: PetscFree2(rtmp,mwork);
5636: C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1;
5637: C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering;
5638: C->assembled = PETSC_TRUE;
5639: PetscLogFlops(1.333333333333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
5640: return(0);
5641: }
5645: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info)
5646: {
5647: Mat C=B;
5648: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
5649: IS isrow = b->row,isicol = b->icol;
5651: const PetscInt *r,*ic;
5652: PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
5653: PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
5654: MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
5655: PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
5656: MatScalar *v_work;
5657: PetscBool col_identity,row_identity,both_identity;
5660: ISGetIndices(isrow,&r);
5661: ISGetIndices(isicol,&ic);
5662:
5663: PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);
5664: PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));
5666: /* generate work space needed by dense LU factorization */
5667: PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);
5669: for (i=0; i<n; i++){
5670: /* zero rtmp */
5671: /* L part */
5672: nz = bi[i+1] - bi[i];
5673: bjtmp = bj + bi[i];
5674: for (j=0; j<nz; j++){
5675: PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
5676: }
5678: /* U part */
5679: nz = bdiag[i] - bdiag[i+1];
5680: bjtmp = bj + bdiag[i+1]+1;
5681: for (j=0; j<nz; j++){
5682: PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
5683: }
5684:
5685: /* load in initial (unfactored row) */
5686: nz = ai[r[i]+1] - ai[r[i]];
5687: ajtmp = aj + ai[r[i]];
5688: v = aa + bs2*ai[r[i]];
5689: for (j=0; j<nz; j++) {
5690: PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));
5691: }
5693: /* elimination */
5694: bjtmp = bj + bi[i];
5695: nzL = bi[i+1] - bi[i];
5696: for(k=0;k < nzL;k++) {
5697: row = bjtmp[k];
5698: pc = rtmp + bs2*row;
5699: for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5700: if (flg) {
5701: pv = b->a + bs2*bdiag[row];
5702: PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
5703: pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5704: pv = b->a + bs2*(bdiag[row+1]+1);
5705: nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5706: for (j=0; j<nz; j++) {
5707: PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
5708: }
5709: PetscLogFlops(2*bs2*bs*(nz+1)-bs2); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5710: }
5711: }
5713: /* finished row so stick it into b->a */
5714: /* L part */
5715: pv = b->a + bs2*bi[i] ;
5716: pj = b->j + bi[i] ;
5717: nz = bi[i+1] - bi[i];
5718: for (j=0; j<nz; j++) {
5719: PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
5720: }
5722: /* Mark diagonal and invert diagonal for simplier triangular solves */
5723: pv = b->a + bs2*bdiag[i];
5724: pj = b->j + bdiag[i];
5725: /* if (*pj != i)SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
5726: PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));
5727: PetscKernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);
5728:
5729: /* U part */
5730: pv = b->a + bs2*(bdiag[i+1]+1);
5731: pj = b->j + bdiag[i+1]+1;
5732: nz = bdiag[i] - bdiag[i+1] - 1;
5733: for (j=0; j<nz; j++){
5734: PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
5735: }
5736: }
5738: PetscFree(rtmp);
5739: PetscFree3(v_work,mwork,v_pivots);
5740: ISRestoreIndices(isicol,&ic);
5741: ISRestoreIndices(isrow,&r);
5743: ISIdentity(isrow,&row_identity);
5744: ISIdentity(isicol,&col_identity);
5745: both_identity = (PetscBool) (row_identity && col_identity);
5746: if (both_identity){
5747: C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
5748: } else {
5749: C->ops->solve = MatSolve_SeqBAIJ_N;
5750: }
5751: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N;
5752:
5753: C->assembled = PETSC_TRUE;
5754: PetscLogFlops(1.333333333333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
5755: return(0);
5756: }
5758: /*
5759: ilu(0) with natural ordering under new data structure.
5760: See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description
5761: because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace().
5762: */
5766: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5767: {
5768:
5769: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
5770: PetscErrorCode ierr;
5771: PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
5772: PetscInt i,j,nz,*bi,*bj,*bdiag,bi_temp;
5775: MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);
5776: b = (Mat_SeqBAIJ*)(fact)->data;
5777:
5778: /* allocate matrix arrays for new data structure */
5779: PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);
5780: PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));
5781: b->singlemalloc = PETSC_TRUE;
5782: b->free_a = PETSC_TRUE;
5783: b->free_ij = PETSC_TRUE;
5784: fact->preallocated = PETSC_TRUE;
5785: fact->assembled = PETSC_TRUE;
5786: if (!b->diag){
5787: PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);
5788: PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));
5789: }
5790: bdiag = b->diag;
5791:
5792: if (n > 0) {
5793: PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));
5794: }
5795:
5796: /* set bi and bj with new data structure */
5797: bi = b->i;
5798: bj = b->j;
5800: /* L part */
5801: bi[0] = 0;
5802: for (i=0; i<n; i++){
5803: nz = adiag[i] - ai[i];
5804: bi[i+1] = bi[i] + nz;
5805: aj = a->j + ai[i];
5806: for (j=0; j<nz; j++){
5807: *bj = aj[j]; bj++;
5808: }
5809: }
5810:
5811: /* U part */
5812: bi_temp = bi[n];
5813: bdiag[n] = bi[n]-1;
5814: for (i=n-1; i>=0; i--){
5815: nz = ai[i+1] - adiag[i] - 1;
5816: bi_temp = bi_temp + nz + 1;
5817: aj = a->j + adiag[i] + 1;
5818: for (j=0; j<nz; j++){
5819: *bj = aj[j]; bj++;
5820: }
5821: /* diag[i] */
5822: *bj = i; bj++;
5823: bdiag[i] = bi_temp - 1;
5824: }
5825: return(0);
5826: }
5830: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5831: {
5832: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
5833: IS isicol;
5834: PetscErrorCode ierr;
5835: const PetscInt *r,*ic;
5836: PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d;
5837: PetscInt *bi,*cols,nnz,*cols_lvl;
5838: PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
5839: PetscInt i,levels,diagonal_fill;
5840: PetscBool col_identity,row_identity,both_identity;
5841: PetscReal f;
5842: PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL;
5843: PetscBT lnkbt;
5844: PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr;
5845: PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
5846: PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
5847: PetscBool missing;
5848: PetscInt bs=A->rmap->bs,bs2=a->bs2;
5851: if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
5852: if (bs>1){ /* check shifttype */
5853: if (info->shifttype == MAT_SHIFT_NONZERO || info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE)
5854: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix");
5855: }
5857: MatMissingDiagonal(A,&missing,&d);
5858: if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
5860: f = info->fill;
5861: levels = (PetscInt)info->levels;
5862: diagonal_fill = (PetscInt)info->diagonal_fill;
5863: ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
5865: ISIdentity(isrow,&row_identity);
5866: ISIdentity(iscol,&col_identity);
5867: both_identity = (PetscBool) (row_identity && col_identity);
5868:
5869: if (!levels && both_identity) {
5870: /* special case: ilu(0) with natural ordering */
5871: MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);
5872: MatSeqBAIJSetNumericFactorization(fact,both_identity);
5874: fact->factortype = MAT_FACTOR_ILU;
5875: (fact)->info.factor_mallocs = 0;
5876: (fact)->info.fill_ratio_given = info->fill;
5877: (fact)->info.fill_ratio_needed = 1.0;
5878: b = (Mat_SeqBAIJ*)(fact)->data;
5879: b->row = isrow;
5880: b->col = iscol;
5881: b->icol = isicol;
5882: PetscObjectReference((PetscObject)isrow);
5883: PetscObjectReference((PetscObject)iscol);
5884: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5885: PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);
5886: return(0);
5887: }
5888:
5889: ISGetIndices(isrow,&r);
5890: ISGetIndices(isicol,&ic);
5891:
5892: /* get new row pointers */
5893: PetscMalloc((n+1)*sizeof(PetscInt),&bi);
5894: bi[0] = 0;
5895: /* bdiag is location of diagonal in factor */
5896: PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);
5897: bdiag[0] = 0;
5899: PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);
5901: /* create a linked list for storing column indices of the active row */
5902: nlnk = n + 1;
5903: PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);
5904:
5905: /* initial FreeSpace size is f*(ai[n]+1) */
5906: PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);
5907: current_space = free_space;
5908: PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);
5909: current_space_lvl = free_space_lvl;
5910:
5911: for (i=0; i<n; i++) {
5912: nzi = 0;
5913: /* copy current row into linked list */
5914: nnz = ai[r[i]+1] - ai[r[i]];
5915: if (!nnz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
5916: cols = aj + ai[r[i]];
5917: lnk[i] = -1; /* marker to indicate if diagonal exists */
5918: PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);
5919: nzi += nlnk;
5921: /* make sure diagonal entry is included */
5922: if (diagonal_fill && lnk[i] == -1) {
5923: fm = n;
5924: while (lnk[fm] < i) fm = lnk[fm];
5925: lnk[i] = lnk[fm]; /* insert diagonal into linked list */
5926: lnk[fm] = i;
5927: lnk_lvl[i] = 0;
5928: nzi++; dcount++;
5929: }
5931: /* add pivot rows into the active row */
5932: nzbd = 0;
5933: prow = lnk[n];
5934: while (prow < i) {
5935: nnz = bdiag[prow];
5936: cols = bj_ptr[prow] + nnz + 1;
5937: cols_lvl = bjlvl_ptr[prow] + nnz + 1;
5938: nnz = bi[prow+1] - bi[prow] - nnz - 1;
5939: PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);
5940: nzi += nlnk;
5941: prow = lnk[prow];
5942: nzbd++;
5943: }
5944: bdiag[i] = nzbd;
5945: bi[i+1] = bi[i] + nzi;
5947: /* if free space is not available, make more free space */
5948: if (current_space->local_remaining<nzi) {
5949: nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
5950: PetscFreeSpaceGet(nnz,¤t_space);
5951: PetscFreeSpaceGet(nnz,¤t_space_lvl);
5952: reallocs++;
5953: }
5955: /* copy data into free_space and free_space_lvl, then initialize lnk */
5956: PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);
5957: bj_ptr[i] = current_space->array;
5958: bjlvl_ptr[i] = current_space_lvl->array;
5960: /* make sure the active row i has diagonal entry */
5961: if (*(bj_ptr[i]+bdiag[i]) != i) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
5963: current_space->array += nzi;
5964: current_space->local_used += nzi;
5965: current_space->local_remaining -= nzi;
5966: current_space_lvl->array += nzi;
5967: current_space_lvl->local_used += nzi;
5968: current_space_lvl->local_remaining -= nzi;
5969: }
5970:
5971: ISRestoreIndices(isrow,&r);
5972: ISRestoreIndices(isicol,&ic);
5973:
5974: /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
5975: PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);
5976: PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);
5977:
5978: PetscIncompleteLLDestroy(lnk,lnkbt);
5979: PetscFreeSpaceDestroy(free_space_lvl);
5980: PetscFree2(bj_ptr,bjlvl_ptr);
5982: #if defined(PETSC_USE_INFO)
5983: {
5984: PetscReal af = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
5985: PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);
5986: PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);
5987: PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);
5988: PetscInfo(A,"for best performance.\n");
5989: if (diagonal_fill) {
5990: PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);
5991: }
5992: }
5993: #endif
5995: /* put together the new matrix */
5996: MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);
5997: PetscLogObjectParent(fact,isicol);
5998: b = (Mat_SeqBAIJ*)(fact)->data;
5999: b->free_a = PETSC_TRUE;
6000: b->free_ij = PETSC_TRUE;
6001: b->singlemalloc = PETSC_FALSE;
6002: PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);
6003: b->j = bj;
6004: b->i = bi;
6005: b->diag = bdiag;
6006: b->free_diag = PETSC_TRUE;
6007: b->ilen = 0;
6008: b->imax = 0;
6009: b->row = isrow;
6010: b->col = iscol;
6011: PetscObjectReference((PetscObject)isrow);
6012: PetscObjectReference((PetscObject)iscol);
6013: b->icol = isicol;
6014: PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);
6015: /* In b structure: Free imax, ilen, old a, old j.
6016: Allocate bdiag, solve_work, new a, new j */
6017: PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));
6018: b->maxnz = b->nz = bdiag[0]+1;
6019: fact->info.factor_mallocs = reallocs;
6020: fact->info.fill_ratio_given = f;
6021: fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
6022: MatSeqBAIJSetNumericFactorization(fact,both_identity);
6023: return(0);
6024: }
6026: /*
6027: This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
6028: except that the data structure of Mat_SeqAIJ is slightly different.
6029: Not a good example of code reuse.
6030: */
6033: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
6034: {
6035: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
6036: IS isicol;
6038: const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
6039: PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
6040: PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
6041: PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
6042: PetscBool col_identity,row_identity,both_identity,flg;
6043: PetscReal f;
6046: MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);
6047: if (flg) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
6048:
6049: f = info->fill;
6050: levels = (PetscInt)info->levels;
6051: diagonal_fill = (PetscInt)info->diagonal_fill;
6052: ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
6054: ISIdentity(isrow,&row_identity);
6055: ISIdentity(iscol,&col_identity);
6056: both_identity = (PetscBool) (row_identity && col_identity);
6058: if (!levels && both_identity) { /* special case copy the nonzero structure */
6059: MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);
6060: MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);
6062: fact->factortype = MAT_FACTOR_ILU;
6063: b = (Mat_SeqBAIJ*)fact->data;
6064: b->row = isrow;
6065: b->col = iscol;
6066: PetscObjectReference((PetscObject)isrow);
6067: PetscObjectReference((PetscObject)iscol);
6068: b->icol = isicol;
6069: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
6070: PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);
6071: return(0);
6072: }
6074: /* general case perform the symbolic factorization */
6075: ISGetIndices(isrow,&r);
6076: ISGetIndices(isicol,&ic);
6078: /* get new row pointers */
6079: PetscMalloc((n+1)*sizeof(PetscInt),&ainew);
6080: ainew[0] = 0;
6081: /* don't know how many column pointers are needed so estimate */
6082: jmax = (PetscInt)(f*ai[n] + 1);
6083: PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);
6084: /* ajfill is level of fill for each fill entry */
6085: PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);
6086: /* fill is a linked list of nonzeros in active row */
6087: PetscMalloc((n+1)*sizeof(PetscInt),&fill);
6088: /* im is level for each filled value */
6089: PetscMalloc((n+1)*sizeof(PetscInt),&im);
6090: /* dloc is location of diagonal in factor */
6091: PetscMalloc((n+1)*sizeof(PetscInt),&dloc);
6092: dloc[0] = 0;
6093: for (prow=0; prow<n; prow++) {
6095: /* copy prow into linked list */
6096: nzf = nz = ai[r[prow]+1] - ai[r[prow]];
6097: if (!nz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
6098: xi = aj + ai[r[prow]];
6099: fill[n] = n;
6100: fill[prow] = -1; /* marker for diagonal entry */
6101: while (nz--) {
6102: fm = n;
6103: idx = ic[*xi++];
6104: do {
6105: m = fm;
6106: fm = fill[m];
6107: } while (fm < idx);
6108: fill[m] = idx;
6109: fill[idx] = fm;
6110: im[idx] = 0;
6111: }
6113: /* make sure diagonal entry is included */
6114: if (diagonal_fill && fill[prow] == -1) {
6115: fm = n;
6116: while (fill[fm] < prow) fm = fill[fm];
6117: fill[prow] = fill[fm]; /* insert diagonal into linked list */
6118: fill[fm] = prow;
6119: im[prow] = 0;
6120: nzf++;
6121: dcount++;
6122: }
6124: nzi = 0;
6125: row = fill[n];
6126: while (row < prow) {
6127: incrlev = im[row] + 1;
6128: nz = dloc[row];
6129: xi = ajnew + ainew[row] + nz + 1;
6130: flev = ajfill + ainew[row] + nz + 1;
6131: nnz = ainew[row+1] - ainew[row] - nz - 1;
6132: fm = row;
6133: while (nnz-- > 0) {
6134: idx = *xi++;
6135: if (*flev + incrlev > levels) {
6136: flev++;
6137: continue;
6138: }
6139: do {
6140: m = fm;
6141: fm = fill[m];
6142: } while (fm < idx);
6143: if (fm != idx) {
6144: im[idx] = *flev + incrlev;
6145: fill[m] = idx;
6146: fill[idx] = fm;
6147: fm = idx;
6148: nzf++;
6149: } else {
6150: if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
6151: }
6152: flev++;
6153: }
6154: row = fill[row];
6155: nzi++;
6156: }
6157: /* copy new filled row into permanent storage */
6158: ainew[prow+1] = ainew[prow] + nzf;
6159: if (ainew[prow+1] > jmax) {
6161: /* estimate how much additional space we will need */
6162: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
6163: /* just double the memory each time */
6164: PetscInt maxadd = jmax;
6165: /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
6166: if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
6167: jmax += maxadd;
6169: /* allocate a longer ajnew and ajfill */
6170: PetscMalloc(jmax*sizeof(PetscInt),&xitmp);
6171: PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));
6172: PetscFree(ajnew);
6173: ajnew = xitmp;
6174: PetscMalloc(jmax*sizeof(PetscInt),&xitmp);
6175: PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));
6176: PetscFree(ajfill);
6177: ajfill = xitmp;
6178: reallocate++; /* count how many reallocations are needed */
6179: }
6180: xitmp = ajnew + ainew[prow];
6181: flev = ajfill + ainew[prow];
6182: dloc[prow] = nzi;
6183: fm = fill[n];
6184: while (nzf--) {
6185: *xitmp++ = fm;
6186: *flev++ = im[fm];
6187: fm = fill[fm];
6188: }
6189: /* make sure row has diagonal entry */
6190: if (ajnew[ainew[prow]+dloc[prow]] != prow) {
6191: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
6192: try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
6193: }
6194: }
6195: PetscFree(ajfill);
6196: ISRestoreIndices(isrow,&r);
6197: ISRestoreIndices(isicol,&ic);
6198: PetscFree(fill);
6199: PetscFree(im);
6201: #if defined(PETSC_USE_INFO)
6202: {
6203: PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
6204: PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);
6205: PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
6206: PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);
6207: PetscInfo(A,"for best performance.\n");
6208: if (diagonal_fill) {
6209: PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);
6210: }
6211: }
6212: #endif
6214: /* put together the new matrix */
6215: MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);
6216: PetscLogObjectParent(fact,isicol);
6217: b = (Mat_SeqBAIJ*)fact->data;
6218: b->free_a = PETSC_TRUE;
6219: b->free_ij = PETSC_TRUE;
6220: b->singlemalloc = PETSC_FALSE;
6221: PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);
6222: b->j = ajnew;
6223: b->i = ainew;
6224: for (i=0; i<n; i++) dloc[i] += ainew[i];
6225: b->diag = dloc;
6226: b->free_diag = PETSC_TRUE;
6227: b->ilen = 0;
6228: b->imax = 0;
6229: b->row = isrow;
6230: b->col = iscol;
6231: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
6232: PetscObjectReference((PetscObject)isrow);
6233: PetscObjectReference((PetscObject)iscol);
6234: b->icol = isicol;
6235: PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);
6236: /* In b structure: Free imax, ilen, old a, old j.
6237: Allocate dloc, solve_work, new a, new j */
6238: PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));
6239: b->maxnz = b->nz = ainew[n];
6241: fact->info.factor_mallocs = reallocate;
6242: fact->info.fill_ratio_given = f;
6243: fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
6245: MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);
6246: return(0);
6247: }
6251: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
6252: {
6253: /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
6254: /* int i,*AJ=a->j,nz=a->nz; */
6256: /* Undo Column scaling */
6257: /* while (nz--) { */
6258: /* AJ[i] = AJ[i]/4; */
6259: /* } */
6260: /* This should really invoke a push/pop logic, but we don't have that yet. */
6261: A->ops->setunfactored = PETSC_NULL;
6262: return(0);
6263: }
6267: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
6268: {
6269: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
6270: PetscInt *AJ=a->j,nz=a->nz;
6271: unsigned short *aj=(unsigned short *)AJ;
6273: /* Is this really necessary? */
6274: while (nz--) {
6275: AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
6276: }
6277: A->ops->setunfactored = PETSC_NULL;
6278: return(0);
6279: }