Actual source code: baijfact2.c
petsc-3.4.0 2013-05-13
2: /*
3: Factorization code for BAIJ format.
4: */
6: #include <../src/mat/impls/baij/seq/baij.h>
7: #include <petsc-private/kernels/blockinvert.h>
8: #include <petscbt.h>
9: #include <../src/mat/utils/freespace.h>
13: PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14: {
15: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
16: PetscErrorCode ierr;
17: const PetscInt *adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
18: PetscInt i,n = a->mbs,j;
19: PetscInt nz;
20: PetscScalar *x,*tmp,s1;
21: const MatScalar *aa = a->a,*v;
22: const PetscScalar *b;
25: VecGetArrayRead(bb,&b);
26: VecGetArray(xx,&x);
27: tmp = a->solve_work;
30: /* copy the b into temp work space according to permutation */
31: for (i=0; i<n; i++) tmp[i] = b[i];
33: /* forward solve the U^T */
34: for (i=0; i<n; i++) {
35: v = aa + adiag[i+1] + 1;
36: vi = aj + adiag[i+1] + 1;
37: nz = adiag[i] - adiag[i+1] - 1;
38: s1 = tmp[i];
39: s1 *= v[nz]; /* multiply by inverse of diagonal entry */
40: for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
41: tmp[i] = s1;
42: }
44: /* backward solve the L^T */
45: for (i=n-1; i>=0; i--) {
46: v = aa + ai[i];
47: vi = aj + ai[i];
48: nz = ai[i+1] - ai[i];
49: s1 = tmp[i];
50: for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
51: }
53: /* copy tmp into x according to permutation */
54: for (i=0; i<n; i++) x[i] = tmp[i];
56: VecRestoreArrayRead(bb,&b);
57: VecRestoreArray(xx,&x);
59: PetscLogFlops(2.0*a->nz-A->cmap->n);
60: return(0);
61: }
65: PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
66: {
67: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
68: PetscErrorCode ierr;
69: PetscInt i,nz;
70: const PetscInt *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
71: const MatScalar *aa =a->a,*v;
72: PetscScalar s1,*x;
75: VecCopy(bb,xx);
76: VecGetArray(xx,&x);
78: /* forward solve the U^T */
79: for (i=0; i<n; i++) {
81: v = aa + diag[i];
82: /* multiply by the inverse of the block diagonal */
83: s1 = (*v++)*x[i];
84: vi = aj + diag[i] + 1;
85: nz = ai[i+1] - diag[i] - 1;
86: while (nz--) {
87: x[*vi++] -= (*v++)*s1;
88: }
89: x[i] = s1;
90: }
91: /* backward solve the L^T */
92: for (i=n-1; i>=0; i--) {
93: v = aa + diag[i] - 1;
94: vi = aj + diag[i] - 1;
95: nz = diag[i] - ai[i];
96: s1 = x[i];
97: while (nz--) {
98: x[*vi--] -= (*v--)*s1;
99: }
100: }
101: VecRestoreArray(xx,&x);
102: PetscLogFlops(2.0*(a->nz) - A->cmap->n);
103: return(0);
104: }
108: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
109: {
110: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
111: PetscErrorCode ierr;
112: PetscInt i,nz,idx,idt,oidx;
113: const PetscInt *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
114: const MatScalar *aa =a->a,*v;
115: PetscScalar s1,s2,x1,x2,*x;
118: VecCopy(bb,xx);
119: VecGetArray(xx,&x);
121: /* forward solve the U^T */
122: idx = 0;
123: for (i=0; i<n; i++) {
125: v = aa + 4*diag[i];
126: /* multiply by the inverse of the block diagonal */
127: x1 = x[idx]; x2 = x[1+idx];
128: s1 = v[0]*x1 + v[1]*x2;
129: s2 = v[2]*x1 + v[3]*x2;
130: v += 4;
132: vi = aj + diag[i] + 1;
133: nz = ai[i+1] - diag[i] - 1;
134: while (nz--) {
135: oidx = 2*(*vi++);
136: x[oidx] -= v[0]*s1 + v[1]*s2;
137: x[oidx+1] -= v[2]*s1 + v[3]*s2;
138: v += 4;
139: }
140: x[idx] = s1;x[1+idx] = s2;
141: idx += 2;
142: }
143: /* backward solve the L^T */
144: for (i=n-1; i>=0; i--) {
145: v = aa + 4*diag[i] - 4;
146: vi = aj + diag[i] - 1;
147: nz = diag[i] - ai[i];
148: idt = 2*i;
149: s1 = x[idt]; s2 = x[1+idt];
150: while (nz--) {
151: idx = 2*(*vi--);
152: x[idx] -= v[0]*s1 + v[1]*s2;
153: x[idx+1] -= v[2]*s1 + v[3]*s2;
154: v -= 4;
155: }
156: }
157: VecRestoreArray(xx,&x);
158: PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);
159: return(0);
160: }
164: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
165: {
166: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
167: PetscErrorCode ierr;
168: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
169: PetscInt nz,idx,idt,j,i,oidx;
170: const PetscInt bs =A->rmap->bs,bs2=a->bs2;
171: const MatScalar *aa=a->a,*v;
172: PetscScalar s1,s2,x1,x2,*x;
175: VecCopy(bb,xx);
176: VecGetArray(xx,&x);
178: /* forward solve the U^T */
179: idx = 0;
180: for (i=0; i<n; i++) {
181: v = aa + bs2*diag[i];
182: /* multiply by the inverse of the block diagonal */
183: x1 = x[idx]; x2 = x[1+idx];
184: s1 = v[0]*x1 + v[1]*x2;
185: s2 = v[2]*x1 + v[3]*x2;
186: v -= bs2;
188: vi = aj + diag[i] - 1;
189: nz = diag[i] - diag[i+1] - 1;
190: for (j=0; j>-nz; j--) {
191: oidx = bs*vi[j];
192: x[oidx] -= v[0]*s1 + v[1]*s2;
193: x[oidx+1] -= v[2]*s1 + v[3]*s2;
194: v -= bs2;
195: }
196: x[idx] = s1;x[1+idx] = s2;
197: idx += bs;
198: }
199: /* backward solve the L^T */
200: for (i=n-1; i>=0; i--) {
201: v = aa + bs2*ai[i];
202: vi = aj + ai[i];
203: nz = ai[i+1] - ai[i];
204: idt = bs*i;
205: s1 = x[idt]; s2 = x[1+idt];
206: for (j=0; j<nz; j++) {
207: idx = bs*vi[j];
208: x[idx] -= v[0]*s1 + v[1]*s2;
209: x[idx+1] -= v[2]*s1 + v[3]*s2;
210: v += bs2;
211: }
212: }
213: VecRestoreArray(xx,&x);
214: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
215: return(0);
216: }
220: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
221: {
222: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
223: PetscErrorCode ierr;
224: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
225: PetscInt i,nz,idx,idt,oidx;
226: const MatScalar *aa=a->a,*v;
227: PetscScalar s1,s2,s3,x1,x2,x3,*x;
230: VecCopy(bb,xx);
231: VecGetArray(xx,&x);
233: /* forward solve the U^T */
234: idx = 0;
235: for (i=0; i<n; i++) {
237: v = aa + 9*diag[i];
238: /* multiply by the inverse of the block diagonal */
239: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
240: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
241: s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
242: s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
243: v += 9;
245: vi = aj + diag[i] + 1;
246: nz = ai[i+1] - diag[i] - 1;
247: while (nz--) {
248: oidx = 3*(*vi++);
249: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
250: x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
251: x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
252: v += 9;
253: }
254: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;
255: idx += 3;
256: }
257: /* backward solve the L^T */
258: for (i=n-1; i>=0; i--) {
259: v = aa + 9*diag[i] - 9;
260: vi = aj + diag[i] - 1;
261: nz = diag[i] - ai[i];
262: idt = 3*i;
263: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];
264: while (nz--) {
265: idx = 3*(*vi--);
266: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
267: x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
268: x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
269: v -= 9;
270: }
271: }
272: VecRestoreArray(xx,&x);
273: PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);
274: return(0);
275: }
279: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
280: {
281: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
282: PetscErrorCode ierr;
283: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
284: PetscInt nz,idx,idt,j,i,oidx;
285: const PetscInt bs =A->rmap->bs,bs2=a->bs2;
286: const MatScalar *aa=a->a,*v;
287: PetscScalar s1,s2,s3,x1,x2,x3,*x;
290: VecCopy(bb,xx);
291: VecGetArray(xx,&x);
293: /* forward solve the U^T */
294: idx = 0;
295: for (i=0; i<n; i++) {
296: v = aa + bs2*diag[i];
297: /* multiply by the inverse of the block diagonal */
298: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
299: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
300: s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
301: s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
302: v -= bs2;
304: vi = aj + diag[i] - 1;
305: nz = diag[i] - diag[i+1] - 1;
306: for (j=0; j>-nz; j--) {
307: oidx = bs*vi[j];
308: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
309: x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
310: x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
311: v -= bs2;
312: }
313: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;
314: idx += bs;
315: }
316: /* backward solve the L^T */
317: for (i=n-1; i>=0; i--) {
318: v = aa + bs2*ai[i];
319: vi = aj + ai[i];
320: nz = ai[i+1] - ai[i];
321: idt = bs*i;
322: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];
323: for (j=0; j<nz; j++) {
324: idx = bs*vi[j];
325: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
326: x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
327: x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
328: v += bs2;
329: }
330: }
331: VecRestoreArray(xx,&x);
332: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
333: return(0);
334: }
338: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
339: {
340: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
341: PetscErrorCode ierr;
342: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
343: PetscInt i,nz,idx,idt,oidx;
344: const MatScalar *aa=a->a,*v;
345: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x;
348: VecCopy(bb,xx);
349: VecGetArray(xx,&x);
351: /* forward solve the U^T */
352: idx = 0;
353: for (i=0; i<n; i++) {
355: v = aa + 16*diag[i];
356: /* multiply by the inverse of the block diagonal */
357: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
358: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
359: s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
360: s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
361: s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
362: v += 16;
364: vi = aj + diag[i] + 1;
365: nz = ai[i+1] - diag[i] - 1;
366: while (nz--) {
367: oidx = 4*(*vi++);
368: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
369: x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
370: x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
371: x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
372: v += 16;
373: }
374: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
375: idx += 4;
376: }
377: /* backward solve the L^T */
378: for (i=n-1; i>=0; i--) {
379: v = aa + 16*diag[i] - 16;
380: vi = aj + diag[i] - 1;
381: nz = diag[i] - ai[i];
382: idt = 4*i;
383: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
384: while (nz--) {
385: idx = 4*(*vi--);
386: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
387: x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
388: x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
389: x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
390: v -= 16;
391: }
392: }
393: VecRestoreArray(xx,&x);
394: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
395: return(0);
396: }
400: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
401: {
402: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
403: PetscErrorCode ierr;
404: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
405: PetscInt nz,idx,idt,j,i,oidx;
406: const PetscInt bs =A->rmap->bs,bs2=a->bs2;
407: const MatScalar *aa=a->a,*v;
408: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x;
411: VecCopy(bb,xx);
412: VecGetArray(xx,&x);
414: /* forward solve the U^T */
415: idx = 0;
416: for (i=0; i<n; i++) {
417: v = aa + bs2*diag[i];
418: /* multiply by the inverse of the block diagonal */
419: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
420: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
421: s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
422: s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
423: s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
424: v -= bs2;
426: vi = aj + diag[i] - 1;
427: nz = diag[i] - diag[i+1] - 1;
428: for (j=0; j>-nz; j--) {
429: oidx = bs*vi[j];
430: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
431: x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
432: x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
433: x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
434: v -= bs2;
435: }
436: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4;
437: idx += bs;
438: }
439: /* backward solve the L^T */
440: for (i=n-1; i>=0; i--) {
441: v = aa + bs2*ai[i];
442: vi = aj + ai[i];
443: nz = ai[i+1] - ai[i];
444: idt = bs*i;
445: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt];
446: for (j=0; j<nz; j++) {
447: idx = bs*vi[j];
448: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
449: x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
450: x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
451: x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
452: v += bs2;
453: }
454: }
455: VecRestoreArray(xx,&x);
456: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
457: return(0);
458: }
462: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
463: {
464: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
465: PetscErrorCode ierr;
466: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
467: PetscInt i,nz,idx,idt,oidx;
468: const MatScalar *aa=a->a,*v;
469: PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
472: VecCopy(bb,xx);
473: VecGetArray(xx,&x);
475: /* forward solve the U^T */
476: idx = 0;
477: for (i=0; i<n; i++) {
479: v = aa + 25*diag[i];
480: /* multiply by the inverse of the block diagonal */
481: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
482: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
483: s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
484: s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
485: s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
486: s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
487: v += 25;
489: vi = aj + diag[i] + 1;
490: nz = ai[i+1] - diag[i] - 1;
491: while (nz--) {
492: oidx = 5*(*vi++);
493: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
494: x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
495: x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
496: x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
497: x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
498: v += 25;
499: }
500: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
501: idx += 5;
502: }
503: /* backward solve the L^T */
504: for (i=n-1; i>=0; i--) {
505: v = aa + 25*diag[i] - 25;
506: vi = aj + diag[i] - 1;
507: nz = diag[i] - ai[i];
508: idt = 5*i;
509: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
510: while (nz--) {
511: idx = 5*(*vi--);
512: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
513: x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
514: x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
515: x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
516: x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
517: v -= 25;
518: }
519: }
520: VecRestoreArray(xx,&x);
521: PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
522: return(0);
523: }
527: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
528: {
529: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
530: PetscErrorCode ierr;
531: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
532: PetscInt nz,idx,idt,j,i,oidx;
533: const PetscInt bs =A->rmap->bs,bs2=a->bs2;
534: const MatScalar *aa=a->a,*v;
535: PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
538: VecCopy(bb,xx);
539: VecGetArray(xx,&x);
541: /* forward solve the U^T */
542: idx = 0;
543: for (i=0; i<n; i++) {
544: v = aa + bs2*diag[i];
545: /* multiply by the inverse of the block diagonal */
546: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
547: x5 = x[4+idx];
548: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
549: s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
550: s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
551: s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
552: s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
553: v -= bs2;
555: vi = aj + diag[i] - 1;
556: nz = diag[i] - diag[i+1] - 1;
557: for (j=0; j>-nz; j--) {
558: oidx = bs*vi[j];
559: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
560: x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
561: x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
562: x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
563: x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
564: v -= bs2;
565: }
566: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5;
567: idx += bs;
568: }
569: /* backward solve the L^T */
570: for (i=n-1; i>=0; i--) {
571: v = aa + bs2*ai[i];
572: vi = aj + ai[i];
573: nz = ai[i+1] - ai[i];
574: idt = bs*i;
575: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt];
576: for (j=0; j<nz; j++) {
577: idx = bs*vi[j];
578: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
579: x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
580: x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
581: x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
582: x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
583: v += bs2;
584: }
585: }
586: VecRestoreArray(xx,&x);
587: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
588: return(0);
589: }
593: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
594: {
595: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
596: PetscErrorCode ierr;
597: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
598: PetscInt i,nz,idx,idt,oidx;
599: const MatScalar *aa=a->a,*v;
600: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
603: VecCopy(bb,xx);
604: VecGetArray(xx,&x);
606: /* forward solve the U^T */
607: idx = 0;
608: for (i=0; i<n; i++) {
610: v = aa + 36*diag[i];
611: /* multiply by the inverse of the block diagonal */
612: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
613: x6 = x[5+idx];
614: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
615: s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
616: s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
617: s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
618: s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
619: s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
620: v += 36;
622: vi = aj + diag[i] + 1;
623: nz = ai[i+1] - diag[i] - 1;
624: while (nz--) {
625: oidx = 6*(*vi++);
626: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
627: x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
628: x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
629: x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
630: x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
631: x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
632: v += 36;
633: }
634: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
635: x[5+idx] = s6;
636: idx += 6;
637: }
638: /* backward solve the L^T */
639: for (i=n-1; i>=0; i--) {
640: v = aa + 36*diag[i] - 36;
641: vi = aj + diag[i] - 1;
642: nz = diag[i] - ai[i];
643: idt = 6*i;
644: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
645: s6 = x[5+idt];
646: while (nz--) {
647: idx = 6*(*vi--);
648: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
649: x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
650: x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
651: x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
652: x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
653: x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
654: v -= 36;
655: }
656: }
657: VecRestoreArray(xx,&x);
658: PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
659: return(0);
660: }
664: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
665: {
666: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
667: PetscErrorCode ierr;
668: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
669: PetscInt nz,idx,idt,j,i,oidx;
670: const PetscInt bs =A->rmap->bs,bs2=a->bs2;
671: const MatScalar *aa=a->a,*v;
672: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
675: VecCopy(bb,xx);
676: VecGetArray(xx,&x);
678: /* forward solve the U^T */
679: idx = 0;
680: for (i=0; i<n; i++) {
681: v = aa + bs2*diag[i];
682: /* multiply by the inverse of the block diagonal */
683: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
684: x5 = x[4+idx]; x6 = x[5+idx];
685: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
686: s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
687: s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
688: s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
689: s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
690: s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
691: v -= bs2;
693: vi = aj + diag[i] - 1;
694: nz = diag[i] - diag[i+1] - 1;
695: for (j=0; j>-nz; j--) {
696: oidx = bs*vi[j];
697: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
698: x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
699: x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
700: x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
701: x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
702: x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
703: v -= bs2;
704: }
705: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5;
706: x[5+idx] = s6;
707: idx += bs;
708: }
709: /* backward solve the L^T */
710: for (i=n-1; i>=0; i--) {
711: v = aa + bs2*ai[i];
712: vi = aj + ai[i];
713: nz = ai[i+1] - ai[i];
714: idt = bs*i;
715: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt];
716: s6 = x[5+idt];
717: for (j=0; j<nz; j++) {
718: idx = bs*vi[j];
719: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
720: x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
721: x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
722: x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
723: x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
724: x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
725: v += bs2;
726: }
727: }
728: VecRestoreArray(xx,&x);
729: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
730: return(0);
731: }
735: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
736: {
737: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
738: PetscErrorCode ierr;
739: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
740: PetscInt i,nz,idx,idt,oidx;
741: const MatScalar *aa=a->a,*v;
742: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
745: VecCopy(bb,xx);
746: VecGetArray(xx,&x);
748: /* forward solve the U^T */
749: idx = 0;
750: for (i=0; i<n; i++) {
752: v = aa + 49*diag[i];
753: /* multiply by the inverse of the block diagonal */
754: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
755: x6 = x[5+idx]; x7 = x[6+idx];
756: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
757: s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
758: s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
759: s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
760: s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
761: s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
762: s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
763: v += 49;
765: vi = aj + diag[i] + 1;
766: nz = ai[i+1] - diag[i] - 1;
767: while (nz--) {
768: oidx = 7*(*vi++);
769: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
770: x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
771: x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
772: x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
773: x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
774: x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
775: x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
776: v += 49;
777: }
778: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
779: x[5+idx] = s6;x[6+idx] = s7;
780: idx += 7;
781: }
782: /* backward solve the L^T */
783: for (i=n-1; i>=0; i--) {
784: v = aa + 49*diag[i] - 49;
785: vi = aj + diag[i] - 1;
786: nz = diag[i] - ai[i];
787: idt = 7*i;
788: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
789: s6 = x[5+idt];s7 = x[6+idt];
790: while (nz--) {
791: idx = 7*(*vi--);
792: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
793: x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
794: x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
795: x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
796: x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
797: x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
798: x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
799: v -= 49;
800: }
801: }
802: VecRestoreArray(xx,&x);
803: PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
804: return(0);
805: }
808: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
809: {
810: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
811: PetscErrorCode ierr;
812: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
813: PetscInt nz,idx,idt,j,i,oidx;
814: const PetscInt bs =A->rmap->bs,bs2=a->bs2;
815: const MatScalar *aa=a->a,*v;
816: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
819: VecCopy(bb,xx);
820: VecGetArray(xx,&x);
822: /* forward solve the U^T */
823: idx = 0;
824: for (i=0; i<n; i++) {
825: v = aa + bs2*diag[i];
826: /* multiply by the inverse of the block diagonal */
827: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
828: x5 = x[4+idx]; x6 = x[5+idx]; x7 = x[6+idx];
829: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
830: s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
831: s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
832: s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
833: s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
834: s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
835: s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
836: v -= bs2;
837: vi = aj + diag[i] - 1;
838: nz = diag[i] - diag[i+1] - 1;
839: for (j=0; j>-nz; j--) {
840: oidx = bs*vi[j];
841: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
842: x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
843: x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
844: x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
845: x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
846: x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
847: x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
848: v -= bs2;
849: }
850: x[idx] = s1; x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5;
851: x[5+idx] = s6; x[6+idx] = s7;
852: idx += bs;
853: }
854: /* backward solve the L^T */
855: for (i=n-1; i>=0; i--) {
856: v = aa + bs2*ai[i];
857: vi = aj + ai[i];
858: nz = ai[i+1] - ai[i];
859: idt = bs*i;
860: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt];
861: s6 = x[5+idt]; s7 = x[6+idt];
862: for (j=0; j<nz; j++) {
863: idx = bs*vi[j];
864: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
865: x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
866: x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
867: x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
868: x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
869: x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
870: x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
871: v += bs2;
872: }
873: }
874: VecRestoreArray(xx,&x);
875: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
876: return(0);
877: }
879: /*---------------------------------------------------------------------------------------------*/
882: PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
883: {
884: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
885: IS iscol = a->col,isrow = a->row;
886: PetscErrorCode ierr;
887: const PetscInt *rout,*cout,*r,*c,*adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
888: PetscInt i,n = a->mbs,j;
889: PetscInt nz;
890: PetscScalar *x,*tmp,s1;
891: const MatScalar *aa = a->a,*v;
892: const PetscScalar *b;
895: VecGetArrayRead(bb,&b);
896: VecGetArray(xx,&x);
897: tmp = a->solve_work;
899: ISGetIndices(isrow,&rout); r = rout;
900: ISGetIndices(iscol,&cout); c = cout;
902: /* copy the b into temp work space according to permutation */
903: for (i=0; i<n; i++) tmp[i] = b[c[i]];
905: /* forward solve the U^T */
906: for (i=0; i<n; i++) {
907: v = aa + adiag[i+1] + 1;
908: vi = aj + adiag[i+1] + 1;
909: nz = adiag[i] - adiag[i+1] - 1;
910: s1 = tmp[i];
911: s1 *= v[nz]; /* multiply by inverse of diagonal entry */
912: for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
913: tmp[i] = s1;
914: }
916: /* backward solve the L^T */
917: for (i=n-1; i>=0; i--) {
918: v = aa + ai[i];
919: vi = aj + ai[i];
920: nz = ai[i+1] - ai[i];
921: s1 = tmp[i];
922: for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
923: }
925: /* copy tmp into x according to permutation */
926: for (i=0; i<n; i++) x[r[i]] = tmp[i];
928: ISRestoreIndices(isrow,&rout);
929: ISRestoreIndices(iscol,&cout);
930: VecRestoreArrayRead(bb,&b);
931: VecRestoreArray(xx,&x);
933: PetscLogFlops(2.0*a->nz-A->cmap->n);
934: return(0);
935: }
939: PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
940: {
941: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
942: IS iscol=a->col,isrow=a->row;
943: PetscErrorCode ierr;
944: const PetscInt *r,*c,*rout,*cout;
945: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
946: PetscInt i,nz;
947: const MatScalar *aa=a->a,*v;
948: PetscScalar s1,*x,*t;
949: const PetscScalar *b;
952: VecGetArrayRead(bb,&b);
953: VecGetArray(xx,&x);
954: t = a->solve_work;
956: ISGetIndices(isrow,&rout); r = rout;
957: ISGetIndices(iscol,&cout); c = cout;
959: /* copy the b into temp work space according to permutation */
960: for (i=0; i<n; i++) t[i] = b[c[i]];
962: /* forward solve the U^T */
963: for (i=0; i<n; i++) {
965: v = aa + diag[i];
966: /* multiply by the inverse of the block diagonal */
967: s1 = (*v++)*t[i];
968: vi = aj + diag[i] + 1;
969: nz = ai[i+1] - diag[i] - 1;
970: while (nz--) {
971: t[*vi++] -= (*v++)*s1;
972: }
973: t[i] = s1;
974: }
975: /* backward solve the L^T */
976: for (i=n-1; i>=0; i--) {
977: v = aa + diag[i] - 1;
978: vi = aj + diag[i] - 1;
979: nz = diag[i] - ai[i];
980: s1 = t[i];
981: while (nz--) {
982: t[*vi--] -= (*v--)*s1;
983: }
984: }
986: /* copy t into x according to permutation */
987: for (i=0; i<n; i++) x[r[i]] = t[i];
989: ISRestoreIndices(isrow,&rout);
990: ISRestoreIndices(iscol,&cout);
991: VecRestoreArrayRead(bb,&b);
992: VecRestoreArray(xx,&x);
993: PetscLogFlops(2.0*(a->nz) - A->cmap->n);
994: return(0);
995: }
999: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1000: {
1001: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
1002: IS iscol=a->col,isrow=a->row;
1003: PetscErrorCode ierr;
1004: const PetscInt *r,*c,*rout,*cout;
1005: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1006: PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
1007: const MatScalar *aa=a->a,*v;
1008: PetscScalar s1,s2,x1,x2,*x,*t;
1009: const PetscScalar *b;
1012: VecGetArrayRead(bb,&b);
1013: VecGetArray(xx,&x);
1014: t = a->solve_work;
1016: ISGetIndices(isrow,&rout); r = rout;
1017: ISGetIndices(iscol,&cout); c = cout;
1019: /* copy the b into temp work space according to permutation */
1020: ii = 0;
1021: for (i=0; i<n; i++) {
1022: ic = 2*c[i];
1023: t[ii] = b[ic];
1024: t[ii+1] = b[ic+1];
1025: ii += 2;
1026: }
1028: /* forward solve the U^T */
1029: idx = 0;
1030: for (i=0; i<n; i++) {
1032: v = aa + 4*diag[i];
1033: /* multiply by the inverse of the block diagonal */
1034: x1 = t[idx]; x2 = t[1+idx];
1035: s1 = v[0]*x1 + v[1]*x2;
1036: s2 = v[2]*x1 + v[3]*x2;
1037: v += 4;
1039: vi = aj + diag[i] + 1;
1040: nz = ai[i+1] - diag[i] - 1;
1041: while (nz--) {
1042: oidx = 2*(*vi++);
1043: t[oidx] -= v[0]*s1 + v[1]*s2;
1044: t[oidx+1] -= v[2]*s1 + v[3]*s2;
1045: v += 4;
1046: }
1047: t[idx] = s1;t[1+idx] = s2;
1048: idx += 2;
1049: }
1050: /* backward solve the L^T */
1051: for (i=n-1; i>=0; i--) {
1052: v = aa + 4*diag[i] - 4;
1053: vi = aj + diag[i] - 1;
1054: nz = diag[i] - ai[i];
1055: idt = 2*i;
1056: s1 = t[idt]; s2 = t[1+idt];
1057: while (nz--) {
1058: idx = 2*(*vi--);
1059: t[idx] -= v[0]*s1 + v[1]*s2;
1060: t[idx+1] -= v[2]*s1 + v[3]*s2;
1061: v -= 4;
1062: }
1063: }
1065: /* copy t into x according to permutation */
1066: ii = 0;
1067: for (i=0; i<n; i++) {
1068: ir = 2*r[i];
1069: x[ir] = t[ii];
1070: x[ir+1] = t[ii+1];
1071: ii += 2;
1072: }
1074: ISRestoreIndices(isrow,&rout);
1075: ISRestoreIndices(iscol,&cout);
1076: VecRestoreArrayRead(bb,&b);
1077: VecRestoreArray(xx,&x);
1078: PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
1079: return(0);
1080: }
1084: PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
1085: {
1086: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
1087: PetscErrorCode ierr;
1088: IS iscol=a->col,isrow=a->row;
1089: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1090: const PetscInt *r,*c,*rout,*cout;
1091: PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
1092: const PetscInt bs =A->rmap->bs,bs2=a->bs2;
1093: const MatScalar *aa=a->a,*v;
1094: PetscScalar s1,s2,x1,x2,*x,*t;
1095: const PetscScalar *b;
1098: VecGetArrayRead(bb,&b);
1099: VecGetArray(xx,&x);
1100: t = a->solve_work;
1102: ISGetIndices(isrow,&rout); r = rout;
1103: ISGetIndices(iscol,&cout); c = cout;
1105: /* copy b into temp work space according to permutation */
1106: for (i=0; i<n; i++) {
1107: ii = bs*i; ic = bs*c[i];
1108: t[ii] = b[ic]; t[ii+1] = b[ic+1];
1109: }
1111: /* forward solve the U^T */
1112: idx = 0;
1113: for (i=0; i<n; i++) {
1114: v = aa + bs2*diag[i];
1115: /* multiply by the inverse of the block diagonal */
1116: x1 = t[idx]; x2 = t[1+idx];
1117: s1 = v[0]*x1 + v[1]*x2;
1118: s2 = v[2]*x1 + v[3]*x2;
1119: v -= bs2;
1121: vi = aj + diag[i] - 1;
1122: nz = diag[i] - diag[i+1] - 1;
1123: for (j=0; j>-nz; j--) {
1124: oidx = bs*vi[j];
1125: t[oidx] -= v[0]*s1 + v[1]*s2;
1126: t[oidx+1] -= v[2]*s1 + v[3]*s2;
1127: v -= bs2;
1128: }
1129: t[idx] = s1;t[1+idx] = s2;
1130: idx += bs;
1131: }
1132: /* backward solve the L^T */
1133: for (i=n-1; i>=0; i--) {
1134: v = aa + bs2*ai[i];
1135: vi = aj + ai[i];
1136: nz = ai[i+1] - ai[i];
1137: idt = bs*i;
1138: s1 = t[idt]; s2 = t[1+idt];
1139: for (j=0; j<nz; j++) {
1140: idx = bs*vi[j];
1141: t[idx] -= v[0]*s1 + v[1]*s2;
1142: t[idx+1] -= v[2]*s1 + v[3]*s2;
1143: v += bs2;
1144: }
1145: }
1147: /* copy t into x according to permutation */
1148: for (i=0; i<n; i++) {
1149: ii = bs*i; ir = bs*r[i];
1150: x[ir] = t[ii]; x[ir+1] = t[ii+1];
1151: }
1153: ISRestoreIndices(isrow,&rout);
1154: ISRestoreIndices(iscol,&cout);
1155: VecRestoreArrayRead(bb,&b);
1156: VecRestoreArray(xx,&x);
1157: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1158: return(0);
1159: }
1163: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1164: {
1165: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
1166: IS iscol=a->col,isrow=a->row;
1167: PetscErrorCode ierr;
1168: const PetscInt *r,*c,*rout,*cout;
1169: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1170: PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
1171: const MatScalar *aa=a->a,*v;
1172: PetscScalar s1,s2,s3,x1,x2,x3,*x,*t;
1173: const PetscScalar *b;
1176: VecGetArrayRead(bb,&b);
1177: VecGetArray(xx,&x);
1178: t = a->solve_work;
1180: ISGetIndices(isrow,&rout); r = rout;
1181: ISGetIndices(iscol,&cout); c = cout;
1183: /* copy the b into temp work space according to permutation */
1184: ii = 0;
1185: for (i=0; i<n; i++) {
1186: ic = 3*c[i];
1187: t[ii] = b[ic];
1188: t[ii+1] = b[ic+1];
1189: t[ii+2] = b[ic+2];
1190: ii += 3;
1191: }
1193: /* forward solve the U^T */
1194: idx = 0;
1195: for (i=0; i<n; i++) {
1197: v = aa + 9*diag[i];
1198: /* multiply by the inverse of the block diagonal */
1199: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1200: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
1201: s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
1202: s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
1203: v += 9;
1205: vi = aj + diag[i] + 1;
1206: nz = ai[i+1] - diag[i] - 1;
1207: while (nz--) {
1208: oidx = 3*(*vi++);
1209: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
1210: t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
1211: t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1212: v += 9;
1213: }
1214: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;
1215: idx += 3;
1216: }
1217: /* backward solve the L^T */
1218: for (i=n-1; i>=0; i--) {
1219: v = aa + 9*diag[i] - 9;
1220: vi = aj + diag[i] - 1;
1221: nz = diag[i] - ai[i];
1222: idt = 3*i;
1223: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
1224: while (nz--) {
1225: idx = 3*(*vi--);
1226: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
1227: t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
1228: t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1229: v -= 9;
1230: }
1231: }
1233: /* copy t into x according to permutation */
1234: ii = 0;
1235: for (i=0; i<n; i++) {
1236: ir = 3*r[i];
1237: x[ir] = t[ii];
1238: x[ir+1] = t[ii+1];
1239: x[ir+2] = t[ii+2];
1240: ii += 3;
1241: }
1243: ISRestoreIndices(isrow,&rout);
1244: ISRestoreIndices(iscol,&cout);
1245: VecRestoreArrayRead(bb,&b);
1246: VecRestoreArray(xx,&x);
1247: PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
1248: return(0);
1249: }
1253: PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1254: {
1255: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
1256: PetscErrorCode ierr;
1257: IS iscol=a->col,isrow=a->row;
1258: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1259: const PetscInt *r,*c,*rout,*cout;
1260: PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
1261: const PetscInt bs =A->rmap->bs,bs2=a->bs2;
1262: const MatScalar *aa=a->a,*v;
1263: PetscScalar s1,s2,s3,x1,x2,x3,*x,*t;
1264: const PetscScalar *b;
1267: VecGetArrayRead(bb,&b);
1268: VecGetArray(xx,&x);
1269: t = a->solve_work;
1271: ISGetIndices(isrow,&rout); r = rout;
1272: ISGetIndices(iscol,&cout); c = cout;
1274: /* copy b into temp work space according to permutation */
1275: for (i=0; i<n; i++) {
1276: ii = bs*i; ic = bs*c[i];
1277: t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2];
1278: }
1280: /* forward solve the U^T */
1281: idx = 0;
1282: for (i=0; i<n; i++) {
1283: v = aa + bs2*diag[i];
1284: /* multiply by the inverse of the block diagonal */
1285: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1286: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
1287: s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
1288: s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
1289: v -= bs2;
1291: vi = aj + diag[i] - 1;
1292: nz = diag[i] - diag[i+1] - 1;
1293: for (j=0; j>-nz; j--) {
1294: oidx = bs*vi[j];
1295: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
1296: t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
1297: t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1298: v -= bs2;
1299: }
1300: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;
1301: idx += bs;
1302: }
1303: /* backward solve the L^T */
1304: for (i=n-1; i>=0; i--) {
1305: v = aa + bs2*ai[i];
1306: vi = aj + ai[i];
1307: nz = ai[i+1] - ai[i];
1308: idt = bs*i;
1309: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
1310: for (j=0; j<nz; j++) {
1311: idx = bs*vi[j];
1312: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
1313: t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
1314: t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1315: v += bs2;
1316: }
1317: }
1319: /* copy t into x according to permutation */
1320: for (i=0; i<n; i++) {
1321: ii = bs*i; ir = bs*r[i];
1322: x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];
1323: }
1325: ISRestoreIndices(isrow,&rout);
1326: ISRestoreIndices(iscol,&cout);
1327: VecRestoreArrayRead(bb,&b);
1328: VecRestoreArray(xx,&x);
1329: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1330: return(0);
1331: }
1335: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
1336: {
1337: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
1338: IS iscol=a->col,isrow=a->row;
1339: PetscErrorCode ierr;
1340: const PetscInt *r,*c,*rout,*cout;
1341: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1342: PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
1343: const MatScalar *aa=a->a,*v;
1344: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1345: const PetscScalar *b;
1348: VecGetArrayRead(bb,&b);
1349: VecGetArray(xx,&x);
1350: t = a->solve_work;
1352: ISGetIndices(isrow,&rout); r = rout;
1353: ISGetIndices(iscol,&cout); c = cout;
1355: /* copy the b into temp work space according to permutation */
1356: ii = 0;
1357: for (i=0; i<n; i++) {
1358: ic = 4*c[i];
1359: t[ii] = b[ic];
1360: t[ii+1] = b[ic+1];
1361: t[ii+2] = b[ic+2];
1362: t[ii+3] = b[ic+3];
1363: ii += 4;
1364: }
1366: /* forward solve the U^T */
1367: idx = 0;
1368: for (i=0; i<n; i++) {
1370: v = aa + 16*diag[i];
1371: /* multiply by the inverse of the block diagonal */
1372: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx];
1373: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
1374: s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
1375: s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
1376: s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1377: v += 16;
1379: vi = aj + diag[i] + 1;
1380: nz = ai[i+1] - diag[i] - 1;
1381: while (nz--) {
1382: oidx = 4*(*vi++);
1383: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
1384: t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
1385: t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1386: t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1387: v += 16;
1388: }
1389: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
1390: idx += 4;
1391: }
1392: /* backward solve the L^T */
1393: for (i=n-1; i>=0; i--) {
1394: v = aa + 16*diag[i] - 16;
1395: vi = aj + diag[i] - 1;
1396: nz = diag[i] - ai[i];
1397: idt = 4*i;
1398: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
1399: while (nz--) {
1400: idx = 4*(*vi--);
1401: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
1402: t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
1403: t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1404: t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1405: v -= 16;
1406: }
1407: }
1409: /* copy t into x according to permutation */
1410: ii = 0;
1411: for (i=0; i<n; i++) {
1412: ir = 4*r[i];
1413: x[ir] = t[ii];
1414: x[ir+1] = t[ii+1];
1415: x[ir+2] = t[ii+2];
1416: x[ir+3] = t[ii+3];
1417: ii += 4;
1418: }
1420: ISRestoreIndices(isrow,&rout);
1421: ISRestoreIndices(iscol,&cout);
1422: VecRestoreArrayRead(bb,&b);
1423: VecRestoreArray(xx,&x);
1424: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
1425: return(0);
1426: }
1430: PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1431: {
1432: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
1433: PetscErrorCode ierr;
1434: IS iscol=a->col,isrow=a->row;
1435: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1436: const PetscInt *r,*c,*rout,*cout;
1437: PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
1438: const PetscInt bs =A->rmap->bs,bs2=a->bs2;
1439: const MatScalar *aa=a->a,*v;
1440: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1441: const PetscScalar *b;
1444: VecGetArrayRead(bb,&b);
1445: VecGetArray(xx,&x);
1446: t = a->solve_work;
1448: ISGetIndices(isrow,&rout); r = rout;
1449: ISGetIndices(iscol,&cout); c = cout;
1451: /* copy b into temp work space according to permutation */
1452: for (i=0; i<n; i++) {
1453: ii = bs*i; ic = bs*c[i];
1454: t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1455: }
1457: /* forward solve the U^T */
1458: idx = 0;
1459: for (i=0; i<n; i++) {
1460: v = aa + bs2*diag[i];
1461: /* multiply by the inverse of the block diagonal */
1462: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx];
1463: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
1464: s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
1465: s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
1466: s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1467: v -= bs2;
1469: vi = aj + diag[i] - 1;
1470: nz = diag[i] - diag[i+1] - 1;
1471: for (j=0; j>-nz; j--) {
1472: oidx = bs*vi[j];
1473: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
1474: t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
1475: t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1476: t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1477: v -= bs2;
1478: }
1479: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4;
1480: idx += bs;
1481: }
1482: /* backward solve the L^T */
1483: for (i=n-1; i>=0; i--) {
1484: v = aa + bs2*ai[i];
1485: vi = aj + ai[i];
1486: nz = ai[i+1] - ai[i];
1487: idt = bs*i;
1488: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt];
1489: for (j=0; j<nz; j++) {
1490: idx = bs*vi[j];
1491: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
1492: t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
1493: t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1494: t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1495: v += bs2;
1496: }
1497: }
1499: /* copy t into x according to permutation */
1500: for (i=0; i<n; i++) {
1501: ii = bs*i; ir = bs*r[i];
1502: x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3];
1503: }
1505: ISRestoreIndices(isrow,&rout);
1506: ISRestoreIndices(iscol,&cout);
1507: VecRestoreArrayRead(bb,&b);
1508: VecRestoreArray(xx,&x);
1509: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1510: return(0);
1511: }
1515: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
1516: {
1517: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
1518: IS iscol=a->col,isrow=a->row;
1519: PetscErrorCode ierr;
1520: const PetscInt *r,*c,*rout,*cout;
1521: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1522: PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
1523: const MatScalar *aa=a->a,*v;
1524: PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1525: const PetscScalar *b;
1528: VecGetArrayRead(bb,&b);
1529: VecGetArray(xx,&x);
1530: t = a->solve_work;
1532: ISGetIndices(isrow,&rout); r = rout;
1533: ISGetIndices(iscol,&cout); c = cout;
1535: /* copy the b into temp work space according to permutation */
1536: ii = 0;
1537: for (i=0; i<n; i++) {
1538: ic = 5*c[i];
1539: t[ii] = b[ic];
1540: t[ii+1] = b[ic+1];
1541: t[ii+2] = b[ic+2];
1542: t[ii+3] = b[ic+3];
1543: t[ii+4] = b[ic+4];
1544: ii += 5;
1545: }
1547: /* forward solve the U^T */
1548: idx = 0;
1549: for (i=0; i<n; i++) {
1551: v = aa + 25*diag[i];
1552: /* multiply by the inverse of the block diagonal */
1553: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1554: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
1555: s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
1556: s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1557: s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1558: s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1559: v += 25;
1561: vi = aj + diag[i] + 1;
1562: nz = ai[i+1] - diag[i] - 1;
1563: while (nz--) {
1564: oidx = 5*(*vi++);
1565: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
1566: t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
1567: t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1568: t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1569: t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1570: v += 25;
1571: }
1572: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1573: idx += 5;
1574: }
1575: /* backward solve the L^T */
1576: for (i=n-1; i>=0; i--) {
1577: v = aa + 25*diag[i] - 25;
1578: vi = aj + diag[i] - 1;
1579: nz = diag[i] - ai[i];
1580: idt = 5*i;
1581: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1582: while (nz--) {
1583: idx = 5*(*vi--);
1584: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
1585: t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
1586: t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1587: t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1588: t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1589: v -= 25;
1590: }
1591: }
1593: /* copy t into x according to permutation */
1594: ii = 0;
1595: for (i=0; i<n; i++) {
1596: ir = 5*r[i];
1597: x[ir] = t[ii];
1598: x[ir+1] = t[ii+1];
1599: x[ir+2] = t[ii+2];
1600: x[ir+3] = t[ii+3];
1601: x[ir+4] = t[ii+4];
1602: ii += 5;
1603: }
1605: ISRestoreIndices(isrow,&rout);
1606: ISRestoreIndices(iscol,&cout);
1607: VecRestoreArrayRead(bb,&b);
1608: VecRestoreArray(xx,&x);
1609: PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
1610: return(0);
1611: }
1615: PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1616: {
1617: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
1618: PetscErrorCode ierr;
1619: IS iscol=a->col,isrow=a->row;
1620: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1621: const PetscInt *r,*c,*rout,*cout;
1622: PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
1623: const PetscInt bs =A->rmap->bs,bs2=a->bs2;
1624: const MatScalar *aa=a->a,*v;
1625: PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1626: const PetscScalar *b;
1629: VecGetArrayRead(bb,&b);
1630: VecGetArray(xx,&x);
1631: t = a->solve_work;
1633: ISGetIndices(isrow,&rout); r = rout;
1634: ISGetIndices(iscol,&cout); c = cout;
1636: /* copy b into temp work space according to permutation */
1637: for (i=0; i<n; i++) {
1638: ii = bs*i; ic = bs*c[i];
1639: t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1640: t[ii+4] = b[ic+4];
1641: }
1643: /* forward solve the U^T */
1644: idx = 0;
1645: for (i=0; i<n; i++) {
1646: v = aa + bs2*diag[i];
1647: /* multiply by the inverse of the block diagonal */
1648: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1649: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
1650: s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
1651: s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1652: s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1653: s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1654: v -= bs2;
1656: vi = aj + diag[i] - 1;
1657: nz = diag[i] - diag[i+1] - 1;
1658: for (j=0; j>-nz; j--) {
1659: oidx = bs*vi[j];
1660: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
1661: t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
1662: t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1663: t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1664: t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1665: v -= bs2;
1666: }
1667: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5;
1668: idx += bs;
1669: }
1670: /* backward solve the L^T */
1671: for (i=n-1; i>=0; i--) {
1672: v = aa + bs2*ai[i];
1673: vi = aj + ai[i];
1674: nz = ai[i+1] - ai[i];
1675: idt = bs*i;
1676: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt];
1677: for (j=0; j<nz; j++) {
1678: idx = bs*vi[j];
1679: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
1680: t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
1681: t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1682: t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1683: t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1684: v += bs2;
1685: }
1686: }
1688: /* copy t into x according to permutation */
1689: for (i=0; i<n; i++) {
1690: ii = bs*i; ir = bs*r[i];
1691: x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3];
1692: x[ir+4] = t[ii+4];
1693: }
1695: ISRestoreIndices(isrow,&rout);
1696: ISRestoreIndices(iscol,&cout);
1697: VecRestoreArrayRead(bb,&b);
1698: VecRestoreArray(xx,&x);
1699: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1700: return(0);
1701: }
1705: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
1706: {
1707: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
1708: IS iscol=a->col,isrow=a->row;
1709: PetscErrorCode ierr;
1710: const PetscInt *r,*c,*rout,*cout;
1711: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1712: PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
1713: const MatScalar *aa=a->a,*v;
1714: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1715: const PetscScalar *b;
1718: VecGetArrayRead(bb,&b);
1719: VecGetArray(xx,&x);
1720: t = a->solve_work;
1722: ISGetIndices(isrow,&rout); r = rout;
1723: ISGetIndices(iscol,&cout); c = cout;
1725: /* copy the b into temp work space according to permutation */
1726: ii = 0;
1727: for (i=0; i<n; i++) {
1728: ic = 6*c[i];
1729: t[ii] = b[ic];
1730: t[ii+1] = b[ic+1];
1731: t[ii+2] = b[ic+2];
1732: t[ii+3] = b[ic+3];
1733: t[ii+4] = b[ic+4];
1734: t[ii+5] = b[ic+5];
1735: ii += 6;
1736: }
1738: /* forward solve the U^T */
1739: idx = 0;
1740: for (i=0; i<n; i++) {
1742: v = aa + 36*diag[i];
1743: /* multiply by the inverse of the block diagonal */
1744: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1745: x6 = t[5+idx];
1746: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
1747: s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
1748: s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1749: s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1750: s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1751: s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1752: v += 36;
1754: vi = aj + diag[i] + 1;
1755: nz = ai[i+1] - diag[i] - 1;
1756: while (nz--) {
1757: oidx = 6*(*vi++);
1758: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
1759: t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
1760: t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1761: t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1762: t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1763: t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1764: v += 36;
1765: }
1766: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1767: t[5+idx] = s6;
1768: idx += 6;
1769: }
1770: /* backward solve the L^T */
1771: for (i=n-1; i>=0; i--) {
1772: v = aa + 36*diag[i] - 36;
1773: vi = aj + diag[i] - 1;
1774: nz = diag[i] - ai[i];
1775: idt = 6*i;
1776: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1777: s6 = t[5+idt];
1778: while (nz--) {
1779: idx = 6*(*vi--);
1780: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
1781: t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
1782: t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1783: t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1784: t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1785: t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1786: v -= 36;
1787: }
1788: }
1790: /* copy t into x according to permutation */
1791: ii = 0;
1792: for (i=0; i<n; i++) {
1793: ir = 6*r[i];
1794: x[ir] = t[ii];
1795: x[ir+1] = t[ii+1];
1796: x[ir+2] = t[ii+2];
1797: x[ir+3] = t[ii+3];
1798: x[ir+4] = t[ii+4];
1799: x[ir+5] = t[ii+5];
1800: ii += 6;
1801: }
1803: ISRestoreIndices(isrow,&rout);
1804: ISRestoreIndices(iscol,&cout);
1805: VecRestoreArrayRead(bb,&b);
1806: VecRestoreArray(xx,&x);
1807: PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
1808: return(0);
1809: }
1813: PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1814: {
1815: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
1816: PetscErrorCode ierr;
1817: IS iscol=a->col,isrow=a->row;
1818: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1819: const PetscInt *r,*c,*rout,*cout;
1820: PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
1821: const PetscInt bs =A->rmap->bs,bs2=a->bs2;
1822: const MatScalar *aa=a->a,*v;
1823: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1824: const PetscScalar *b;
1827: VecGetArrayRead(bb,&b);
1828: VecGetArray(xx,&x);
1829: t = a->solve_work;
1831: ISGetIndices(isrow,&rout); r = rout;
1832: ISGetIndices(iscol,&cout); c = cout;
1834: /* copy b into temp work space according to permutation */
1835: for (i=0; i<n; i++) {
1836: ii = bs*i; ic = bs*c[i];
1837: t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1838: t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5];
1839: }
1841: /* forward solve the U^T */
1842: idx = 0;
1843: for (i=0; i<n; i++) {
1844: v = aa + bs2*diag[i];
1845: /* multiply by the inverse of the block diagonal */
1846: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1847: x6 = t[5+idx];
1848: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
1849: s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
1850: s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1851: s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1852: s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1853: s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1854: v -= bs2;
1856: vi = aj + diag[i] - 1;
1857: nz = diag[i] - diag[i+1] - 1;
1858: for (j=0; j>-nz; j--) {
1859: oidx = bs*vi[j];
1860: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
1861: t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
1862: t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1863: t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1864: t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1865: t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1866: v -= bs2;
1867: }
1868: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5;
1869: t[5+idx] = s6;
1870: idx += bs;
1871: }
1872: /* backward solve the L^T */
1873: for (i=n-1; i>=0; i--) {
1874: v = aa + bs2*ai[i];
1875: vi = aj + ai[i];
1876: nz = ai[i+1] - ai[i];
1877: idt = bs*i;
1878: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt];
1879: s6 = t[5+idt];
1880: for (j=0; j<nz; j++) {
1881: idx = bs*vi[j];
1882: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
1883: t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
1884: t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1885: t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1886: t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1887: t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1888: v += bs2;
1889: }
1890: }
1892: /* copy t into x according to permutation */
1893: for (i=0; i<n; i++) {
1894: ii = bs*i; ir = bs*r[i];
1895: x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3];
1896: x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5];
1897: }
1899: ISRestoreIndices(isrow,&rout);
1900: ISRestoreIndices(iscol,&cout);
1901: VecRestoreArrayRead(bb,&b);
1902: VecRestoreArray(xx,&x);
1903: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1904: return(0);
1905: }
1909: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
1910: {
1911: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
1912: IS iscol=a->col,isrow=a->row;
1913: PetscErrorCode ierr;
1914: const PetscInt *r,*c,*rout,*cout;
1915: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1916: PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
1917: const MatScalar *aa=a->a,*v;
1918: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1919: const PetscScalar *b;
1922: VecGetArrayRead(bb,&b);
1923: VecGetArray(xx,&x);
1924: t = a->solve_work;
1926: ISGetIndices(isrow,&rout); r = rout;
1927: ISGetIndices(iscol,&cout); c = cout;
1929: /* copy the b into temp work space according to permutation */
1930: ii = 0;
1931: for (i=0; i<n; i++) {
1932: ic = 7*c[i];
1933: t[ii] = b[ic];
1934: t[ii+1] = b[ic+1];
1935: t[ii+2] = b[ic+2];
1936: t[ii+3] = b[ic+3];
1937: t[ii+4] = b[ic+4];
1938: t[ii+5] = b[ic+5];
1939: t[ii+6] = b[ic+6];
1940: ii += 7;
1941: }
1943: /* forward solve the U^T */
1944: idx = 0;
1945: for (i=0; i<n; i++) {
1947: v = aa + 49*diag[i];
1948: /* multiply by the inverse of the block diagonal */
1949: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1950: x6 = t[5+idx]; x7 = t[6+idx];
1951: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
1952: s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1953: s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1954: s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1955: s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1956: s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1957: s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1958: v += 49;
1960: vi = aj + diag[i] + 1;
1961: nz = ai[i+1] - diag[i] - 1;
1962: while (nz--) {
1963: oidx = 7*(*vi++);
1964: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
1965: t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1966: t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1967: t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1968: t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1969: t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1970: t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1971: v += 49;
1972: }
1973: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1974: t[5+idx] = s6;t[6+idx] = s7;
1975: idx += 7;
1976: }
1977: /* backward solve the L^T */
1978: for (i=n-1; i>=0; i--) {
1979: v = aa + 49*diag[i] - 49;
1980: vi = aj + diag[i] - 1;
1981: nz = diag[i] - ai[i];
1982: idt = 7*i;
1983: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1984: s6 = t[5+idt];s7 = t[6+idt];
1985: while (nz--) {
1986: idx = 7*(*vi--);
1987: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
1988: t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1989: t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1990: t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1991: t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1992: t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1993: t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1994: v -= 49;
1995: }
1996: }
1998: /* copy t into x according to permutation */
1999: ii = 0;
2000: for (i=0; i<n; i++) {
2001: ir = 7*r[i];
2002: x[ir] = t[ii];
2003: x[ir+1] = t[ii+1];
2004: x[ir+2] = t[ii+2];
2005: x[ir+3] = t[ii+3];
2006: x[ir+4] = t[ii+4];
2007: x[ir+5] = t[ii+5];
2008: x[ir+6] = t[ii+6];
2009: ii += 7;
2010: }
2012: ISRestoreIndices(isrow,&rout);
2013: ISRestoreIndices(iscol,&cout);
2014: VecRestoreArrayRead(bb,&b);
2015: VecRestoreArray(xx,&x);
2016: PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
2017: return(0);
2018: }
2021: PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
2022: {
2023: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
2024: PetscErrorCode ierr;
2025: IS iscol=a->col,isrow=a->row;
2026: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
2027: const PetscInt *r,*c,*rout,*cout;
2028: PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
2029: const PetscInt bs =A->rmap->bs,bs2=a->bs2;
2030: const MatScalar *aa=a->a,*v;
2031: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2032: const PetscScalar *b;
2035: VecGetArrayRead(bb,&b);
2036: VecGetArray(xx,&x);
2037: t = a->solve_work;
2039: ISGetIndices(isrow,&rout); r = rout;
2040: ISGetIndices(iscol,&cout); c = cout;
2042: /* copy b into temp work space according to permutation */
2043: for (i=0; i<n; i++) {
2044: ii = bs*i; ic = bs*c[i];
2045: t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
2046: t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; t[ii+6] = b[ic+6];
2047: }
2049: /* forward solve the U^T */
2050: idx = 0;
2051: for (i=0; i<n; i++) {
2052: v = aa + bs2*diag[i];
2053: /* multiply by the inverse of the block diagonal */
2054: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2055: x6 = t[5+idx]; x7 = t[6+idx];
2056: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
2057: s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
2058: s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
2059: s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
2060: s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
2061: s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
2062: s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
2063: v -= bs2;
2065: vi = aj + diag[i] - 1;
2066: nz = diag[i] - diag[i+1] - 1;
2067: for (j=0; j>-nz; j--) {
2068: oidx = bs*vi[j];
2069: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
2070: t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2071: t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2072: t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2073: t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2074: t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2075: t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2076: v -= bs2;
2077: }
2078: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5;
2079: t[5+idx] = s6; t[6+idx] = s7;
2080: idx += bs;
2081: }
2082: /* backward solve the L^T */
2083: for (i=n-1; i>=0; i--) {
2084: v = aa + bs2*ai[i];
2085: vi = aj + ai[i];
2086: nz = ai[i+1] - ai[i];
2087: idt = bs*i;
2088: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt];
2089: s6 = t[5+idt]; s7 = t[6+idt];
2090: for (j=0; j<nz; j++) {
2091: idx = bs*vi[j];
2092: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
2093: t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2094: t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2095: t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2096: t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2097: t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2098: t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2099: v += bs2;
2100: }
2101: }
2103: /* copy t into x according to permutation */
2104: for (i=0; i<n; i++) {
2105: ii = bs*i; ir = bs*r[i];
2106: x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3];
2107: x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; x[ir+6] = t[ii+6];
2108: }
2110: ISRestoreIndices(isrow,&rout);
2111: ISRestoreIndices(iscol,&cout);
2112: VecRestoreArrayRead(bb,&b);
2113: VecRestoreArray(xx,&x);
2114: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
2115: return(0);
2116: }
2118: /* ----------------------------------------------------------- */
2121: PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2122: {
2123: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
2124: IS iscol=a->col,isrow=a->row;
2125: PetscErrorCode ierr;
2126: const PetscInt *r,*c,*rout,*cout;
2127: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi;
2128: PetscInt i,nz;
2129: const PetscInt bs =A->rmap->bs,bs2=a->bs2;
2130: const MatScalar *aa=a->a,*v;
2131: PetscScalar *x,*s,*t,*ls;
2132: const PetscScalar *b;
2135: VecGetArrayRead(bb,&b);
2136: VecGetArray(xx,&x);
2137: t = a->solve_work;
2139: ISGetIndices(isrow,&rout); r = rout;
2140: ISGetIndices(iscol,&cout); c = cout + (n-1);
2142: /* forward solve the lower triangular */
2143: PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));
2144: for (i=1; i<n; i++) {
2145: v = aa + bs2*ai[i];
2146: vi = aj + ai[i];
2147: nz = a->diag[i] - ai[i];
2148: s = t + bs*i;
2149: PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));
2150: while (nz--) {
2151: PetscKernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
2152: v += bs2;
2153: }
2154: }
2155: /* backward solve the upper triangular */
2156: ls = a->solve_work + A->cmap->n;
2157: for (i=n-1; i>=0; i--) {
2158: v = aa + bs2*(a->diag[i] + 1);
2159: vi = aj + a->diag[i] + 1;
2160: nz = ai[i+1] - a->diag[i] - 1;
2161: PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
2162: while (nz--) {
2163: PetscKernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
2164: v += bs2;
2165: }
2166: PetscKernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2167: PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));
2168: }
2170: ISRestoreIndices(isrow,&rout);
2171: ISRestoreIndices(iscol,&cout);
2172: VecRestoreArrayRead(bb,&b);
2173: VecRestoreArray(xx,&x);
2174: PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
2175: return(0);
2176: }
2178: /* ----------------------------------------------------------- */
2181: PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2182: {
2183: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
2184: IS iscol=a->col,isrow=a->row;
2185: PetscErrorCode ierr;
2186: const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
2187: PetscInt i,nz,j;
2188: const PetscInt n =a->mbs,bs=A->rmap->bs,bs2=a->bs2;
2189: const MatScalar *aa=a->a,*v;
2190: PetscScalar *x,*t,*ls;
2191: const PetscScalar *b;
2194: VecGetArrayRead(bb,&b);
2195: VecGetArray(xx,&x);
2196: t = a->solve_work;
2198: ISGetIndices(isrow,&rout); r = rout;
2199: ISGetIndices(iscol,&cout); c = cout;
2201: /* copy the b into temp work space according to permutation */
2202: for (i=0; i<n; i++) {
2203: for (j=0; j<bs; j++) {
2204: t[i*bs+j] = b[c[i]*bs+j];
2205: }
2206: }
2209: /* forward solve the upper triangular transpose */
2210: ls = a->solve_work + A->cmap->n;
2211: for (i=0; i<n; i++) {
2212: PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
2213: PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2214: v = aa + bs2*(a->diag[i] + 1);
2215: vi = aj + a->diag[i] + 1;
2216: nz = ai[i+1] - a->diag[i] - 1;
2217: while (nz--) {
2218: PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2219: v += bs2;
2220: }
2221: }
2223: /* backward solve the lower triangular transpose */
2224: for (i=n-1; i>=0; i--) {
2225: v = aa + bs2*ai[i];
2226: vi = aj + ai[i];
2227: nz = a->diag[i] - ai[i];
2228: while (nz--) {
2229: PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2230: v += bs2;
2231: }
2232: }
2234: /* copy t into x according to permutation */
2235: for (i=0; i<n; i++) {
2236: for (j=0; j<bs; j++) {
2237: x[bs*r[i]+j] = t[bs*i+j];
2238: }
2239: }
2241: ISRestoreIndices(isrow,&rout);
2242: ISRestoreIndices(iscol,&cout);
2243: VecRestoreArrayRead(bb,&b);
2244: VecRestoreArray(xx,&x);
2245: PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
2246: return(0);
2247: }
2251: PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
2252: {
2253: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
2254: IS iscol=a->col,isrow=a->row;
2255: PetscErrorCode ierr;
2256: const PetscInt *r,*c,*rout,*cout;
2257: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag;
2258: PetscInt i,j,nz;
2259: const PetscInt bs =A->rmap->bs,bs2=a->bs2;
2260: const MatScalar *aa=a->a,*v;
2261: PetscScalar *x,*t,*ls;
2262: const PetscScalar *b;
2265: VecGetArrayRead(bb,&b);
2266: VecGetArray(xx,&x);
2267: t = a->solve_work;
2269: ISGetIndices(isrow,&rout); r = rout;
2270: ISGetIndices(iscol,&cout); c = cout;
2272: /* copy the b into temp work space according to permutation */
2273: for (i=0; i<n; i++) {
2274: for (j=0; j<bs; j++) {
2275: t[i*bs+j] = b[c[i]*bs+j];
2276: }
2277: }
2280: /* forward solve the upper triangular transpose */
2281: ls = a->solve_work + A->cmap->n;
2282: for (i=0; i<n; i++) {
2283: PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
2284: PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs);
2285: v = aa + bs2*(diag[i] - 1);
2286: vi = aj + diag[i] - 1;
2287: nz = diag[i] - diag[i+1] - 1;
2288: for (j=0; j>-nz; j--) {
2289: PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2290: v -= bs2;
2291: }
2292: }
2294: /* backward solve the lower triangular transpose */
2295: for (i=n-1; i>=0; i--) {
2296: v = aa + bs2*ai[i];
2297: vi = aj + ai[i];
2298: nz = ai[i+1] - ai[i];
2299: for (j=0; j<nz; j++) {
2300: PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2301: v += bs2;
2302: }
2303: }
2305: /* copy t into x according to permutation */
2306: for (i=0; i<n; i++) {
2307: for (j=0; j<bs; j++) {
2308: x[bs*r[i]+j] = t[bs*i+j];
2309: }
2310: }
2312: ISRestoreIndices(isrow,&rout);
2313: ISRestoreIndices(iscol,&cout);
2314: VecRestoreArrayRead(bb,&b);
2315: VecRestoreArray(xx,&x);
2316: PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
2317: return(0);
2318: }
2320: /* bs = 15 for PFLOTRAN. Block operations are done by accessing all the columns of the block at once */
2324: PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx)
2325: {
2326: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
2327: PetscErrorCode ierr;
2328: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
2329: PetscInt i,nz,idx,idt,m;
2330: const MatScalar *aa=a->a,*v;
2331: PetscScalar s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15;
2332: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
2333: PetscScalar *x;
2334: const PetscScalar *b;
2337: VecGetArrayRead(bb,&b);
2338: VecGetArray(xx,&x);
2340: /* forward solve the lower triangular */
2341: idx = 0;
2342: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx]; x[4] = b[4+idx];
2343: x[5] = b[5+idx]; x[6] = b[6+idx]; x[7] = b[7+idx]; x[8] = b[8+idx]; x[9] = b[9+idx];
2344: x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx];
2346: for (i=1; i<n; i++) {
2347: v = aa + bs2*ai[i];
2348: vi = aj + ai[i];
2349: nz = ai[i+1] - ai[i];
2350: idt = bs*i;
2351: s1 = b[idt]; s2 = b[1+idt]; s3 = b[2+idt]; s4 = b[3+idt]; s5 = b[4+idt];
2352: s6 = b[5+idt]; s7 = b[6+idt]; s8 = b[7+idt]; s9 = b[8+idt]; s10 = b[9+idt];
2353: s11 = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt];
2354: for (m=0; m<nz; m++) {
2355: idx = bs*vi[m];
2356: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2357: x6 = x[5+idx]; x7 = x[6+idx]; x8 = x[7+idx]; x9 = x[8+idx]; x10 = x[9+idx];
2358: x11 = x[10+idx]; x12 = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
2361: s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2362: s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2363: s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2364: s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2365: s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2366: s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2367: s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2368: s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2369: s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2370: s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2371: s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2372: s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2373: s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2374: s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2375: s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
2377: v += bs2;
2378: }
2379: x[idt] = s1; x[1+idt] = s2; x[2+idt] = s3; x[3+idt] = s4; x[4+idt] = s5;
2380: x[5+idt] = s6; x[6+idt] = s7; x[7+idt] = s8; x[8+idt] = s9; x[9+idt] = s10;
2381: x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15;
2383: }
2384: /* backward solve the upper triangular */
2385: for (i=n-1; i>=0; i--) {
2386: v = aa + bs2*(adiag[i+1]+1);
2387: vi = aj + adiag[i+1]+1;
2388: nz = adiag[i] - adiag[i+1] - 1;
2389: idt = bs*i;
2390: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt];
2391: s6 = x[5+idt]; s7 = x[6+idt]; s8 = x[7+idt]; s9 = x[8+idt]; s10 = x[9+idt];
2392: s11 = x[10+idt]; s12 = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt];
2394: for (m=0; m<nz; m++) {
2395: idx = bs*vi[m];
2396: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2397: x6 = x[5+idx]; x7 = x[6+idx]; x8 = x[7+idx]; x9 = x[8+idx]; x10 = x[9+idx];
2398: x11 = x[10+idx]; x12 = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
2400: s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2401: s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2402: s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2403: s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2404: s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2405: s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2406: s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2407: s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2408: s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2409: s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2410: s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2411: s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2412: s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2413: s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2414: s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
2416: v += bs2;
2417: }
2419: x[idt] = v[0]*s1 + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7 + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15;
2420: x[1+idt] = v[1]*s1 + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7 + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15;
2421: x[2+idt] = v[2]*s1 + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7 + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15;
2422: x[3+idt] = v[3]*s1 + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7 + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15;
2423: x[4+idt] = v[4]*s1 + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7 + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15;
2424: x[5+idt] = v[5]*s1 + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7 + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15;
2425: x[6+idt] = v[6]*s1 + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7 + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15;
2426: x[7+idt] = v[7]*s1 + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7 + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15;
2427: x[8+idt] = v[8]*s1 + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7 + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15;
2428: x[9+idt] = v[9]*s1 + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7 + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15;
2429: x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15;
2430: x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15;
2431: x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15;
2432: x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15;
2433: x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15;
2435: }
2437: VecRestoreArrayRead(bb,&b);
2438: VecRestoreArray(xx,&x);
2439: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
2440: return(0);
2441: }
2443: /* bs = 15 for PFLOTRAN. Block operations are done by accessing one column at at time */
2444: /* Default MatSolve for block size 15 */
2448: PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx)
2449: {
2450: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
2451: PetscErrorCode ierr;
2452: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
2453: PetscInt i,k,nz,idx,idt,m;
2454: const MatScalar *aa=a->a,*v;
2455: PetscScalar s[15];
2456: PetscScalar *x,xv;
2457: const PetscScalar *b;
2460: VecGetArrayRead(bb,&b);
2461: VecGetArray(xx,&x);
2463: /* forward solve the lower triangular */
2464: for (i=0; i<n; i++) {
2465: v = aa + bs2*ai[i];
2466: vi = aj + ai[i];
2467: nz = ai[i+1] - ai[i];
2468: idt = bs*i;
2469: x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt];
2470: x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt];
2471: x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; x[14+idt] = b[14+idt];
2472: for (m=0; m<nz; m++) {
2473: idx = bs*vi[m];
2474: for (k=0; k<15; k++) {
2475: xv = x[k + idx];
2476: x[idt] -= v[0]*xv;
2477: x[1+idt] -= v[1]*xv;
2478: x[2+idt] -= v[2]*xv;
2479: x[3+idt] -= v[3]*xv;
2480: x[4+idt] -= v[4]*xv;
2481: x[5+idt] -= v[5]*xv;
2482: x[6+idt] -= v[6]*xv;
2483: x[7+idt] -= v[7]*xv;
2484: x[8+idt] -= v[8]*xv;
2485: x[9+idt] -= v[9]*xv;
2486: x[10+idt] -= v[10]*xv;
2487: x[11+idt] -= v[11]*xv;
2488: x[12+idt] -= v[12]*xv;
2489: x[13+idt] -= v[13]*xv;
2490: x[14+idt] -= v[14]*xv;
2491: v += 15;
2492: }
2493: }
2494: }
2495: /* backward solve the upper triangular */
2496: for (i=n-1; i>=0; i--) {
2497: v = aa + bs2*(adiag[i+1]+1);
2498: vi = aj + adiag[i+1]+1;
2499: nz = adiag[i] - adiag[i+1] - 1;
2500: idt = bs*i;
2501: s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt];
2502: s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt];
2503: s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt];
2505: for (m=0; m<nz; m++) {
2506: idx = bs*vi[m];
2507: for (k=0; k<15; k++) {
2508: xv = x[k + idx];
2509: s[0] -= v[0]*xv;
2510: s[1] -= v[1]*xv;
2511: s[2] -= v[2]*xv;
2512: s[3] -= v[3]*xv;
2513: s[4] -= v[4]*xv;
2514: s[5] -= v[5]*xv;
2515: s[6] -= v[6]*xv;
2516: s[7] -= v[7]*xv;
2517: s[8] -= v[8]*xv;
2518: s[9] -= v[9]*xv;
2519: s[10] -= v[10]*xv;
2520: s[11] -= v[11]*xv;
2521: s[12] -= v[12]*xv;
2522: s[13] -= v[13]*xv;
2523: s[14] -= v[14]*xv;
2524: v += 15;
2525: }
2526: }
2527: PetscMemzero(x+idt,bs*sizeof(MatScalar));
2528: for (k=0; k<15; k++) {
2529: x[idt] += v[0]*s[k];
2530: x[1+idt] += v[1]*s[k];
2531: x[2+idt] += v[2]*s[k];
2532: x[3+idt] += v[3]*s[k];
2533: x[4+idt] += v[4]*s[k];
2534: x[5+idt] += v[5]*s[k];
2535: x[6+idt] += v[6]*s[k];
2536: x[7+idt] += v[7]*s[k];
2537: x[8+idt] += v[8]*s[k];
2538: x[9+idt] += v[9]*s[k];
2539: x[10+idt] += v[10]*s[k];
2540: x[11+idt] += v[11]*s[k];
2541: x[12+idt] += v[12]*s[k];
2542: x[13+idt] += v[13]*s[k];
2543: x[14+idt] += v[14]*s[k];
2544: v += 15;
2545: }
2546: }
2547: VecRestoreArrayRead(bb,&b);
2548: VecRestoreArray(xx,&x);
2549: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
2550: return(0);
2551: }
2556: PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
2557: {
2558: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
2559: IS iscol=a->col,isrow=a->row;
2560: PetscErrorCode ierr;
2561: const PetscInt *r,*c,*ai=a->i,*aj=a->j;
2562: const PetscInt *rout,*cout,*diag = a->diag,*vi,n=a->mbs;
2563: PetscInt i,nz,idx,idt,idc;
2564: const MatScalar *aa=a->a,*v;
2565: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2566: const PetscScalar *b;
2569: VecGetArrayRead(bb,&b);
2570: VecGetArray(xx,&x);
2571: t = a->solve_work;
2573: ISGetIndices(isrow,&rout); r = rout;
2574: ISGetIndices(iscol,&cout); c = cout + (n-1);
2576: /* forward solve the lower triangular */
2577: idx = 7*(*r++);
2578: t[0] = b[idx]; t[1] = b[1+idx];
2579: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2580: t[5] = b[5+idx]; t[6] = b[6+idx];
2582: for (i=1; i<n; i++) {
2583: v = aa + 49*ai[i];
2584: vi = aj + ai[i];
2585: nz = diag[i] - ai[i];
2586: idx = 7*(*r++);
2587: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2588: s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2589: while (nz--) {
2590: idx = 7*(*vi++);
2591: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
2592: x4 = t[3+idx];x5 = t[4+idx];
2593: x6 = t[5+idx];x7 = t[6+idx];
2594: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2595: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2596: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2597: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2598: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2599: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2600: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2601: v += 49;
2602: }
2603: idx = 7*i;
2604: t[idx] = s1;t[1+idx] = s2;
2605: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2606: t[5+idx] = s6;t[6+idx] = s7;
2607: }
2608: /* backward solve the upper triangular */
2609: for (i=n-1; i>=0; i--) {
2610: v = aa + 49*diag[i] + 49;
2611: vi = aj + diag[i] + 1;
2612: nz = ai[i+1] - diag[i] - 1;
2613: idt = 7*i;
2614: s1 = t[idt]; s2 = t[1+idt];
2615: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2616: s6 = t[5+idt];s7 = t[6+idt];
2617: while (nz--) {
2618: idx = 7*(*vi++);
2619: x1 = t[idx]; x2 = t[1+idx];
2620: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2621: x6 = t[5+idx]; x7 = t[6+idx];
2622: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2623: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2624: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2625: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2626: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2627: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2628: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2629: v += 49;
2630: }
2631: idc = 7*(*c--);
2632: v = aa + 49*diag[i];
2633: x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+
2634: v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2635: x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2636: v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2637: x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2638: v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2639: x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2640: v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2641: x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2642: v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2643: x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2644: v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2645: x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2646: v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2647: }
2649: ISRestoreIndices(isrow,&rout);
2650: ISRestoreIndices(iscol,&cout);
2651: VecRestoreArrayRead(bb,&b);
2652: VecRestoreArray(xx,&x);
2653: PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
2654: return(0);
2655: }
2659: PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
2660: {
2661: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
2662: IS iscol=a->col,isrow=a->row;
2663: PetscErrorCode ierr;
2664: const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag;
2665: const PetscInt n=a->mbs,*rout,*cout,*vi;
2666: PetscInt i,nz,idx,idt,idc,m;
2667: const MatScalar *aa=a->a,*v;
2668: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2669: const PetscScalar *b;
2672: VecGetArrayRead(bb,&b);
2673: VecGetArray(xx,&x);
2674: t = a->solve_work;
2676: ISGetIndices(isrow,&rout); r = rout;
2677: ISGetIndices(iscol,&cout); c = cout;
2679: /* forward solve the lower triangular */
2680: idx = 7*r[0];
2681: t[0] = b[idx]; t[1] = b[1+idx];
2682: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2683: t[5] = b[5+idx]; t[6] = b[6+idx];
2685: for (i=1; i<n; i++) {
2686: v = aa + 49*ai[i];
2687: vi = aj + ai[i];
2688: nz = ai[i+1] - ai[i];
2689: idx = 7*r[i];
2690: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2691: s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2692: for (m=0; m<nz; m++) {
2693: idx = 7*vi[m];
2694: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
2695: x4 = t[3+idx];x5 = t[4+idx];
2696: x6 = t[5+idx];x7 = t[6+idx];
2697: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2698: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2699: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2700: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2701: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2702: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2703: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2704: v += 49;
2705: }
2706: idx = 7*i;
2707: t[idx] = s1;t[1+idx] = s2;
2708: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2709: t[5+idx] = s6;t[6+idx] = s7;
2710: }
2711: /* backward solve the upper triangular */
2712: for (i=n-1; i>=0; i--) {
2713: v = aa + 49*(adiag[i+1]+1);
2714: vi = aj + adiag[i+1]+1;
2715: nz = adiag[i] - adiag[i+1] - 1;
2716: idt = 7*i;
2717: s1 = t[idt]; s2 = t[1+idt];
2718: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2719: s6 = t[5+idt];s7 = t[6+idt];
2720: for (m=0; m<nz; m++) {
2721: idx = 7*vi[m];
2722: x1 = t[idx]; x2 = t[1+idx];
2723: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2724: x6 = t[5+idx]; x7 = t[6+idx];
2725: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2726: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2727: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2728: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2729: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2730: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2731: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2732: v += 49;
2733: }
2734: idc = 7*c[i];
2735: x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+
2736: v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2737: x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2738: v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2739: x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2740: v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2741: x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2742: v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2743: x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2744: v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2745: x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2746: v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2747: x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2748: v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2749: }
2751: ISRestoreIndices(isrow,&rout);
2752: ISRestoreIndices(iscol,&cout);
2753: VecRestoreArrayRead(bb,&b);
2754: VecRestoreArray(xx,&x);
2755: PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
2756: return(0);
2757: }
2761: PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2762: {
2763: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2764: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2765: PetscErrorCode ierr;
2766: PetscInt i,nz,idx,idt,jdx;
2767: const MatScalar *aa=a->a,*v;
2768: PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2769: const PetscScalar *b;
2772: VecGetArrayRead(bb,&b);
2773: VecGetArray(xx,&x);
2774: /* forward solve the lower triangular */
2775: idx = 0;
2776: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx];
2777: x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
2778: x[6] = b[6+idx];
2779: for (i=1; i<n; i++) {
2780: v = aa + 49*ai[i];
2781: vi = aj + ai[i];
2782: nz = diag[i] - ai[i];
2783: idx = 7*i;
2784: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2785: s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2786: s7 = b[6+idx];
2787: while (nz--) {
2788: jdx = 7*(*vi++);
2789: x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx];
2790: x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
2791: x7 = x[6+jdx];
2792: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2793: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2794: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2795: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2796: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2797: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2798: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2799: v += 49;
2800: }
2801: x[idx] = s1;
2802: x[1+idx] = s2;
2803: x[2+idx] = s3;
2804: x[3+idx] = s4;
2805: x[4+idx] = s5;
2806: x[5+idx] = s6;
2807: x[6+idx] = s7;
2808: }
2809: /* backward solve the upper triangular */
2810: for (i=n-1; i>=0; i--) {
2811: v = aa + 49*diag[i] + 49;
2812: vi = aj + diag[i] + 1;
2813: nz = ai[i+1] - diag[i] - 1;
2814: idt = 7*i;
2815: s1 = x[idt]; s2 = x[1+idt];
2816: s3 = x[2+idt]; s4 = x[3+idt];
2817: s5 = x[4+idt]; s6 = x[5+idt];
2818: s7 = x[6+idt];
2819: while (nz--) {
2820: idx = 7*(*vi++);
2821: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
2822: x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
2823: x7 = x[6+idx];
2824: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2825: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2826: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2827: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2828: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2829: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2830: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2831: v += 49;
2832: }
2833: v = aa + 49*diag[i];
2834: x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4
2835: + v[28]*s5 + v[35]*s6 + v[42]*s7;
2836: x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4
2837: + v[29]*s5 + v[36]*s6 + v[43]*s7;
2838: x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4
2839: + v[30]*s5 + v[37]*s6 + v[44]*s7;
2840: x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4
2841: + v[31]*s5 + v[38]*s6 + v[45]*s7;
2842: x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4
2843: + v[32]*s5 + v[39]*s6 + v[46]*s7;
2844: x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4
2845: + v[33]*s5 + v[40]*s6 + v[47]*s7;
2846: x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4
2847: + v[34]*s5 + v[41]*s6 + v[48]*s7;
2848: }
2850: VecRestoreArrayRead(bb,&b);
2851: VecRestoreArray(xx,&x);
2852: PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
2853: return(0);
2854: }
2858: PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
2859: {
2860: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2861: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
2862: PetscErrorCode ierr;
2863: PetscInt i,k,nz,idx,jdx,idt;
2864: const PetscInt bs = A->rmap->bs,bs2 = a->bs2;
2865: const MatScalar *aa=a->a,*v;
2866: PetscScalar *x;
2867: const PetscScalar *b;
2868: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2871: VecGetArrayRead(bb,&b);
2872: VecGetArray(xx,&x);
2873: /* forward solve the lower triangular */
2874: idx = 0;
2875: x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2876: x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
2877: for (i=1; i<n; i++) {
2878: v = aa + bs2*ai[i];
2879: vi = aj + ai[i];
2880: nz = ai[i+1] - ai[i];
2881: idx = bs*i;
2882: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2883: s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2884: for (k=0; k<nz; k++) {
2885: jdx = bs*vi[k];
2886: x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2887: x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
2888: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2889: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2890: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2891: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2892: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2893: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2894: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2895: v += bs2;
2896: }
2898: x[idx] = s1;
2899: x[1+idx] = s2;
2900: x[2+idx] = s3;
2901: x[3+idx] = s4;
2902: x[4+idx] = s5;
2903: x[5+idx] = s6;
2904: x[6+idx] = s7;
2905: }
2907: /* backward solve the upper triangular */
2908: for (i=n-1; i>=0; i--) {
2909: v = aa + bs2*(adiag[i+1]+1);
2910: vi = aj + adiag[i+1]+1;
2911: nz = adiag[i] - adiag[i+1]-1;
2912: idt = bs*i;
2913: s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2914: s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
2915: for (k=0; k<nz; k++) {
2916: idx = bs*vi[k];
2917: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2918: x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
2919: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2920: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2921: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2922: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2923: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2924: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2925: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2926: v += bs2;
2927: }
2928: /* x = inv_diagonal*x */
2929: x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7;
2930: x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7;
2931: x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7;
2932: x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7;
2933: x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7;
2934: x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7;
2935: x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7;
2936: }
2938: VecRestoreArrayRead(bb,&b);
2939: VecRestoreArray(xx,&x);
2940: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
2941: return(0);
2942: }
2946: PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
2947: {
2948: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
2949: IS iscol=a->col,isrow=a->row;
2950: PetscErrorCode ierr;
2951: const PetscInt *r,*c,*rout,*cout;
2952: const PetscInt *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2953: PetscInt i,nz,idx,idt,idc;
2954: const MatScalar *aa=a->a,*v;
2955: PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2956: const PetscScalar *b;
2959: VecGetArrayRead(bb,&b);
2960: VecGetArray(xx,&x);
2961: t = a->solve_work;
2963: ISGetIndices(isrow,&rout); r = rout;
2964: ISGetIndices(iscol,&cout); c = cout + (n-1);
2966: /* forward solve the lower triangular */
2967: idx = 6*(*r++);
2968: t[0] = b[idx]; t[1] = b[1+idx];
2969: t[2] = b[2+idx]; t[3] = b[3+idx];
2970: t[4] = b[4+idx]; t[5] = b[5+idx];
2971: for (i=1; i<n; i++) {
2972: v = aa + 36*ai[i];
2973: vi = aj + ai[i];
2974: nz = diag[i] - ai[i];
2975: idx = 6*(*r++);
2976: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2977: s5 = b[4+idx]; s6 = b[5+idx];
2978: while (nz--) {
2979: idx = 6*(*vi++);
2980: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2981: x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
2982: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2983: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2984: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2985: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2986: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2987: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2988: v += 36;
2989: }
2990: idx = 6*i;
2991: t[idx] = s1;t[1+idx] = s2;
2992: t[2+idx] = s3;t[3+idx] = s4;
2993: t[4+idx] = s5;t[5+idx] = s6;
2994: }
2995: /* backward solve the upper triangular */
2996: for (i=n-1; i>=0; i--) {
2997: v = aa + 36*diag[i] + 36;
2998: vi = aj + diag[i] + 1;
2999: nz = ai[i+1] - diag[i] - 1;
3000: idt = 6*i;
3001: s1 = t[idt]; s2 = t[1+idt];
3002: s3 = t[2+idt];s4 = t[3+idt];
3003: s5 = t[4+idt];s6 = t[5+idt];
3004: while (nz--) {
3005: idx = 6*(*vi++);
3006: x1 = t[idx]; x2 = t[1+idx];
3007: x3 = t[2+idx]; x4 = t[3+idx];
3008: x5 = t[4+idx]; x6 = t[5+idx];
3009: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3010: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3011: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3012: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3013: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3014: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3015: v += 36;
3016: }
3017: idc = 6*(*c--);
3018: v = aa + 36*diag[i];
3019: x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+
3020: v[18]*s4+v[24]*s5+v[30]*s6;
3021: x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
3022: v[19]*s4+v[25]*s5+v[31]*s6;
3023: x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
3024: v[20]*s4+v[26]*s5+v[32]*s6;
3025: x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
3026: v[21]*s4+v[27]*s5+v[33]*s6;
3027: x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
3028: v[22]*s4+v[28]*s5+v[34]*s6;
3029: x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
3030: v[23]*s4+v[29]*s5+v[35]*s6;
3031: }
3033: ISRestoreIndices(isrow,&rout);
3034: ISRestoreIndices(iscol,&cout);
3035: VecRestoreArrayRead(bb,&b);
3036: VecRestoreArray(xx,&x);
3037: PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
3038: return(0);
3039: }
3043: PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
3044: {
3045: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
3046: IS iscol=a->col,isrow=a->row;
3047: PetscErrorCode ierr;
3048: const PetscInt *r,*c,*rout,*cout;
3049: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3050: PetscInt i,nz,idx,idt,idc,m;
3051: const MatScalar *aa=a->a,*v;
3052: PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
3053: const PetscScalar *b;
3056: VecGetArrayRead(bb,&b);
3057: VecGetArray(xx,&x);
3058: t = a->solve_work;
3060: ISGetIndices(isrow,&rout); r = rout;
3061: ISGetIndices(iscol,&cout); c = cout;
3063: /* forward solve the lower triangular */
3064: idx = 6*r[0];
3065: t[0] = b[idx]; t[1] = b[1+idx];
3066: t[2] = b[2+idx]; t[3] = b[3+idx];
3067: t[4] = b[4+idx]; t[5] = b[5+idx];
3068: for (i=1; i<n; i++) {
3069: v = aa + 36*ai[i];
3070: vi = aj + ai[i];
3071: nz = ai[i+1] - ai[i];
3072: idx = 6*r[i];
3073: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3074: s5 = b[4+idx]; s6 = b[5+idx];
3075: for (m=0; m<nz; m++) {
3076: idx = 6*vi[m];
3077: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3078: x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
3079: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3080: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3081: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3082: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3083: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3084: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3085: v += 36;
3086: }
3087: idx = 6*i;
3088: t[idx] = s1;t[1+idx] = s2;
3089: t[2+idx] = s3;t[3+idx] = s4;
3090: t[4+idx] = s5;t[5+idx] = s6;
3091: }
3092: /* backward solve the upper triangular */
3093: for (i=n-1; i>=0; i--) {
3094: v = aa + 36*(adiag[i+1]+1);
3095: vi = aj + adiag[i+1]+1;
3096: nz = adiag[i] - adiag[i+1] - 1;
3097: idt = 6*i;
3098: s1 = t[idt]; s2 = t[1+idt];
3099: s3 = t[2+idt];s4 = t[3+idt];
3100: s5 = t[4+idt];s6 = t[5+idt];
3101: for (m=0; m<nz; m++) {
3102: idx = 6*vi[m];
3103: x1 = t[idx]; x2 = t[1+idx];
3104: x3 = t[2+idx]; x4 = t[3+idx];
3105: x5 = t[4+idx]; x6 = t[5+idx];
3106: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3107: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3108: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3109: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3110: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3111: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3112: v += 36;
3113: }
3114: idc = 6*c[i];
3115: x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+
3116: v[18]*s4+v[24]*s5+v[30]*s6;
3117: x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
3118: v[19]*s4+v[25]*s5+v[31]*s6;
3119: x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
3120: v[20]*s4+v[26]*s5+v[32]*s6;
3121: x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
3122: v[21]*s4+v[27]*s5+v[33]*s6;
3123: x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
3124: v[22]*s4+v[28]*s5+v[34]*s6;
3125: x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
3126: v[23]*s4+v[29]*s5+v[35]*s6;
3127: }
3129: ISRestoreIndices(isrow,&rout);
3130: ISRestoreIndices(iscol,&cout);
3131: VecRestoreArrayRead(bb,&b);
3132: VecRestoreArray(xx,&x);
3133: PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
3134: return(0);
3135: }
3139: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3140: {
3141: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
3142: PetscInt i,nz,idx,idt,jdx;
3143: PetscErrorCode ierr;
3144: const PetscInt *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
3145: const MatScalar *aa =a->a,*v;
3146: PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3147: const PetscScalar *b;
3150: VecGetArrayRead(bb,&b);
3151: VecGetArray(xx,&x);
3152: /* forward solve the lower triangular */
3153: idx = 0;
3154: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx];
3155: x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
3156: for (i=1; i<n; i++) {
3157: v = aa + 36*ai[i];
3158: vi = aj + ai[i];
3159: nz = diag[i] - ai[i];
3160: idx = 6*i;
3161: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
3162: s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
3163: while (nz--) {
3164: jdx = 6*(*vi++);
3165: x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx];
3166: x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
3167: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3168: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3169: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3170: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3171: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3172: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3173: v += 36;
3174: }
3175: x[idx] = s1;
3176: x[1+idx] = s2;
3177: x[2+idx] = s3;
3178: x[3+idx] = s4;
3179: x[4+idx] = s5;
3180: x[5+idx] = s6;
3181: }
3182: /* backward solve the upper triangular */
3183: for (i=n-1; i>=0; i--) {
3184: v = aa + 36*diag[i] + 36;
3185: vi = aj + diag[i] + 1;
3186: nz = ai[i+1] - diag[i] - 1;
3187: idt = 6*i;
3188: s1 = x[idt]; s2 = x[1+idt];
3189: s3 = x[2+idt]; s4 = x[3+idt];
3190: s5 = x[4+idt]; s6 = x[5+idt];
3191: while (nz--) {
3192: idx = 6*(*vi++);
3193: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
3194: x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
3195: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3196: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3197: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3198: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3199: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3200: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3201: v += 36;
3202: }
3203: v = aa + 36*diag[i];
3204: x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3205: x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3206: x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3207: x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3208: x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3209: x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3210: }
3212: VecRestoreArrayRead(bb,&b);
3213: VecRestoreArray(xx,&x);
3214: PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
3215: return(0);
3216: }
3220: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
3221: {
3222: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
3223: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3224: PetscErrorCode ierr;
3225: PetscInt i,k,nz,idx,jdx,idt;
3226: const PetscInt bs = A->rmap->bs,bs2 = a->bs2;
3227: const MatScalar *aa=a->a,*v;
3228: PetscScalar *x;
3229: const PetscScalar *b;
3230: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3233: VecGetArrayRead(bb,&b);
3234: VecGetArray(xx,&x);
3235: /* forward solve the lower triangular */
3236: idx = 0;
3237: x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3238: x[4] = b[4+idx];x[5] = b[5+idx];
3239: for (i=1; i<n; i++) {
3240: v = aa + bs2*ai[i];
3241: vi = aj + ai[i];
3242: nz = ai[i+1] - ai[i];
3243: idx = bs*i;
3244: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3245: s5 = b[4+idx];s6 = b[5+idx];
3246: for (k=0; k<nz; k++) {
3247: jdx = bs*vi[k];
3248: x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3249: x5 = x[4+jdx]; x6 = x[5+jdx];
3250: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3251: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;;
3252: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3253: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3254: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3255: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3256: v += bs2;
3257: }
3259: x[idx] = s1;
3260: x[1+idx] = s2;
3261: x[2+idx] = s3;
3262: x[3+idx] = s4;
3263: x[4+idx] = s5;
3264: x[5+idx] = s6;
3265: }
3267: /* backward solve the upper triangular */
3268: for (i=n-1; i>=0; i--) {
3269: v = aa + bs2*(adiag[i+1]+1);
3270: vi = aj + adiag[i+1]+1;
3271: nz = adiag[i] - adiag[i+1]-1;
3272: idt = bs*i;
3273: s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3274: s5 = x[4+idt];s6 = x[5+idt];
3275: for (k=0; k<nz; k++) {
3276: idx = bs*vi[k];
3277: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3278: x5 = x[4+idx];x6 = x[5+idx];
3279: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3280: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;;
3281: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3282: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3283: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3284: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3285: v += bs2;
3286: }
3287: /* x = inv_diagonal*x */
3288: x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3289: x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3290: x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3291: x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3292: x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3293: x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3294: }
3296: VecRestoreArrayRead(bb,&b);
3297: VecRestoreArray(xx,&x);
3298: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
3299: return(0);
3300: }
3304: PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
3305: {
3306: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
3307: IS iscol=a->col,isrow=a->row;
3308: PetscErrorCode ierr;
3309: const PetscInt *r,*c,*rout,*cout,*diag = a->diag;
3310: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3311: PetscInt i,nz,idx,idt,idc;
3312: const MatScalar *aa=a->a,*v;
3313: PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3314: const PetscScalar *b;
3317: VecGetArrayRead(bb,&b);
3318: VecGetArray(xx,&x);
3319: t = a->solve_work;
3321: ISGetIndices(isrow,&rout); r = rout;
3322: ISGetIndices(iscol,&cout); c = cout + (n-1);
3324: /* forward solve the lower triangular */
3325: idx = 5*(*r++);
3326: t[0] = b[idx]; t[1] = b[1+idx];
3327: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3328: for (i=1; i<n; i++) {
3329: v = aa + 25*ai[i];
3330: vi = aj + ai[i];
3331: nz = diag[i] - ai[i];
3332: idx = 5*(*r++);
3333: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3334: s5 = b[4+idx];
3335: while (nz--) {
3336: idx = 5*(*vi++);
3337: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
3338: x4 = t[3+idx];x5 = t[4+idx];
3339: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3340: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3341: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3342: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3343: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3344: v += 25;
3345: }
3346: idx = 5*i;
3347: t[idx] = s1;t[1+idx] = s2;
3348: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3349: }
3350: /* backward solve the upper triangular */
3351: for (i=n-1; i>=0; i--) {
3352: v = aa + 25*diag[i] + 25;
3353: vi = aj + diag[i] + 1;
3354: nz = ai[i+1] - diag[i] - 1;
3355: idt = 5*i;
3356: s1 = t[idt]; s2 = t[1+idt];
3357: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3358: while (nz--) {
3359: idx = 5*(*vi++);
3360: x1 = t[idx]; x2 = t[1+idx];
3361: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3362: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3363: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3364: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3365: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3366: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3367: v += 25;
3368: }
3369: idc = 5*(*c--);
3370: v = aa + 25*diag[i];
3371: x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+
3372: v[15]*s4+v[20]*s5;
3373: x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3374: v[16]*s4+v[21]*s5;
3375: x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3376: v[17]*s4+v[22]*s5;
3377: x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3378: v[18]*s4+v[23]*s5;
3379: x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3380: v[19]*s4+v[24]*s5;
3381: }
3383: ISRestoreIndices(isrow,&rout);
3384: ISRestoreIndices(iscol,&cout);
3385: VecRestoreArrayRead(bb,&b);
3386: VecRestoreArray(xx,&x);
3387: PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
3388: return(0);
3389: }
3393: PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
3394: {
3395: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
3396: IS iscol=a->col,isrow=a->row;
3397: PetscErrorCode ierr;
3398: const PetscInt *r,*c,*rout,*cout;
3399: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3400: PetscInt i,nz,idx,idt,idc,m;
3401: const MatScalar *aa=a->a,*v;
3402: PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3403: const PetscScalar *b;
3406: VecGetArrayRead(bb,&b);
3407: VecGetArray(xx,&x);
3408: t = a->solve_work;
3410: ISGetIndices(isrow,&rout); r = rout;
3411: ISGetIndices(iscol,&cout); c = cout;
3413: /* forward solve the lower triangular */
3414: idx = 5*r[0];
3415: t[0] = b[idx]; t[1] = b[1+idx];
3416: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3417: for (i=1; i<n; i++) {
3418: v = aa + 25*ai[i];
3419: vi = aj + ai[i];
3420: nz = ai[i+1] - ai[i];
3421: idx = 5*r[i];
3422: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3423: s5 = b[4+idx];
3424: for (m=0; m<nz; m++) {
3425: idx = 5*vi[m];
3426: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
3427: x4 = t[3+idx];x5 = t[4+idx];
3428: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3429: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3430: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3431: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3432: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3433: v += 25;
3434: }
3435: idx = 5*i;
3436: t[idx] = s1;t[1+idx] = s2;
3437: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3438: }
3439: /* backward solve the upper triangular */
3440: for (i=n-1; i>=0; i--) {
3441: v = aa + 25*(adiag[i+1]+1);
3442: vi = aj + adiag[i+1]+1;
3443: nz = adiag[i] - adiag[i+1] - 1;
3444: idt = 5*i;
3445: s1 = t[idt]; s2 = t[1+idt];
3446: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3447: for (m=0; m<nz; m++) {
3448: idx = 5*vi[m];
3449: x1 = t[idx]; x2 = t[1+idx];
3450: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3451: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3452: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3453: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3454: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3455: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3456: v += 25;
3457: }
3458: idc = 5*c[i];
3459: x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+
3460: v[15]*s4+v[20]*s5;
3461: x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3462: v[16]*s4+v[21]*s5;
3463: x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3464: v[17]*s4+v[22]*s5;
3465: x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3466: v[18]*s4+v[23]*s5;
3467: x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3468: v[19]*s4+v[24]*s5;
3469: }
3471: ISRestoreIndices(isrow,&rout);
3472: ISRestoreIndices(iscol,&cout);
3473: VecRestoreArrayRead(bb,&b);
3474: VecRestoreArray(xx,&x);
3475: PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
3476: return(0);
3477: }
3481: PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3482: {
3483: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
3484: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3485: PetscInt i,nz,idx,idt,jdx;
3486: PetscErrorCode ierr;
3487: const MatScalar *aa=a->a,*v;
3488: PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3489: const PetscScalar *b;
3492: VecGetArrayRead(bb,&b);
3493: VecGetArray(xx,&x);
3494: /* forward solve the lower triangular */
3495: idx = 0;
3496: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3497: for (i=1; i<n; i++) {
3498: v = aa + 25*ai[i];
3499: vi = aj + ai[i];
3500: nz = diag[i] - ai[i];
3501: idx = 5*i;
3502: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3503: while (nz--) {
3504: jdx = 5*(*vi++);
3505: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3506: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3507: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3508: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3509: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3510: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3511: v += 25;
3512: }
3513: x[idx] = s1;
3514: x[1+idx] = s2;
3515: x[2+idx] = s3;
3516: x[3+idx] = s4;
3517: x[4+idx] = s5;
3518: }
3519: /* backward solve the upper triangular */
3520: for (i=n-1; i>=0; i--) {
3521: v = aa + 25*diag[i] + 25;
3522: vi = aj + diag[i] + 1;
3523: nz = ai[i+1] - diag[i] - 1;
3524: idt = 5*i;
3525: s1 = x[idt]; s2 = x[1+idt];
3526: s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3527: while (nz--) {
3528: idx = 5*(*vi++);
3529: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3530: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3531: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3532: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3533: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3534: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3535: v += 25;
3536: }
3537: v = aa + 25*diag[i];
3538: x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5;
3539: x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5;
3540: x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5;
3541: x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5;
3542: x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5;
3543: }
3545: VecRestoreArrayRead(bb,&b);
3546: VecRestoreArray(xx,&x);
3547: PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
3548: return(0);
3549: }
3553: PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
3554: {
3555: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
3556: const PetscInt n = a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3557: PetscInt i,k,nz,idx,idt,jdx;
3558: PetscErrorCode ierr;
3559: const MatScalar *aa=a->a,*v;
3560: PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3561: const PetscScalar *b;
3564: VecGetArrayRead(bb,&b);
3565: VecGetArray(xx,&x);
3566: /* forward solve the lower triangular */
3567: idx = 0;
3568: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3569: for (i=1; i<n; i++) {
3570: v = aa + 25*ai[i];
3571: vi = aj + ai[i];
3572: nz = ai[i+1] - ai[i];
3573: idx = 5*i;
3574: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3575: for (k=0; k<nz; k++) {
3576: jdx = 5*vi[k];
3577: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3578: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3579: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3580: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3581: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3582: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3583: v += 25;
3584: }
3585: x[idx] = s1;
3586: x[1+idx] = s2;
3587: x[2+idx] = s3;
3588: x[3+idx] = s4;
3589: x[4+idx] = s5;
3590: }
3592: /* backward solve the upper triangular */
3593: for (i=n-1; i>=0; i--) {
3594: v = aa + 25*(adiag[i+1]+1);
3595: vi = aj + adiag[i+1]+1;
3596: nz = adiag[i] - adiag[i+1]-1;
3597: idt = 5*i;
3598: s1 = x[idt]; s2 = x[1+idt];
3599: s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3600: for (k=0; k<nz; k++) {
3601: idx = 5*vi[k];
3602: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3603: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3604: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3605: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3606: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3607: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3608: v += 25;
3609: }
3610: /* x = inv_diagonal*x */
3611: x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5;
3612: x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5;
3613: x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5;
3614: x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5;
3615: x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5;
3616: }
3618: VecRestoreArrayRead(bb,&b);
3619: VecRestoreArray(xx,&x);
3620: PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
3621: return(0);
3622: }
3626: PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
3627: {
3628: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
3629: IS iscol=a->col,isrow=a->row;
3630: PetscErrorCode ierr;
3631: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3632: PetscInt i,nz,idx,idt,idc;
3633: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3634: const MatScalar *aa=a->a,*v;
3635: PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3636: const PetscScalar *b;
3639: VecGetArrayRead(bb,&b);
3640: VecGetArray(xx,&x);
3641: t = a->solve_work;
3643: ISGetIndices(isrow,&rout); r = rout;
3644: ISGetIndices(iscol,&cout); c = cout + (n-1);
3646: /* forward solve the lower triangular */
3647: idx = 4*(*r++);
3648: t[0] = b[idx]; t[1] = b[1+idx];
3649: t[2] = b[2+idx]; t[3] = b[3+idx];
3650: for (i=1; i<n; i++) {
3651: v = aa + 16*ai[i];
3652: vi = aj + ai[i];
3653: nz = diag[i] - ai[i];
3654: idx = 4*(*r++);
3655: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3656: while (nz--) {
3657: idx = 4*(*vi++);
3658: x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3659: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3660: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3661: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3662: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3663: v += 16;
3664: }
3665: idx = 4*i;
3666: t[idx] = s1;t[1+idx] = s2;
3667: t[2+idx] = s3;t[3+idx] = s4;
3668: }
3669: /* backward solve the upper triangular */
3670: for (i=n-1; i>=0; i--) {
3671: v = aa + 16*diag[i] + 16;
3672: vi = aj + diag[i] + 1;
3673: nz = ai[i+1] - diag[i] - 1;
3674: idt = 4*i;
3675: s1 = t[idt]; s2 = t[1+idt];
3676: s3 = t[2+idt];s4 = t[3+idt];
3677: while (nz--) {
3678: idx = 4*(*vi++);
3679: x1 = t[idx]; x2 = t[1+idx];
3680: x3 = t[2+idx]; x4 = t[3+idx];
3681: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3682: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3683: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3684: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3685: v += 16;
3686: }
3687: idc = 4*(*c--);
3688: v = aa + 16*diag[i];
3689: x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3690: x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3691: x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3692: x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3693: }
3695: ISRestoreIndices(isrow,&rout);
3696: ISRestoreIndices(iscol,&cout);
3697: VecRestoreArrayRead(bb,&b);
3698: VecRestoreArray(xx,&x);
3699: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
3700: return(0);
3701: }
3705: PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
3706: {
3707: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
3708: IS iscol=a->col,isrow=a->row;
3709: PetscErrorCode ierr;
3710: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3711: PetscInt i,nz,idx,idt,idc,m;
3712: const PetscInt *r,*c,*rout,*cout;
3713: const MatScalar *aa=a->a,*v;
3714: PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3715: const PetscScalar *b;
3718: VecGetArrayRead(bb,&b);
3719: VecGetArray(xx,&x);
3720: t = a->solve_work;
3722: ISGetIndices(isrow,&rout); r = rout;
3723: ISGetIndices(iscol,&cout); c = cout;
3725: /* forward solve the lower triangular */
3726: idx = 4*r[0];
3727: t[0] = b[idx]; t[1] = b[1+idx];
3728: t[2] = b[2+idx]; t[3] = b[3+idx];
3729: for (i=1; i<n; i++) {
3730: v = aa + 16*ai[i];
3731: vi = aj + ai[i];
3732: nz = ai[i+1] - ai[i];
3733: idx = 4*r[i];
3734: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3735: for (m=0; m<nz; m++) {
3736: idx = 4*vi[m];
3737: x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3738: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3739: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3740: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3741: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3742: v += 16;
3743: }
3744: idx = 4*i;
3745: t[idx] = s1;t[1+idx] = s2;
3746: t[2+idx] = s3;t[3+idx] = s4;
3747: }
3748: /* backward solve the upper triangular */
3749: for (i=n-1; i>=0; i--) {
3750: v = aa + 16*(adiag[i+1]+1);
3751: vi = aj + adiag[i+1]+1;
3752: nz = adiag[i] - adiag[i+1] - 1;
3753: idt = 4*i;
3754: s1 = t[idt]; s2 = t[1+idt];
3755: s3 = t[2+idt];s4 = t[3+idt];
3756: for (m=0; m<nz; m++) {
3757: idx = 4*vi[m];
3758: x1 = t[idx]; x2 = t[1+idx];
3759: x3 = t[2+idx]; x4 = t[3+idx];
3760: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3761: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3762: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3763: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3764: v += 16;
3765: }
3766: idc = 4*c[i];
3767: x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3768: x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3769: x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3770: x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3771: }
3773: ISRestoreIndices(isrow,&rout);
3774: ISRestoreIndices(iscol,&cout);
3775: VecRestoreArrayRead(bb,&b);
3776: VecRestoreArray(xx,&x);
3777: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
3778: return(0);
3779: }
3783: PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
3784: {
3785: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
3786: IS iscol=a->col,isrow=a->row;
3787: PetscErrorCode ierr;
3788: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3789: PetscInt i,nz,idx,idt,idc;
3790: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3791: const MatScalar *aa=a->a,*v;
3792: MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t;
3793: PetscScalar *x;
3794: const PetscScalar *b;
3797: VecGetArrayRead(bb,&b);
3798: VecGetArray(xx,&x);
3799: t = (MatScalar*)a->solve_work;
3801: ISGetIndices(isrow,&rout); r = rout;
3802: ISGetIndices(iscol,&cout); c = cout + (n-1);
3804: /* forward solve the lower triangular */
3805: idx = 4*(*r++);
3806: t[0] = (MatScalar)b[idx];
3807: t[1] = (MatScalar)b[1+idx];
3808: t[2] = (MatScalar)b[2+idx];
3809: t[3] = (MatScalar)b[3+idx];
3810: for (i=1; i<n; i++) {
3811: v = aa + 16*ai[i];
3812: vi = aj + ai[i];
3813: nz = diag[i] - ai[i];
3814: idx = 4*(*r++);
3815: s1 = (MatScalar)b[idx];
3816: s2 = (MatScalar)b[1+idx];
3817: s3 = (MatScalar)b[2+idx];
3818: s4 = (MatScalar)b[3+idx];
3819: while (nz--) {
3820: idx = 4*(*vi++);
3821: x1 = t[idx];
3822: x2 = t[1+idx];
3823: x3 = t[2+idx];
3824: x4 = t[3+idx];
3825: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3826: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3827: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3828: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3829: v += 16;
3830: }
3831: idx = 4*i;
3832: t[idx] = s1;
3833: t[1+idx] = s2;
3834: t[2+idx] = s3;
3835: t[3+idx] = s4;
3836: }
3837: /* backward solve the upper triangular */
3838: for (i=n-1; i>=0; i--) {
3839: v = aa + 16*diag[i] + 16;
3840: vi = aj + diag[i] + 1;
3841: nz = ai[i+1] - diag[i] - 1;
3842: idt = 4*i;
3843: s1 = t[idt];
3844: s2 = t[1+idt];
3845: s3 = t[2+idt];
3846: s4 = t[3+idt];
3847: while (nz--) {
3848: idx = 4*(*vi++);
3849: x1 = t[idx];
3850: x2 = t[1+idx];
3851: x3 = t[2+idx];
3852: x4 = t[3+idx];
3853: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3854: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3855: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3856: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3857: v += 16;
3858: }
3859: idc = 4*(*c--);
3860: v = aa + 16*diag[i];
3861: t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3862: t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3863: t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3864: t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3865: x[idc] = (PetscScalar)t[idt];
3866: x[1+idc] = (PetscScalar)t[1+idt];
3867: x[2+idc] = (PetscScalar)t[2+idt];
3868: x[3+idc] = (PetscScalar)t[3+idt];
3869: }
3871: ISRestoreIndices(isrow,&rout);
3872: ISRestoreIndices(iscol,&cout);
3873: VecRestoreArrayRead(bb,&b);
3874: VecRestoreArray(xx,&x);
3875: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
3876: return(0);
3877: }
3879: #if defined(PETSC_HAVE_SSE)
3881: #include PETSC_HAVE_SSE
3885: PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
3886: {
3887: /*
3888: Note: This code uses demotion of double
3889: to float when performing the mixed-mode computation.
3890: This may not be numerically reasonable for all applications.
3891: */
3892: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
3893: IS iscol=a->col,isrow=a->row;
3895: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
3896: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3897: MatScalar *aa=a->a,*v;
3898: PetscScalar *x,*b,*t;
3900: /* Make space in temp stack for 16 Byte Aligned arrays */
3901: float ssealignedspace[11],*tmps,*tmpx;
3902: unsigned long offset;
3905: SSE_SCOPE_BEGIN;
3907: offset = (unsigned long)ssealignedspace % 16;
3908: if (offset) offset = (16 - offset)/4;
3909: tmps = &ssealignedspace[offset];
3910: tmpx = &ssealignedspace[offset+4];
3911: PREFETCH_NTA(aa+16*ai[1]);
3913: VecGetArray(bb,&b);
3914: VecGetArray(xx,&x);
3915: t = a->solve_work;
3917: ISGetIndices(isrow,&rout); r = rout;
3918: ISGetIndices(iscol,&cout); c = cout + (n-1);
3920: /* forward solve the lower triangular */
3921: idx = 4*(*r++);
3922: t[0] = b[idx]; t[1] = b[1+idx];
3923: t[2] = b[2+idx]; t[3] = b[3+idx];
3924: v = aa + 16*ai[1];
3926: for (i=1; i<n; ) {
3927: PREFETCH_NTA(&v[8]);
3928: vi = aj + ai[i];
3929: nz = diag[i] - ai[i];
3930: idx = 4*(*r++);
3932: /* Demote sum from double to float */
3933: CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
3934: LOAD_PS(tmps,XMM7);
3936: while (nz--) {
3937: PREFETCH_NTA(&v[16]);
3938: idx = 4*(*vi++);
3940: /* Demote solution (so far) from double to float */
3941: CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
3943: /* 4x4 Matrix-Vector product with negative accumulation: */
3944: SSE_INLINE_BEGIN_2(tmpx,v)
3945: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3947: /* First Column */
3948: SSE_COPY_PS(XMM0,XMM6)
3949: SSE_SHUFFLE(XMM0,XMM0,0x00)
3950: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3951: SSE_SUB_PS(XMM7,XMM0)
3953: /* Second Column */
3954: SSE_COPY_PS(XMM1,XMM6)
3955: SSE_SHUFFLE(XMM1,XMM1,0x55)
3956: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3957: SSE_SUB_PS(XMM7,XMM1)
3959: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3961: /* Third Column */
3962: SSE_COPY_PS(XMM2,XMM6)
3963: SSE_SHUFFLE(XMM2,XMM2,0xAA)
3964: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3965: SSE_SUB_PS(XMM7,XMM2)
3967: /* Fourth Column */
3968: SSE_COPY_PS(XMM3,XMM6)
3969: SSE_SHUFFLE(XMM3,XMM3,0xFF)
3970: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3971: SSE_SUB_PS(XMM7,XMM3)
3972: SSE_INLINE_END_2
3974: v += 16;
3975: }
3976: idx = 4*i;
3977: v = aa + 16*ai[++i];
3978: PREFETCH_NTA(v);
3979: STORE_PS(tmps,XMM7);
3981: /* Promote result from float to double */
3982: CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
3983: }
3984: /* backward solve the upper triangular */
3985: idt = 4*(n-1);
3986: ai16 = 16*diag[n-1];
3987: v = aa + ai16 + 16;
3988: for (i=n-1; i>=0; ) {
3989: PREFETCH_NTA(&v[8]);
3990: vi = aj + diag[i] + 1;
3991: nz = ai[i+1] - diag[i] - 1;
3993: /* Demote accumulator from double to float */
3994: CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
3995: LOAD_PS(tmps,XMM7);
3997: while (nz--) {
3998: PREFETCH_NTA(&v[16]);
3999: idx = 4*(*vi++);
4001: /* Demote solution (so far) from double to float */
4002: CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
4004: /* 4x4 Matrix-Vector Product with negative accumulation: */
4005: SSE_INLINE_BEGIN_2(tmpx,v)
4006: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4008: /* First Column */
4009: SSE_COPY_PS(XMM0,XMM6)
4010: SSE_SHUFFLE(XMM0,XMM0,0x00)
4011: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4012: SSE_SUB_PS(XMM7,XMM0)
4014: /* Second Column */
4015: SSE_COPY_PS(XMM1,XMM6)
4016: SSE_SHUFFLE(XMM1,XMM1,0x55)
4017: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4018: SSE_SUB_PS(XMM7,XMM1)
4020: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4022: /* Third Column */
4023: SSE_COPY_PS(XMM2,XMM6)
4024: SSE_SHUFFLE(XMM2,XMM2,0xAA)
4025: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4026: SSE_SUB_PS(XMM7,XMM2)
4028: /* Fourth Column */
4029: SSE_COPY_PS(XMM3,XMM6)
4030: SSE_SHUFFLE(XMM3,XMM3,0xFF)
4031: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4032: SSE_SUB_PS(XMM7,XMM3)
4033: SSE_INLINE_END_2
4034: v += 16;
4035: }
4036: v = aa + ai16;
4037: ai16 = 16*diag[--i];
4038: PREFETCH_NTA(aa+ai16+16);
4039: /*
4040: Scale the result by the diagonal 4x4 block,
4041: which was inverted as part of the factorization
4042: */
4043: SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
4044: /* First Column */
4045: SSE_COPY_PS(XMM0,XMM7)
4046: SSE_SHUFFLE(XMM0,XMM0,0x00)
4047: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4049: /* Second Column */
4050: SSE_COPY_PS(XMM1,XMM7)
4051: SSE_SHUFFLE(XMM1,XMM1,0x55)
4052: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4053: SSE_ADD_PS(XMM0,XMM1)
4055: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4057: /* Third Column */
4058: SSE_COPY_PS(XMM2,XMM7)
4059: SSE_SHUFFLE(XMM2,XMM2,0xAA)
4060: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4061: SSE_ADD_PS(XMM0,XMM2)
4063: /* Fourth Column */
4064: SSE_COPY_PS(XMM3,XMM7)
4065: SSE_SHUFFLE(XMM3,XMM3,0xFF)
4066: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4067: SSE_ADD_PS(XMM0,XMM3)
4069: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4070: SSE_INLINE_END_3
4072: /* Promote solution from float to double */
4073: CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
4075: /* Apply reordering to t and stream into x. */
4076: /* This way, x doesn't pollute the cache. */
4077: /* Be careful with size: 2 doubles = 4 floats! */
4078: idc = 4*(*c--);
4079: SSE_INLINE_BEGIN_2((float*)&t[idt],(float*)&x[idc])
4080: /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */
4081: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
4082: SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
4083: /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
4084: SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
4085: SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
4086: SSE_INLINE_END_2
4087: v = aa + ai16 + 16;
4088: idt -= 4;
4089: }
4091: ISRestoreIndices(isrow,&rout);
4092: ISRestoreIndices(iscol,&cout);
4093: VecRestoreArray(bb,&b);
4094: VecRestoreArray(xx,&x);
4095: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
4096: SSE_SCOPE_END;
4097: return(0);
4098: }
4100: #endif
4103: /*
4104: Special case where the matrix was ILU(0) factored in the natural
4105: ordering. This eliminates the need for the column and row permutation.
4106: */
4109: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4110: {
4111: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
4112: PetscInt n =a->mbs;
4113: const PetscInt *ai=a->i,*aj=a->j;
4114: PetscErrorCode ierr;
4115: const PetscInt *diag = a->diag;
4116: const MatScalar *aa =a->a;
4117: PetscScalar *x;
4118: const PetscScalar *b;
4121: VecGetArrayRead(bb,&b);
4122: VecGetArray(xx,&x);
4124: #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
4125: {
4126: static PetscScalar w[2000]; /* very BAD need to fix */
4127: fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
4128: }
4129: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
4130: {
4131: static PetscScalar w[2000]; /* very BAD need to fix */
4132: fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
4133: }
4134: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
4135: fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
4136: #else
4137: {
4138: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
4139: const MatScalar *v;
4140: PetscInt jdx,idt,idx,nz,i,ai16;
4141: const PetscInt *vi;
4143: /* forward solve the lower triangular */
4144: idx = 0;
4145: x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
4146: for (i=1; i<n; i++) {
4147: v = aa + 16*ai[i];
4148: vi = aj + ai[i];
4149: nz = diag[i] - ai[i];
4150: idx += 4;
4151: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4152: while (nz--) {
4153: jdx = 4*(*vi++);
4154: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
4155: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4156: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4157: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4158: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4159: v += 16;
4160: }
4161: x[idx] = s1;
4162: x[1+idx] = s2;
4163: x[2+idx] = s3;
4164: x[3+idx] = s4;
4165: }
4166: /* backward solve the upper triangular */
4167: idt = 4*(n-1);
4168: for (i=n-1; i>=0; i--) {
4169: ai16 = 16*diag[i];
4170: v = aa + ai16 + 16;
4171: vi = aj + diag[i] + 1;
4172: nz = ai[i+1] - diag[i] - 1;
4173: s1 = x[idt]; s2 = x[1+idt];
4174: s3 = x[2+idt];s4 = x[3+idt];
4175: while (nz--) {
4176: idx = 4*(*vi++);
4177: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx];
4178: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4179: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4180: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4181: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4182: v += 16;
4183: }
4184: v = aa + ai16;
4185: x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
4186: x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;
4187: x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4188: x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4189: idt -= 4;
4190: }
4191: }
4192: #endif
4194: VecRestoreArrayRead(bb,&b);
4195: VecRestoreArray(xx,&x);
4196: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
4197: return(0);
4198: }
4202: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
4203: {
4204: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
4205: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4206: PetscInt i,k,nz,idx,jdx,idt;
4207: PetscErrorCode ierr;
4208: const PetscInt bs = A->rmap->bs,bs2 = a->bs2;
4209: const MatScalar *aa=a->a,*v;
4210: PetscScalar *x;
4211: const PetscScalar *b;
4212: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
4215: VecGetArrayRead(bb,&b);
4216: VecGetArray(xx,&x);
4217: /* forward solve the lower triangular */
4218: idx = 0;
4219: x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
4220: for (i=1; i<n; i++) {
4221: v = aa + bs2*ai[i];
4222: vi = aj + ai[i];
4223: nz = ai[i+1] - ai[i];
4224: idx = bs*i;
4225: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4226: for (k=0; k<nz; k++) {
4227: jdx = bs*vi[k];
4228: x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
4229: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4230: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4231: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4232: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4234: v += bs2;
4235: }
4237: x[idx] = s1;
4238: x[1+idx] = s2;
4239: x[2+idx] = s3;
4240: x[3+idx] = s4;
4241: }
4243: /* backward solve the upper triangular */
4244: for (i=n-1; i>=0; i--) {
4245: v = aa + bs2*(adiag[i+1]+1);
4246: vi = aj + adiag[i+1]+1;
4247: nz = adiag[i] - adiag[i+1]-1;
4248: idt = bs*i;
4249: s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
4251: for (k=0; k<nz; k++) {
4252: idx = bs*vi[k];
4253: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
4254: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4255: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4256: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4257: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4259: v += bs2;
4260: }
4261: /* x = inv_diagonal*x */
4262: x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
4263: x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
4264: x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4265: x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4267: }
4269: VecRestoreArrayRead(bb,&b);
4270: VecRestoreArray(xx,&x);
4271: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
4272: return(0);
4273: }
4277: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
4278: {
4279: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
4280: const PetscInt n =a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;
4281: PetscErrorCode ierr;
4282: const MatScalar *aa=a->a;
4283: const PetscScalar *b;
4284: PetscScalar *x;
4287: VecGetArrayRead(bb,&b);
4288: VecGetArray(xx,&x);
4290: {
4291: MatScalar s1,s2,s3,s4,x1,x2,x3,x4;
4292: const MatScalar *v;
4293: MatScalar *t=(MatScalar*)x;
4294: PetscInt jdx,idt,idx,nz,i,ai16;
4295: const PetscInt *vi;
4297: /* forward solve the lower triangular */
4298: idx = 0;
4299: t[0] = (MatScalar)b[0];
4300: t[1] = (MatScalar)b[1];
4301: t[2] = (MatScalar)b[2];
4302: t[3] = (MatScalar)b[3];
4303: for (i=1; i<n; i++) {
4304: v = aa + 16*ai[i];
4305: vi = aj + ai[i];
4306: nz = diag[i] - ai[i];
4307: idx += 4;
4308: s1 = (MatScalar)b[idx];
4309: s2 = (MatScalar)b[1+idx];
4310: s3 = (MatScalar)b[2+idx];
4311: s4 = (MatScalar)b[3+idx];
4312: while (nz--) {
4313: jdx = 4*(*vi++);
4314: x1 = t[jdx];
4315: x2 = t[1+jdx];
4316: x3 = t[2+jdx];
4317: x4 = t[3+jdx];
4318: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4319: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4320: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4321: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4322: v += 16;
4323: }
4324: t[idx] = s1;
4325: t[1+idx] = s2;
4326: t[2+idx] = s3;
4327: t[3+idx] = s4;
4328: }
4329: /* backward solve the upper triangular */
4330: idt = 4*(n-1);
4331: for (i=n-1; i>=0; i--) {
4332: ai16 = 16*diag[i];
4333: v = aa + ai16 + 16;
4334: vi = aj + diag[i] + 1;
4335: nz = ai[i+1] - diag[i] - 1;
4336: s1 = t[idt];
4337: s2 = t[1+idt];
4338: s3 = t[2+idt];
4339: s4 = t[3+idt];
4340: while (nz--) {
4341: idx = 4*(*vi++);
4342: x1 = (MatScalar)x[idx];
4343: x2 = (MatScalar)x[1+idx];
4344: x3 = (MatScalar)x[2+idx];
4345: x4 = (MatScalar)x[3+idx];
4346: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4347: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4348: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4349: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4350: v += 16;
4351: }
4352: v = aa + ai16;
4353: x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4);
4354: x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4);
4355: x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
4356: x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
4357: idt -= 4;
4358: }
4359: }
4361: VecRestoreArrayRead(bb,&b);
4362: VecRestoreArray(xx,&x);
4363: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
4364: return(0);
4365: }
4367: #if defined(PETSC_HAVE_SSE)
4369: #include PETSC_HAVE_SSE
4372: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
4373: {
4374: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
4375: unsigned short *aj=(unsigned short*)a->j;
4377: int *ai=a->i,n=a->mbs,*diag = a->diag;
4378: MatScalar *aa=a->a;
4379: PetscScalar *x,*b;
4382: SSE_SCOPE_BEGIN;
4383: /*
4384: Note: This code currently uses demotion of double
4385: to float when performing the mixed-mode computation.
4386: This may not be numerically reasonable for all applications.
4387: */
4388: PREFETCH_NTA(aa+16*ai[1]);
4390: VecGetArray(bb,&b);
4391: VecGetArray(xx,&x);
4392: {
4393: /* x will first be computed in single precision then promoted inplace to double */
4394: MatScalar *v,*t=(MatScalar*)x;
4395: int nz,i,idt,ai16;
4396: unsigned int jdx,idx;
4397: unsigned short *vi;
4398: /* Forward solve the lower triangular factor. */
4400: /* First block is the identity. */
4401: idx = 0;
4402: CONVERT_DOUBLE4_FLOAT4(t,b);
4403: v = aa + 16*((unsigned int)ai[1]);
4405: for (i=1; i<n; ) {
4406: PREFETCH_NTA(&v[8]);
4407: vi = aj + ai[i];
4408: nz = diag[i] - ai[i];
4409: idx += 4;
4411: /* Demote RHS from double to float. */
4412: CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4413: LOAD_PS(&t[idx],XMM7);
4415: while (nz--) {
4416: PREFETCH_NTA(&v[16]);
4417: jdx = 4*((unsigned int)(*vi++));
4419: /* 4x4 Matrix-Vector product with negative accumulation: */
4420: SSE_INLINE_BEGIN_2(&t[jdx],v)
4421: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4423: /* First Column */
4424: SSE_COPY_PS(XMM0,XMM6)
4425: SSE_SHUFFLE(XMM0,XMM0,0x00)
4426: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4427: SSE_SUB_PS(XMM7,XMM0)
4429: /* Second Column */
4430: SSE_COPY_PS(XMM1,XMM6)
4431: SSE_SHUFFLE(XMM1,XMM1,0x55)
4432: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4433: SSE_SUB_PS(XMM7,XMM1)
4435: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4437: /* Third Column */
4438: SSE_COPY_PS(XMM2,XMM6)
4439: SSE_SHUFFLE(XMM2,XMM2,0xAA)
4440: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4441: SSE_SUB_PS(XMM7,XMM2)
4443: /* Fourth Column */
4444: SSE_COPY_PS(XMM3,XMM6)
4445: SSE_SHUFFLE(XMM3,XMM3,0xFF)
4446: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4447: SSE_SUB_PS(XMM7,XMM3)
4448: SSE_INLINE_END_2
4450: v += 16;
4451: }
4452: v = aa + 16*ai[++i];
4453: PREFETCH_NTA(v);
4454: STORE_PS(&t[idx],XMM7);
4455: }
4457: /* Backward solve the upper triangular factor.*/
4459: idt = 4*(n-1);
4460: ai16 = 16*diag[n-1];
4461: v = aa + ai16 + 16;
4462: for (i=n-1; i>=0; ) {
4463: PREFETCH_NTA(&v[8]);
4464: vi = aj + diag[i] + 1;
4465: nz = ai[i+1] - diag[i] - 1;
4467: LOAD_PS(&t[idt],XMM7);
4469: while (nz--) {
4470: PREFETCH_NTA(&v[16]);
4471: idx = 4*((unsigned int)(*vi++));
4473: /* 4x4 Matrix-Vector Product with negative accumulation: */
4474: SSE_INLINE_BEGIN_2(&t[idx],v)
4475: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4477: /* First Column */
4478: SSE_COPY_PS(XMM0,XMM6)
4479: SSE_SHUFFLE(XMM0,XMM0,0x00)
4480: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4481: SSE_SUB_PS(XMM7,XMM0)
4483: /* Second Column */
4484: SSE_COPY_PS(XMM1,XMM6)
4485: SSE_SHUFFLE(XMM1,XMM1,0x55)
4486: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4487: SSE_SUB_PS(XMM7,XMM1)
4489: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4491: /* Third Column */
4492: SSE_COPY_PS(XMM2,XMM6)
4493: SSE_SHUFFLE(XMM2,XMM2,0xAA)
4494: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4495: SSE_SUB_PS(XMM7,XMM2)
4497: /* Fourth Column */
4498: SSE_COPY_PS(XMM3,XMM6)
4499: SSE_SHUFFLE(XMM3,XMM3,0xFF)
4500: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4501: SSE_SUB_PS(XMM7,XMM3)
4502: SSE_INLINE_END_2
4503: v += 16;
4504: }
4505: v = aa + ai16;
4506: ai16 = 16*diag[--i];
4507: PREFETCH_NTA(aa+ai16+16);
4508: /*
4509: Scale the result by the diagonal 4x4 block,
4510: which was inverted as part of the factorization
4511: */
4512: SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4513: /* First Column */
4514: SSE_COPY_PS(XMM0,XMM7)
4515: SSE_SHUFFLE(XMM0,XMM0,0x00)
4516: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4518: /* Second Column */
4519: SSE_COPY_PS(XMM1,XMM7)
4520: SSE_SHUFFLE(XMM1,XMM1,0x55)
4521: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4522: SSE_ADD_PS(XMM0,XMM1)
4524: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4526: /* Third Column */
4527: SSE_COPY_PS(XMM2,XMM7)
4528: SSE_SHUFFLE(XMM2,XMM2,0xAA)
4529: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4530: SSE_ADD_PS(XMM0,XMM2)
4532: /* Fourth Column */
4533: SSE_COPY_PS(XMM3,XMM7)
4534: SSE_SHUFFLE(XMM3,XMM3,0xFF)
4535: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4536: SSE_ADD_PS(XMM0,XMM3)
4538: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4539: SSE_INLINE_END_3
4541: v = aa + ai16 + 16;
4542: idt -= 4;
4543: }
4545: /* Convert t from single precision back to double precision (inplace)*/
4546: idt = 4*(n-1);
4547: for (i=n-1; i>=0; i--) {
4548: /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4549: /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4550: PetscScalar *xtemp=&x[idt];
4551: MatScalar *ttemp=&t[idt];
4552: xtemp[3] = (PetscScalar)ttemp[3];
4553: xtemp[2] = (PetscScalar)ttemp[2];
4554: xtemp[1] = (PetscScalar)ttemp[1];
4555: xtemp[0] = (PetscScalar)ttemp[0];
4556: idt -= 4;
4557: }
4559: } /* End of artificial scope. */
4560: VecRestoreArray(bb,&b);
4561: VecRestoreArray(xx,&x);
4562: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
4563: SSE_SCOPE_END;
4564: return(0);
4565: }
4569: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
4570: {
4571: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
4572: int *aj=a->j;
4574: int *ai=a->i,n=a->mbs,*diag = a->diag;
4575: MatScalar *aa=a->a;
4576: PetscScalar *x,*b;
4579: SSE_SCOPE_BEGIN;
4580: /*
4581: Note: This code currently uses demotion of double
4582: to float when performing the mixed-mode computation.
4583: This may not be numerically reasonable for all applications.
4584: */
4585: PREFETCH_NTA(aa+16*ai[1]);
4587: VecGetArray(bb,&b);
4588: VecGetArray(xx,&x);
4589: {
4590: /* x will first be computed in single precision then promoted inplace to double */
4591: MatScalar *v,*t=(MatScalar*)x;
4592: int nz,i,idt,ai16;
4593: int jdx,idx;
4594: int *vi;
4595: /* Forward solve the lower triangular factor. */
4597: /* First block is the identity. */
4598: idx = 0;
4599: CONVERT_DOUBLE4_FLOAT4(t,b);
4600: v = aa + 16*ai[1];
4602: for (i=1; i<n; ) {
4603: PREFETCH_NTA(&v[8]);
4604: vi = aj + ai[i];
4605: nz = diag[i] - ai[i];
4606: idx += 4;
4608: /* Demote RHS from double to float. */
4609: CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4610: LOAD_PS(&t[idx],XMM7);
4612: while (nz--) {
4613: PREFETCH_NTA(&v[16]);
4614: jdx = 4*(*vi++);
4615: /* jdx = *vi++; */
4617: /* 4x4 Matrix-Vector product with negative accumulation: */
4618: SSE_INLINE_BEGIN_2(&t[jdx],v)
4619: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4621: /* First Column */
4622: SSE_COPY_PS(XMM0,XMM6)
4623: SSE_SHUFFLE(XMM0,XMM0,0x00)
4624: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4625: SSE_SUB_PS(XMM7,XMM0)
4627: /* Second Column */
4628: SSE_COPY_PS(XMM1,XMM6)
4629: SSE_SHUFFLE(XMM1,XMM1,0x55)
4630: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4631: SSE_SUB_PS(XMM7,XMM1)
4633: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4635: /* Third Column */
4636: SSE_COPY_PS(XMM2,XMM6)
4637: SSE_SHUFFLE(XMM2,XMM2,0xAA)
4638: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4639: SSE_SUB_PS(XMM7,XMM2)
4641: /* Fourth Column */
4642: SSE_COPY_PS(XMM3,XMM6)
4643: SSE_SHUFFLE(XMM3,XMM3,0xFF)
4644: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4645: SSE_SUB_PS(XMM7,XMM3)
4646: SSE_INLINE_END_2
4648: v += 16;
4649: }
4650: v = aa + 16*ai[++i];
4651: PREFETCH_NTA(v);
4652: STORE_PS(&t[idx],XMM7);
4653: }
4655: /* Backward solve the upper triangular factor.*/
4657: idt = 4*(n-1);
4658: ai16 = 16*diag[n-1];
4659: v = aa + ai16 + 16;
4660: for (i=n-1; i>=0; ) {
4661: PREFETCH_NTA(&v[8]);
4662: vi = aj + diag[i] + 1;
4663: nz = ai[i+1] - diag[i] - 1;
4665: LOAD_PS(&t[idt],XMM7);
4667: while (nz--) {
4668: PREFETCH_NTA(&v[16]);
4669: idx = 4*(*vi++);
4670: /* idx = *vi++; */
4672: /* 4x4 Matrix-Vector Product with negative accumulation: */
4673: SSE_INLINE_BEGIN_2(&t[idx],v)
4674: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4676: /* First Column */
4677: SSE_COPY_PS(XMM0,XMM6)
4678: SSE_SHUFFLE(XMM0,XMM0,0x00)
4679: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4680: SSE_SUB_PS(XMM7,XMM0)
4682: /* Second Column */
4683: SSE_COPY_PS(XMM1,XMM6)
4684: SSE_SHUFFLE(XMM1,XMM1,0x55)
4685: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4686: SSE_SUB_PS(XMM7,XMM1)
4688: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4690: /* Third Column */
4691: SSE_COPY_PS(XMM2,XMM6)
4692: SSE_SHUFFLE(XMM2,XMM2,0xAA)
4693: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4694: SSE_SUB_PS(XMM7,XMM2)
4696: /* Fourth Column */
4697: SSE_COPY_PS(XMM3,XMM6)
4698: SSE_SHUFFLE(XMM3,XMM3,0xFF)
4699: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4700: SSE_SUB_PS(XMM7,XMM3)
4701: SSE_INLINE_END_2
4702: v += 16;
4703: }
4704: v = aa + ai16;
4705: ai16 = 16*diag[--i];
4706: PREFETCH_NTA(aa+ai16+16);
4707: /*
4708: Scale the result by the diagonal 4x4 block,
4709: which was inverted as part of the factorization
4710: */
4711: SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4712: /* First Column */
4713: SSE_COPY_PS(XMM0,XMM7)
4714: SSE_SHUFFLE(XMM0,XMM0,0x00)
4715: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4717: /* Second Column */
4718: SSE_COPY_PS(XMM1,XMM7)
4719: SSE_SHUFFLE(XMM1,XMM1,0x55)
4720: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4721: SSE_ADD_PS(XMM0,XMM1)
4723: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4725: /* Third Column */
4726: SSE_COPY_PS(XMM2,XMM7)
4727: SSE_SHUFFLE(XMM2,XMM2,0xAA)
4728: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4729: SSE_ADD_PS(XMM0,XMM2)
4731: /* Fourth Column */
4732: SSE_COPY_PS(XMM3,XMM7)
4733: SSE_SHUFFLE(XMM3,XMM3,0xFF)
4734: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4735: SSE_ADD_PS(XMM0,XMM3)
4737: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4738: SSE_INLINE_END_3
4740: v = aa + ai16 + 16;
4741: idt -= 4;
4742: }
4744: /* Convert t from single precision back to double precision (inplace)*/
4745: idt = 4*(n-1);
4746: for (i=n-1; i>=0; i--) {
4747: /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4748: /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4749: PetscScalar *xtemp=&x[idt];
4750: MatScalar *ttemp=&t[idt];
4751: xtemp[3] = (PetscScalar)ttemp[3];
4752: xtemp[2] = (PetscScalar)ttemp[2];
4753: xtemp[1] = (PetscScalar)ttemp[1];
4754: xtemp[0] = (PetscScalar)ttemp[0];
4755: idt -= 4;
4756: }
4758: } /* End of artificial scope. */
4759: VecRestoreArray(bb,&b);
4760: VecRestoreArray(xx,&x);
4761: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
4762: SSE_SCOPE_END;
4763: return(0);
4764: }
4766: #endif
4770: PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
4771: {
4772: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
4773: IS iscol=a->col,isrow=a->row;
4774: PetscErrorCode ierr;
4775: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
4776: PetscInt i,nz,idx,idt,idc;
4777: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
4778: const MatScalar *aa=a->a,*v;
4779: PetscScalar *x,s1,s2,s3,x1,x2,x3,*t;
4780: const PetscScalar *b;
4783: VecGetArrayRead(bb,&b);
4784: VecGetArray(xx,&x);
4785: t = a->solve_work;
4787: ISGetIndices(isrow,&rout); r = rout;
4788: ISGetIndices(iscol,&cout); c = cout + (n-1);
4790: /* forward solve the lower triangular */
4791: idx = 3*(*r++);
4792: t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4793: for (i=1; i<n; i++) {
4794: v = aa + 9*ai[i];
4795: vi = aj + ai[i];
4796: nz = diag[i] - ai[i];
4797: idx = 3*(*r++);
4798: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4799: while (nz--) {
4800: idx = 3*(*vi++);
4801: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4802: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4803: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4804: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4805: v += 9;
4806: }
4807: idx = 3*i;
4808: t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4809: }
4810: /* backward solve the upper triangular */
4811: for (i=n-1; i>=0; i--) {
4812: v = aa + 9*diag[i] + 9;
4813: vi = aj + diag[i] + 1;
4814: nz = ai[i+1] - diag[i] - 1;
4815: idt = 3*i;
4816: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4817: while (nz--) {
4818: idx = 3*(*vi++);
4819: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4820: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4821: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4822: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4823: v += 9;
4824: }
4825: idc = 3*(*c--);
4826: v = aa + 9*diag[i];
4827: x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
4828: x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4829: x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4830: }
4831: ISRestoreIndices(isrow,&rout);
4832: ISRestoreIndices(iscol,&cout);
4833: VecRestoreArrayRead(bb,&b);
4834: VecRestoreArray(xx,&x);
4835: PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
4836: return(0);
4837: }
4841: PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
4842: {
4843: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
4844: IS iscol=a->col,isrow=a->row;
4845: PetscErrorCode ierr;
4846: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4847: PetscInt i,nz,idx,idt,idc,m;
4848: const PetscInt *r,*c,*rout,*cout;
4849: const MatScalar *aa=a->a,*v;
4850: PetscScalar *x,s1,s2,s3,x1,x2,x3,*t;
4851: const PetscScalar *b;
4854: VecGetArrayRead(bb,&b);
4855: VecGetArray(xx,&x);
4856: t = a->solve_work;
4858: ISGetIndices(isrow,&rout); r = rout;
4859: ISGetIndices(iscol,&cout); c = cout;
4861: /* forward solve the lower triangular */
4862: idx = 3*r[0];
4863: t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4864: for (i=1; i<n; i++) {
4865: v = aa + 9*ai[i];
4866: vi = aj + ai[i];
4867: nz = ai[i+1] - ai[i];
4868: idx = 3*r[i];
4869: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4870: for (m=0; m<nz; m++) {
4871: idx = 3*vi[m];
4872: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4873: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4874: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4875: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4876: v += 9;
4877: }
4878: idx = 3*i;
4879: t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4880: }
4881: /* backward solve the upper triangular */
4882: for (i=n-1; i>=0; i--) {
4883: v = aa + 9*(adiag[i+1]+1);
4884: vi = aj + adiag[i+1]+1;
4885: nz = adiag[i] - adiag[i+1] - 1;
4886: idt = 3*i;
4887: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4888: for (m=0; m<nz; m++) {
4889: idx = 3*vi[m];
4890: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4891: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4892: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4893: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4894: v += 9;
4895: }
4896: idc = 3*c[i];
4897: x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
4898: x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4899: x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4900: }
4901: ISRestoreIndices(isrow,&rout);
4902: ISRestoreIndices(iscol,&cout);
4903: VecRestoreArrayRead(bb,&b);
4904: VecRestoreArray(xx,&x);
4905: PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
4906: return(0);
4907: }
4909: /*
4910: Special case where the matrix was ILU(0) factored in the natural
4911: ordering. This eliminates the need for the column and row permutation.
4912: */
4915: PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4916: {
4917: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
4918: const PetscInt n =a->mbs,*ai=a->i,*aj=a->j;
4919: PetscErrorCode ierr;
4920: const PetscInt *diag = a->diag,*vi;
4921: const MatScalar *aa =a->a,*v;
4922: PetscScalar *x,s1,s2,s3,x1,x2,x3;
4923: const PetscScalar *b;
4924: PetscInt jdx,idt,idx,nz,i;
4927: VecGetArrayRead(bb,&b);
4928: VecGetArray(xx,&x);
4930: /* forward solve the lower triangular */
4931: idx = 0;
4932: x[0] = b[0]; x[1] = b[1]; x[2] = b[2];
4933: for (i=1; i<n; i++) {
4934: v = aa + 9*ai[i];
4935: vi = aj + ai[i];
4936: nz = diag[i] - ai[i];
4937: idx += 3;
4938: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4939: while (nz--) {
4940: jdx = 3*(*vi++);
4941: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4942: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4943: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4944: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4945: v += 9;
4946: }
4947: x[idx] = s1;
4948: x[1+idx] = s2;
4949: x[2+idx] = s3;
4950: }
4951: /* backward solve the upper triangular */
4952: for (i=n-1; i>=0; i--) {
4953: v = aa + 9*diag[i] + 9;
4954: vi = aj + diag[i] + 1;
4955: nz = ai[i+1] - diag[i] - 1;
4956: idt = 3*i;
4957: s1 = x[idt]; s2 = x[1+idt];
4958: s3 = x[2+idt];
4959: while (nz--) {
4960: idx = 3*(*vi++);
4961: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx];
4962: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4963: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4964: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4965: v += 9;
4966: }
4967: v = aa + 9*diag[i];
4968: x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
4969: x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4970: x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4971: }
4973: VecRestoreArrayRead(bb,&b);
4974: VecRestoreArray(xx,&x);
4975: PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
4976: return(0);
4977: }
4981: PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
4982: {
4983: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
4984: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4985: PetscErrorCode ierr;
4986: PetscInt i,k,nz,idx,jdx,idt;
4987: const PetscInt bs = A->rmap->bs,bs2 = a->bs2;
4988: const MatScalar *aa=a->a,*v;
4989: PetscScalar *x;
4990: const PetscScalar *b;
4991: PetscScalar s1,s2,s3,x1,x2,x3;
4994: VecGetArrayRead(bb,&b);
4995: VecGetArray(xx,&x);
4996: /* forward solve the lower triangular */
4997: idx = 0;
4998: x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4999: for (i=1; i<n; i++) {
5000: v = aa + bs2*ai[i];
5001: vi = aj + ai[i];
5002: nz = ai[i+1] - ai[i];
5003: idx = bs*i;
5004: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];
5005: for (k=0; k<nz; k++) {
5006: jdx = bs*vi[k];
5007: x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
5008: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
5009: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
5010: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
5012: v += bs2;
5013: }
5015: x[idx] = s1;
5016: x[1+idx] = s2;
5017: x[2+idx] = s3;
5018: }
5020: /* backward solve the upper triangular */
5021: for (i=n-1; i>=0; i--) {
5022: v = aa + bs2*(adiag[i+1]+1);
5023: vi = aj + adiag[i+1]+1;
5024: nz = adiag[i] - adiag[i+1]-1;
5025: idt = bs*i;
5026: s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];
5028: for (k=0; k<nz; k++) {
5029: idx = bs*vi[k];
5030: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
5031: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
5032: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
5033: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
5035: v += bs2;
5036: }
5037: /* x = inv_diagonal*x */
5038: x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
5039: x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
5040: x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
5042: }
5044: VecRestoreArrayRead(bb,&b);
5045: VecRestoreArray(xx,&x);
5046: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
5047: return(0);
5048: }
5052: PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
5053: {
5054: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
5055: IS iscol=a->col,isrow=a->row;
5056: PetscErrorCode ierr;
5057: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5058: PetscInt i,nz,idx,idt,idc;
5059: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
5060: const MatScalar *aa=a->a,*v;
5061: PetscScalar *x,s1,s2,x1,x2,*t;
5062: const PetscScalar *b;
5065: VecGetArrayRead(bb,&b);
5066: VecGetArray(xx,&x);
5067: t = a->solve_work;
5069: ISGetIndices(isrow,&rout); r = rout;
5070: ISGetIndices(iscol,&cout); c = cout + (n-1);
5072: /* forward solve the lower triangular */
5073: idx = 2*(*r++);
5074: t[0] = b[idx]; t[1] = b[1+idx];
5075: for (i=1; i<n; i++) {
5076: v = aa + 4*ai[i];
5077: vi = aj + ai[i];
5078: nz = diag[i] - ai[i];
5079: idx = 2*(*r++);
5080: s1 = b[idx]; s2 = b[1+idx];
5081: while (nz--) {
5082: idx = 2*(*vi++);
5083: x1 = t[idx]; x2 = t[1+idx];
5084: s1 -= v[0]*x1 + v[2]*x2;
5085: s2 -= v[1]*x1 + v[3]*x2;
5086: v += 4;
5087: }
5088: idx = 2*i;
5089: t[idx] = s1; t[1+idx] = s2;
5090: }
5091: /* backward solve the upper triangular */
5092: for (i=n-1; i>=0; i--) {
5093: v = aa + 4*diag[i] + 4;
5094: vi = aj + diag[i] + 1;
5095: nz = ai[i+1] - diag[i] - 1;
5096: idt = 2*i;
5097: s1 = t[idt]; s2 = t[1+idt];
5098: while (nz--) {
5099: idx = 2*(*vi++);
5100: x1 = t[idx]; x2 = t[1+idx];
5101: s1 -= v[0]*x1 + v[2]*x2;
5102: s2 -= v[1]*x1 + v[3]*x2;
5103: v += 4;
5104: }
5105: idc = 2*(*c--);
5106: v = aa + 4*diag[i];
5107: x[idc] = t[idt] = v[0]*s1 + v[2]*s2;
5108: x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
5109: }
5110: ISRestoreIndices(isrow,&rout);
5111: ISRestoreIndices(iscol,&cout);
5112: VecRestoreArrayRead(bb,&b);
5113: VecRestoreArray(xx,&x);
5114: PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
5115: return(0);
5116: }
5120: PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
5121: {
5122: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
5123: IS iscol=a->col,isrow=a->row;
5124: PetscErrorCode ierr;
5125: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5126: PetscInt i,nz,idx,jdx,idt,idc,m;
5127: const PetscInt *r,*c,*rout,*cout;
5128: const MatScalar *aa=a->a,*v;
5129: PetscScalar *x,s1,s2,x1,x2,*t;
5130: const PetscScalar *b;
5133: VecGetArrayRead(bb,&b);
5134: VecGetArray(xx,&x);
5135: t = a->solve_work;
5137: ISGetIndices(isrow,&rout); r = rout;
5138: ISGetIndices(iscol,&cout); c = cout;
5140: /* forward solve the lower triangular */
5141: idx = 2*r[0];
5142: t[0] = b[idx]; t[1] = b[1+idx];
5143: for (i=1; i<n; i++) {
5144: v = aa + 4*ai[i];
5145: vi = aj + ai[i];
5146: nz = ai[i+1] - ai[i];
5147: idx = 2*r[i];
5148: s1 = b[idx]; s2 = b[1+idx];
5149: for (m=0; m<nz; m++) {
5150: jdx = 2*vi[m];
5151: x1 = t[jdx]; x2 = t[1+jdx];
5152: s1 -= v[0]*x1 + v[2]*x2;
5153: s2 -= v[1]*x1 + v[3]*x2;
5154: v += 4;
5155: }
5156: idx = 2*i;
5157: t[idx] = s1; t[1+idx] = s2;
5158: }
5159: /* backward solve the upper triangular */
5160: for (i=n-1; i>=0; i--) {
5161: v = aa + 4*(adiag[i+1]+1);
5162: vi = aj + adiag[i+1]+1;
5163: nz = adiag[i] - adiag[i+1] - 1;
5164: idt = 2*i;
5165: s1 = t[idt]; s2 = t[1+idt];
5166: for (m=0; m<nz; m++) {
5167: idx = 2*vi[m];
5168: x1 = t[idx]; x2 = t[1+idx];
5169: s1 -= v[0]*x1 + v[2]*x2;
5170: s2 -= v[1]*x1 + v[3]*x2;
5171: v += 4;
5172: }
5173: idc = 2*c[i];
5174: x[idc] = t[idt] = v[0]*s1 + v[2]*s2;
5175: x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
5176: }
5177: ISRestoreIndices(isrow,&rout);
5178: ISRestoreIndices(iscol,&cout);
5179: VecRestoreArrayRead(bb,&b);
5180: VecRestoreArray(xx,&x);
5181: PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
5182: return(0);
5183: }
5185: /*
5186: Special case where the matrix was ILU(0) factored in the natural
5187: ordering. This eliminates the need for the column and row permutation.
5188: */
5191: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
5192: {
5193: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
5194: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5195: PetscErrorCode ierr;
5196: const MatScalar *aa=a->a,*v;
5197: PetscScalar *x,s1,s2,x1,x2;
5198: const PetscScalar *b;
5199: PetscInt jdx,idt,idx,nz,i;
5202: VecGetArrayRead(bb,&b);
5203: VecGetArray(xx,&x);
5205: /* forward solve the lower triangular */
5206: idx = 0;
5207: x[0] = b[0]; x[1] = b[1];
5208: for (i=1; i<n; i++) {
5209: v = aa + 4*ai[i];
5210: vi = aj + ai[i];
5211: nz = diag[i] - ai[i];
5212: idx += 2;
5213: s1 = b[idx];s2 = b[1+idx];
5214: while (nz--) {
5215: jdx = 2*(*vi++);
5216: x1 = x[jdx];x2 = x[1+jdx];
5217: s1 -= v[0]*x1 + v[2]*x2;
5218: s2 -= v[1]*x1 + v[3]*x2;
5219: v += 4;
5220: }
5221: x[idx] = s1;
5222: x[1+idx] = s2;
5223: }
5224: /* backward solve the upper triangular */
5225: for (i=n-1; i>=0; i--) {
5226: v = aa + 4*diag[i] + 4;
5227: vi = aj + diag[i] + 1;
5228: nz = ai[i+1] - diag[i] - 1;
5229: idt = 2*i;
5230: s1 = x[idt]; s2 = x[1+idt];
5231: while (nz--) {
5232: idx = 2*(*vi++);
5233: x1 = x[idx]; x2 = x[1+idx];
5234: s1 -= v[0]*x1 + v[2]*x2;
5235: s2 -= v[1]*x1 + v[3]*x2;
5236: v += 4;
5237: }
5238: v = aa + 4*diag[i];
5239: x[idt] = v[0]*s1 + v[2]*s2;
5240: x[1+idt] = v[1]*s1 + v[3]*s2;
5241: }
5243: VecRestoreArrayRead(bb,&b);
5244: VecRestoreArray(xx,&x);
5245: PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
5246: return(0);
5247: }
5251: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
5252: {
5253: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
5254: const PetscInt n = a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5255: PetscInt i,k,nz,idx,idt,jdx;
5256: PetscErrorCode ierr;
5257: const MatScalar *aa=a->a,*v;
5258: PetscScalar *x,s1,s2,x1,x2;
5259: const PetscScalar *b;
5262: VecGetArrayRead(bb,&b);
5263: VecGetArray(xx,&x);
5264: /* forward solve the lower triangular */
5265: idx = 0;
5266: x[0] = b[idx]; x[1] = b[1+idx];
5267: for (i=1; i<n; i++) {
5268: v = aa + 4*ai[i];
5269: vi = aj + ai[i];
5270: nz = ai[i+1] - ai[i];
5271: idx = 2*i;
5272: s1 = b[idx];s2 = b[1+idx];
5273: PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
5274: PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
5275: for (k=0; k<nz; k++) {
5276: jdx = 2*vi[k];
5277: x1 = x[jdx];x2 = x[1+jdx];
5278: s1 -= v[0]*x1 + v[2]*x2;
5279: s2 -= v[1]*x1 + v[3]*x2;
5280: v += 4;
5281: }
5282: x[idx] = s1;
5283: x[1+idx] = s2;
5284: }
5286: /* backward solve the upper triangular */
5287: for (i=n-1; i>=0; i--) {
5288: v = aa + 4*(adiag[i+1]+1);
5289: vi = aj + adiag[i+1]+1;
5290: nz = adiag[i] - adiag[i+1]-1;
5291: idt = 2*i;
5292: s1 = x[idt]; s2 = x[1+idt];
5293: PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
5294: PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
5295: for (k=0; k<nz; k++) {
5296: idx = 2*vi[k];
5297: x1 = x[idx]; x2 = x[1+idx];
5298: s1 -= v[0]*x1 + v[2]*x2;
5299: s2 -= v[1]*x1 + v[3]*x2;
5300: v += 4;
5301: }
5302: /* x = inv_diagonal*x */
5303: x[idt] = v[0]*s1 + v[2]*s2;
5304: x[1+idt] = v[1]*s1 + v[3]*s2;
5305: }
5307: VecRestoreArrayRead(bb,&b);
5308: VecRestoreArray(xx,&x);
5309: PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
5310: return(0);
5311: }
5315: PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
5316: {
5317: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
5318: IS iscol=a->col,isrow=a->row;
5319: PetscErrorCode ierr;
5320: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5321: PetscInt i,nz;
5322: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
5323: const MatScalar *aa=a->a,*v;
5324: PetscScalar *x,s1,*t;
5325: const PetscScalar *b;
5328: if (!n) return(0);
5330: VecGetArrayRead(bb,&b);
5331: VecGetArray(xx,&x);
5332: t = a->solve_work;
5334: ISGetIndices(isrow,&rout); r = rout;
5335: ISGetIndices(iscol,&cout); c = cout + (n-1);
5337: /* forward solve the lower triangular */
5338: t[0] = b[*r++];
5339: for (i=1; i<n; i++) {
5340: v = aa + ai[i];
5341: vi = aj + ai[i];
5342: nz = diag[i] - ai[i];
5343: s1 = b[*r++];
5344: while (nz--) {
5345: s1 -= (*v++)*t[*vi++];
5346: }
5347: t[i] = s1;
5348: }
5349: /* backward solve the upper triangular */
5350: for (i=n-1; i>=0; i--) {
5351: v = aa + diag[i] + 1;
5352: vi = aj + diag[i] + 1;
5353: nz = ai[i+1] - diag[i] - 1;
5354: s1 = t[i];
5355: while (nz--) {
5356: s1 -= (*v++)*t[*vi++];
5357: }
5358: x[*c--] = t[i] = aa[diag[i]]*s1;
5359: }
5361: ISRestoreIndices(isrow,&rout);
5362: ISRestoreIndices(iscol,&cout);
5363: VecRestoreArrayRead(bb,&b);
5364: VecRestoreArray(xx,&x);
5365: PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);
5366: return(0);
5367: }
5371: PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
5372: {
5373: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
5374: IS iscol = a->col,isrow = a->row;
5375: PetscErrorCode ierr;
5376: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz;
5377: const PetscInt *rout,*cout,*r,*c;
5378: PetscScalar *x,*tmp,sum;
5379: const PetscScalar *b;
5380: const MatScalar *aa = a->a,*v;
5383: if (!n) return(0);
5385: VecGetArrayRead(bb,&b);
5386: VecGetArray(xx,&x);
5387: tmp = a->solve_work;
5389: ISGetIndices(isrow,&rout); r = rout;
5390: ISGetIndices(iscol,&cout); c = cout;
5392: /* forward solve the lower triangular */
5393: tmp[0] = b[r[0]];
5394: v = aa;
5395: vi = aj;
5396: for (i=1; i<n; i++) {
5397: nz = ai[i+1] - ai[i];
5398: sum = b[r[i]];
5399: PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
5400: tmp[i] = sum;
5401: v += nz; vi += nz;
5402: }
5404: /* backward solve the upper triangular */
5405: for (i=n-1; i>=0; i--) {
5406: v = aa + adiag[i+1]+1;
5407: vi = aj + adiag[i+1]+1;
5408: nz = adiag[i]-adiag[i+1]-1;
5409: sum = tmp[i];
5410: PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
5411: x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */
5412: }
5414: ISRestoreIndices(isrow,&rout);
5415: ISRestoreIndices(iscol,&cout);
5416: VecRestoreArrayRead(bb,&b);
5417: VecRestoreArray(xx,&x);
5418: PetscLogFlops(2*a->nz - A->cmap->n);
5419: return(0);
5420: }
5422: /*
5423: Special case where the matrix was ILU(0) factored in the natural
5424: ordering. This eliminates the need for the column and row permutation.
5425: */
5428: PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
5429: {
5430: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
5431: const PetscInt n = a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5432: PetscErrorCode ierr;
5433: const MatScalar *aa=a->a,*v;
5434: PetscScalar *x;
5435: const PetscScalar *b;
5436: PetscScalar s1,x1;
5437: PetscInt jdx,idt,idx,nz,i;
5440: VecGetArrayRead(bb,&b);
5441: VecGetArray(xx,&x);
5443: /* forward solve the lower triangular */
5444: idx = 0;
5445: x[0] = b[0];
5446: for (i=1; i<n; i++) {
5447: v = aa + ai[i];
5448: vi = aj + ai[i];
5449: nz = diag[i] - ai[i];
5450: idx += 1;
5451: s1 = b[idx];
5452: while (nz--) {
5453: jdx = *vi++;
5454: x1 = x[jdx];
5455: s1 -= v[0]*x1;
5456: v += 1;
5457: }
5458: x[idx] = s1;
5459: }
5460: /* backward solve the upper triangular */
5461: for (i=n-1; i>=0; i--) {
5462: v = aa + diag[i] + 1;
5463: vi = aj + diag[i] + 1;
5464: nz = ai[i+1] - diag[i] - 1;
5465: idt = i;
5466: s1 = x[idt];
5467: while (nz--) {
5468: idx = *vi++;
5469: x1 = x[idx];
5470: s1 -= v[0]*x1;
5471: v += 1;
5472: }
5473: v = aa + diag[i];
5474: x[idt] = v[0]*s1;
5475: }
5476: VecRestoreArrayRead(bb,&b);
5477: VecRestoreArray(xx,&x);
5478: PetscLogFlops(2.0*(a->nz) - A->cmap->n);
5479: return(0);
5480: }
5485: PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
5486: {
5487: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
5488: PetscErrorCode ierr;
5489: const PetscInt n = a->mbs,*ai = a->i,*aj = a->j,*adiag = a->diag,*vi;
5490: PetscScalar *x,sum;
5491: const PetscScalar *b;
5492: const MatScalar *aa = a->a,*v;
5493: PetscInt i,nz;
5496: if (!n) return(0);
5498: VecGetArrayRead(bb,&b);
5499: VecGetArray(xx,&x);
5501: /* forward solve the lower triangular */
5502: x[0] = b[0];
5503: v = aa;
5504: vi = aj;
5505: for (i=1; i<n; i++) {
5506: nz = ai[i+1] - ai[i];
5507: sum = b[i];
5508: PetscSparseDenseMinusDot(sum,x,v,vi,nz);
5509: v += nz;
5510: vi += nz;
5511: x[i] = sum;
5512: }
5514: /* backward solve the upper triangular */
5515: for (i=n-1; i>=0; i--) {
5516: v = aa + adiag[i+1] + 1;
5517: vi = aj + adiag[i+1] + 1;
5518: nz = adiag[i] - adiag[i+1]-1;
5519: sum = x[i];
5520: PetscSparseDenseMinusDot(sum,x,v,vi,nz);
5521: x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */
5522: }
5524: PetscLogFlops(2.0*a->nz - A->cmap->n);
5525: VecRestoreArrayRead(bb,&b);
5526: VecRestoreArray(xx,&x);
5527: return(0);
5528: }
5530: /* ----------------------------------------------------------------*/
5531: extern PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscBool);
5535: /*
5536: This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes
5537: */
5538: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
5539: {
5540: Mat C =B;
5541: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
5542: PetscErrorCode ierr;
5543: PetscInt i,j,k,ipvt[15];
5544: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj;
5545: PetscInt nz,nzL,row;
5546: MatScalar *rtmp,*pc,*mwork,*pv,*vv,work[225];
5547: const MatScalar *v,*aa=a->a;
5548: PetscInt bs2 = a->bs2,bs=A->rmap->bs,flg;
5549: PetscInt sol_ver;
5552: PetscOptionsGetInt(((PetscObject)A)->prefix,"-sol_ver",&sol_ver,NULL);
5554: /* generate work space needed by the factorization */
5555: PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);
5556: PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));
5558: for (i=0; i<n; i++) {
5559: /* zero rtmp */
5560: /* L part */
5561: nz = bi[i+1] - bi[i];
5562: bjtmp = bj + bi[i];
5563: for (j=0; j<nz; j++) {
5564: PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
5565: }
5567: /* U part */
5568: nz = bdiag[i] - bdiag[i+1];
5569: bjtmp = bj + bdiag[i+1]+1;
5570: for (j=0; j<nz; j++) {
5571: PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
5572: }
5574: /* load in initial (unfactored row) */
5575: nz = ai[i+1] - ai[i];
5576: ajtmp = aj + ai[i];
5577: v = aa + bs2*ai[i];
5578: for (j=0; j<nz; j++) {
5579: PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));
5580: }
5582: /* elimination */
5583: bjtmp = bj + bi[i];
5584: nzL = bi[i+1] - bi[i];
5585: for (k=0; k < nzL; k++) {
5586: row = bjtmp[k];
5587: pc = rtmp + bs2*row;
5588: for (flg=0,j=0; j<bs2; j++) {
5589: if (pc[j]!=0.0) {
5590: flg = 1;
5591: break;
5592: }
5593: }
5594: if (flg) {
5595: pv = b->a + bs2*bdiag[row];
5596: PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork);
5597: /*PetscKernel_A_gets_A_times_B_15(pc,pv,mwork);*/
5598: pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5599: pv = b->a + bs2*(bdiag[row+1]+1);
5600: nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5601: for (j=0; j<nz; j++) {
5602: vv = rtmp + bs2*pj[j];
5603: PetscKernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv);
5604: /* PetscKernel_A_gets_A_minus_B_times_C_15(vv,pc,pv); */
5605: pv += bs2;
5606: }
5607: PetscLogFlops(2*bs2*bs*(nz+1)-bs2); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5608: }
5609: }
5611: /* finished row so stick it into b->a */
5612: /* L part */
5613: pv = b->a + bs2*bi[i];
5614: pj = b->j + bi[i];
5615: nz = bi[i+1] - bi[i];
5616: for (j=0; j<nz; j++) {
5617: PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
5618: }
5620: /* Mark diagonal and invert diagonal for simplier triangular solves */
5621: pv = b->a + bs2*bdiag[i];
5622: pj = b->j + bdiag[i];
5623: PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));
5624: /* PetscKernel_A_gets_inverse_A(bs,pv,pivots,work); */
5625: PetscKernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftamount);
5627: /* U part */
5628: pv = b->a + bs2*(bdiag[i+1]+1);
5629: pj = b->j + bdiag[i+1]+1;
5630: nz = bdiag[i] - bdiag[i+1] - 1;
5631: for (j=0; j<nz; j++) {
5632: PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
5633: }
5634: }
5636: PetscFree2(rtmp,mwork);
5638: C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1;
5639: C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering;
5640: C->assembled = PETSC_TRUE;
5642: PetscLogFlops(1.333333333333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
5643: return(0);
5644: }
5648: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info)
5649: {
5650: Mat C =B;
5651: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
5652: IS isrow = b->row,isicol = b->icol;
5654: const PetscInt *r,*ic;
5655: PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
5656: PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
5657: MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
5658: PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
5659: MatScalar *v_work;
5660: PetscBool col_identity,row_identity,both_identity;
5663: ISGetIndices(isrow,&r);
5664: ISGetIndices(isicol,&ic);
5666: PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);
5667: PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));
5669: /* generate work space needed by dense LU factorization */
5670: PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);
5672: for (i=0; i<n; i++) {
5673: /* zero rtmp */
5674: /* L part */
5675: nz = bi[i+1] - bi[i];
5676: bjtmp = bj + bi[i];
5677: for (j=0; j<nz; j++) {
5678: PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
5679: }
5681: /* U part */
5682: nz = bdiag[i] - bdiag[i+1];
5683: bjtmp = bj + bdiag[i+1]+1;
5684: for (j=0; j<nz; j++) {
5685: PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
5686: }
5688: /* load in initial (unfactored row) */
5689: nz = ai[r[i]+1] - ai[r[i]];
5690: ajtmp = aj + ai[r[i]];
5691: v = aa + bs2*ai[r[i]];
5692: for (j=0; j<nz; j++) {
5693: PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));
5694: }
5696: /* elimination */
5697: bjtmp = bj + bi[i];
5698: nzL = bi[i+1] - bi[i];
5699: for (k=0; k < nzL; k++) {
5700: row = bjtmp[k];
5701: pc = rtmp + bs2*row;
5702: for (flg=0,j=0; j<bs2; j++) {
5703: if (pc[j]!=0.0) {
5704: flg = 1;
5705: break;
5706: }
5707: }
5708: if (flg) {
5709: pv = b->a + bs2*bdiag[row];
5710: PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
5711: pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5712: pv = b->a + bs2*(bdiag[row+1]+1);
5713: nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5714: for (j=0; j<nz; j++) {
5715: PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
5716: }
5717: PetscLogFlops(2*bs2*bs*(nz+1)-bs2); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5718: }
5719: }
5721: /* finished row so stick it into b->a */
5722: /* L part */
5723: pv = b->a + bs2*bi[i];
5724: pj = b->j + bi[i];
5725: nz = bi[i+1] - bi[i];
5726: for (j=0; j<nz; j++) {
5727: PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
5728: }
5730: /* Mark diagonal and invert diagonal for simplier triangular solves */
5731: pv = b->a + bs2*bdiag[i];
5732: pj = b->j + bdiag[i];
5733: /* if (*pj != i)SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
5734: PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));
5735: PetscKernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);
5737: /* U part */
5738: pv = b->a + bs2*(bdiag[i+1]+1);
5739: pj = b->j + bdiag[i+1]+1;
5740: nz = bdiag[i] - bdiag[i+1] - 1;
5741: for (j=0; j<nz; j++) {
5742: PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
5743: }
5744: }
5746: PetscFree(rtmp);
5747: PetscFree3(v_work,mwork,v_pivots);
5748: ISRestoreIndices(isicol,&ic);
5749: ISRestoreIndices(isrow,&r);
5751: ISIdentity(isrow,&row_identity);
5752: ISIdentity(isicol,&col_identity);
5754: both_identity = (PetscBool) (row_identity && col_identity);
5755: if (both_identity) {
5756: C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
5757: } else {
5758: C->ops->solve = MatSolve_SeqBAIJ_N;
5759: }
5760: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N;
5762: C->assembled = PETSC_TRUE;
5764: PetscLogFlops(1.333333333333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
5765: return(0);
5766: }
5768: /*
5769: ilu(0) with natural ordering under new data structure.
5770: See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description
5771: because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace().
5772: */
5776: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5777: {
5779: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
5781: PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
5782: PetscInt i,j,nz,*bi,*bj,*bdiag,bi_temp;
5785: MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);
5786: b = (Mat_SeqBAIJ*)(fact)->data;
5788: /* allocate matrix arrays for new data structure */
5789: PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);
5790: PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));
5792: b->singlemalloc = PETSC_TRUE;
5793: b->free_a = PETSC_TRUE;
5794: b->free_ij = PETSC_TRUE;
5795: fact->preallocated = PETSC_TRUE;
5796: fact->assembled = PETSC_TRUE;
5797: if (!b->diag) {
5798: PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);
5799: PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));
5800: }
5801: bdiag = b->diag;
5803: if (n > 0) {
5804: PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));
5805: }
5807: /* set bi and bj with new data structure */
5808: bi = b->i;
5809: bj = b->j;
5811: /* L part */
5812: bi[0] = 0;
5813: for (i=0; i<n; i++) {
5814: nz = adiag[i] - ai[i];
5815: bi[i+1] = bi[i] + nz;
5816: aj = a->j + ai[i];
5817: for (j=0; j<nz; j++) {
5818: *bj = aj[j]; bj++;
5819: }
5820: }
5822: /* U part */
5823: bi_temp = bi[n];
5824: bdiag[n] = bi[n]-1;
5825: for (i=n-1; i>=0; i--) {
5826: nz = ai[i+1] - adiag[i] - 1;
5827: bi_temp = bi_temp + nz + 1;
5828: aj = a->j + adiag[i] + 1;
5829: for (j=0; j<nz; j++) {
5830: *bj = aj[j]; bj++;
5831: }
5832: /* diag[i] */
5833: *bj = i; bj++;
5834: bdiag[i] = bi_temp - 1;
5835: }
5836: return(0);
5837: }
5841: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5842: {
5843: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
5844: IS isicol;
5845: PetscErrorCode ierr;
5846: const PetscInt *r,*ic;
5847: PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d;
5848: PetscInt *bi,*cols,nnz,*cols_lvl;
5849: PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
5850: PetscInt i,levels,diagonal_fill;
5851: PetscBool col_identity,row_identity,both_identity;
5852: PetscReal f;
5853: PetscInt nlnk,*lnk,*lnk_lvl=NULL;
5854: PetscBT lnkbt;
5855: PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr;
5856: PetscFreeSpaceList free_space =NULL,current_space=NULL;
5857: PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
5858: PetscBool missing;
5859: PetscInt bs=A->rmap->bs,bs2=a->bs2;
5862: if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
5863: if (bs>1) { /* check shifttype */
5864: if (info->shifttype == MAT_SHIFT_NONZERO || info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE)
5865: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix");
5866: }
5868: MatMissingDiagonal(A,&missing,&d);
5869: if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
5871: f = info->fill;
5872: levels = (PetscInt)info->levels;
5873: diagonal_fill = (PetscInt)info->diagonal_fill;
5875: ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
5877: ISIdentity(isrow,&row_identity);
5878: ISIdentity(iscol,&col_identity);
5880: both_identity = (PetscBool) (row_identity && col_identity);
5882: if (!levels && both_identity) {
5883: /* special case: ilu(0) with natural ordering */
5884: MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);
5885: MatSeqBAIJSetNumericFactorization(fact,both_identity);
5887: fact->factortype = MAT_FACTOR_ILU;
5888: (fact)->info.factor_mallocs = 0;
5889: (fact)->info.fill_ratio_given = info->fill;
5890: (fact)->info.fill_ratio_needed = 1.0;
5892: b = (Mat_SeqBAIJ*)(fact)->data;
5893: b->row = isrow;
5894: b->col = iscol;
5895: b->icol = isicol;
5896: PetscObjectReference((PetscObject)isrow);
5897: PetscObjectReference((PetscObject)iscol);
5898: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5900: PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);
5901: return(0);
5902: }
5904: ISGetIndices(isrow,&r);
5905: ISGetIndices(isicol,&ic);
5907: /* get new row pointers */
5908: PetscMalloc((n+1)*sizeof(PetscInt),&bi);
5909: bi[0] = 0;
5910: /* bdiag is location of diagonal in factor */
5911: PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);
5912: bdiag[0] = 0;
5914: PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);
5916: /* create a linked list for storing column indices of the active row */
5917: nlnk = n + 1;
5918: PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);
5920: /* initial FreeSpace size is f*(ai[n]+1) */
5921: PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);
5922: current_space = free_space;
5923: PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);
5924: current_space_lvl = free_space_lvl;
5926: for (i=0; i<n; i++) {
5927: nzi = 0;
5928: /* copy current row into linked list */
5929: nnz = ai[r[i]+1] - ai[r[i]];
5930: if (!nnz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
5931: cols = aj + ai[r[i]];
5932: lnk[i] = -1; /* marker to indicate if diagonal exists */
5933: PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);
5934: nzi += nlnk;
5936: /* make sure diagonal entry is included */
5937: if (diagonal_fill && lnk[i] == -1) {
5938: fm = n;
5939: while (lnk[fm] < i) fm = lnk[fm];
5940: lnk[i] = lnk[fm]; /* insert diagonal into linked list */
5941: lnk[fm] = i;
5942: lnk_lvl[i] = 0;
5943: nzi++; dcount++;
5944: }
5946: /* add pivot rows into the active row */
5947: nzbd = 0;
5948: prow = lnk[n];
5949: while (prow < i) {
5950: nnz = bdiag[prow];
5951: cols = bj_ptr[prow] + nnz + 1;
5952: cols_lvl = bjlvl_ptr[prow] + nnz + 1;
5953: nnz = bi[prow+1] - bi[prow] - nnz - 1;
5955: PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);
5956: nzi += nlnk;
5957: prow = lnk[prow];
5958: nzbd++;
5959: }
5960: bdiag[i] = nzbd;
5961: bi[i+1] = bi[i] + nzi;
5963: /* if free space is not available, make more free space */
5964: if (current_space->local_remaining<nzi) {
5965: nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
5966: PetscFreeSpaceGet(nnz,¤t_space);
5967: PetscFreeSpaceGet(nnz,¤t_space_lvl);
5968: reallocs++;
5969: }
5971: /* copy data into free_space and free_space_lvl, then initialize lnk */
5972: PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);
5974: bj_ptr[i] = current_space->array;
5975: bjlvl_ptr[i] = current_space_lvl->array;
5977: /* make sure the active row i has diagonal entry */
5978: if (*(bj_ptr[i]+bdiag[i]) != i) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
5980: current_space->array += nzi;
5981: current_space->local_used += nzi;
5982: current_space->local_remaining -= nzi;
5984: current_space_lvl->array += nzi;
5985: current_space_lvl->local_used += nzi;
5986: current_space_lvl->local_remaining -= nzi;
5987: }
5989: ISRestoreIndices(isrow,&r);
5990: ISRestoreIndices(isicol,&ic);
5992: /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
5993: PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);
5994: PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);
5996: PetscIncompleteLLDestroy(lnk,lnkbt);
5997: PetscFreeSpaceDestroy(free_space_lvl);
5998: PetscFree2(bj_ptr,bjlvl_ptr);
6000: #if defined(PETSC_USE_INFO)
6001: {
6002: PetscReal af = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
6003: PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);
6004: PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);
6005: PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);
6006: PetscInfo(A,"for best performance.\n");
6007: if (diagonal_fill) {
6008: PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);
6009: }
6010: }
6011: #endif
6013: /* put together the new matrix */
6014: MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,NULL);
6015: PetscLogObjectParent(fact,isicol);
6017: b = (Mat_SeqBAIJ*)(fact)->data;
6018: b->free_a = PETSC_TRUE;
6019: b->free_ij = PETSC_TRUE;
6020: b->singlemalloc = PETSC_FALSE;
6022: PetscMalloc((bs2*(bdiag[0]+1))*sizeof(MatScalar),&b->a);
6024: b->j = bj;
6025: b->i = bi;
6026: b->diag = bdiag;
6027: b->free_diag = PETSC_TRUE;
6028: b->ilen = 0;
6029: b->imax = 0;
6030: b->row = isrow;
6031: b->col = iscol;
6032: PetscObjectReference((PetscObject)isrow);
6033: PetscObjectReference((PetscObject)iscol);
6034: b->icol = isicol;
6036: PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);
6037: /* In b structure: Free imax, ilen, old a, old j.
6038: Allocate bdiag, solve_work, new a, new j */
6039: PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));
6040: b->maxnz = b->nz = bdiag[0]+1;
6042: fact->info.factor_mallocs = reallocs;
6043: fact->info.fill_ratio_given = f;
6044: fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
6046: MatSeqBAIJSetNumericFactorization(fact,both_identity);
6047: return(0);
6048: }
6050: /*
6051: This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
6052: except that the data structure of Mat_SeqAIJ is slightly different.
6053: Not a good example of code reuse.
6054: */
6057: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
6058: {
6059: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
6060: IS isicol;
6062: const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
6063: PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
6064: PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
6065: PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
6066: PetscBool col_identity,row_identity,both_identity,flg;
6067: PetscReal f;
6070: MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);
6071: if (flg) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
6073: f = info->fill;
6074: levels = (PetscInt)info->levels;
6075: diagonal_fill = (PetscInt)info->diagonal_fill;
6077: ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
6079: ISIdentity(isrow,&row_identity);
6080: ISIdentity(iscol,&col_identity);
6081: both_identity = (PetscBool) (row_identity && col_identity);
6083: if (!levels && both_identity) { /* special case copy the nonzero structure */
6084: MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);
6085: MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);
6087: fact->factortype = MAT_FACTOR_ILU;
6088: b = (Mat_SeqBAIJ*)fact->data;
6089: b->row = isrow;
6090: b->col = iscol;
6091: PetscObjectReference((PetscObject)isrow);
6092: PetscObjectReference((PetscObject)iscol);
6093: b->icol = isicol;
6094: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
6096: PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);
6097: return(0);
6098: }
6100: /* general case perform the symbolic factorization */
6101: ISGetIndices(isrow,&r);
6102: ISGetIndices(isicol,&ic);
6104: /* get new row pointers */
6105: PetscMalloc((n+1)*sizeof(PetscInt),&ainew);
6106: ainew[0] = 0;
6107: /* don't know how many column pointers are needed so estimate */
6108: jmax = (PetscInt)(f*ai[n] + 1);
6109: PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);
6110: /* ajfill is level of fill for each fill entry */
6111: PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);
6112: /* fill is a linked list of nonzeros in active row */
6113: PetscMalloc((n+1)*sizeof(PetscInt),&fill);
6114: /* im is level for each filled value */
6115: PetscMalloc((n+1)*sizeof(PetscInt),&im);
6116: /* dloc is location of diagonal in factor */
6117: PetscMalloc((n+1)*sizeof(PetscInt),&dloc);
6118: dloc[0] = 0;
6119: for (prow=0; prow<n; prow++) {
6121: /* copy prow into linked list */
6122: nzf = nz = ai[r[prow]+1] - ai[r[prow]];
6123: if (!nz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
6124: xi = aj + ai[r[prow]];
6125: fill[n] = n;
6126: fill[prow] = -1; /* marker for diagonal entry */
6127: while (nz--) {
6128: fm = n;
6129: idx = ic[*xi++];
6130: do {
6131: m = fm;
6132: fm = fill[m];
6133: } while (fm < idx);
6134: fill[m] = idx;
6135: fill[idx] = fm;
6136: im[idx] = 0;
6137: }
6139: /* make sure diagonal entry is included */
6140: if (diagonal_fill && fill[prow] == -1) {
6141: fm = n;
6142: while (fill[fm] < prow) fm = fill[fm];
6143: fill[prow] = fill[fm]; /* insert diagonal into linked list */
6144: fill[fm] = prow;
6145: im[prow] = 0;
6146: nzf++;
6147: dcount++;
6148: }
6150: nzi = 0;
6151: row = fill[n];
6152: while (row < prow) {
6153: incrlev = im[row] + 1;
6154: nz = dloc[row];
6155: xi = ajnew + ainew[row] + nz + 1;
6156: flev = ajfill + ainew[row] + nz + 1;
6157: nnz = ainew[row+1] - ainew[row] - nz - 1;
6158: fm = row;
6159: while (nnz-- > 0) {
6160: idx = *xi++;
6161: if (*flev + incrlev > levels) {
6162: flev++;
6163: continue;
6164: }
6165: do {
6166: m = fm;
6167: fm = fill[m];
6168: } while (fm < idx);
6169: if (fm != idx) {
6170: im[idx] = *flev + incrlev;
6171: fill[m] = idx;
6172: fill[idx] = fm;
6173: fm = idx;
6174: nzf++;
6175: } else if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
6176: flev++;
6177: }
6178: row = fill[row];
6179: nzi++;
6180: }
6181: /* copy new filled row into permanent storage */
6182: ainew[prow+1] = ainew[prow] + nzf;
6183: if (ainew[prow+1] > jmax) {
6185: /* estimate how much additional space we will need */
6186: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
6187: /* just double the memory each time */
6188: PetscInt maxadd = jmax;
6189: /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
6190: if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
6191: jmax += maxadd;
6193: /* allocate a longer ajnew and ajfill */
6194: PetscMalloc(jmax*sizeof(PetscInt),&xitmp);
6195: PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));
6196: PetscFree(ajnew);
6197: ajnew = xitmp;
6198: PetscMalloc(jmax*sizeof(PetscInt),&xitmp);
6199: PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));
6200: PetscFree(ajfill);
6201: ajfill = xitmp;
6202: reallocate++; /* count how many reallocations are needed */
6203: }
6204: xitmp = ajnew + ainew[prow];
6205: flev = ajfill + ainew[prow];
6206: dloc[prow] = nzi;
6207: fm = fill[n];
6208: while (nzf--) {
6209: *xitmp++ = fm;
6210: *flev++ = im[fm];
6211: fm = fill[fm];
6212: }
6213: /* make sure row has diagonal entry */
6214: if (ajnew[ainew[prow]+dloc[prow]] != prow) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
6215: try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
6216: }
6217: PetscFree(ajfill);
6218: ISRestoreIndices(isrow,&r);
6219: ISRestoreIndices(isicol,&ic);
6220: PetscFree(fill);
6221: PetscFree(im);
6223: #if defined(PETSC_USE_INFO)
6224: {
6225: PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
6226: PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);
6227: PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
6228: PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);
6229: PetscInfo(A,"for best performance.\n");
6230: if (diagonal_fill) {
6231: PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);
6232: }
6233: }
6234: #endif
6236: /* put together the new matrix */
6237: MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,NULL);
6238: PetscLogObjectParent(fact,isicol);
6239: b = (Mat_SeqBAIJ*)fact->data;
6241: b->free_a = PETSC_TRUE;
6242: b->free_ij = PETSC_TRUE;
6243: b->singlemalloc = PETSC_FALSE;
6245: PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);
6247: b->j = ajnew;
6248: b->i = ainew;
6249: for (i=0; i<n; i++) dloc[i] += ainew[i];
6250: b->diag = dloc;
6251: b->free_diag = PETSC_TRUE;
6252: b->ilen = 0;
6253: b->imax = 0;
6254: b->row = isrow;
6255: b->col = iscol;
6256: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
6258: PetscObjectReference((PetscObject)isrow);
6259: PetscObjectReference((PetscObject)iscol);
6260: b->icol = isicol;
6261: PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);
6262: /* In b structure: Free imax, ilen, old a, old j.
6263: Allocate dloc, solve_work, new a, new j */
6264: PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));
6265: b->maxnz = b->nz = ainew[n];
6267: fact->info.factor_mallocs = reallocate;
6268: fact->info.fill_ratio_given = f;
6269: fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
6271: MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);
6272: return(0);
6273: }
6277: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
6278: {
6279: /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; */
6280: /* int i,*AJ=a->j,nz=a->nz; */
6283: /* Undo Column scaling */
6284: /* while (nz--) { */
6285: /* AJ[i] = AJ[i]/4; */
6286: /* } */
6287: /* This should really invoke a push/pop logic, but we don't have that yet. */
6288: A->ops->setunfactored = NULL;
6289: return(0);
6290: }
6294: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
6295: {
6296: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
6297: PetscInt *AJ=a->j,nz=a->nz;
6298: unsigned short *aj=(unsigned short*)AJ;
6301: /* Is this really necessary? */
6302: while (nz--) {
6303: AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
6304: }
6305: A->ops->setunfactored = NULL;
6306: return(0);
6307: }