Actual source code: baijsolvtran.c
petsc-3.6.1 2015-08-06
1: #include <../src/mat/impls/baij/seq/baij.h>
2: #include <petsc/private/kernels/blockinvert.h>
6: PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
7: {
8: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
9: IS iscol = a->col,isrow = a->row;
10: PetscErrorCode ierr;
11: const PetscInt *rout,*cout,*r,*c,*adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
12: PetscInt i,n = a->mbs,j;
13: PetscInt nz;
14: PetscScalar *x,*tmp,s1;
15: const MatScalar *aa = a->a,*v;
16: const PetscScalar *b;
19: VecGetArrayRead(bb,&b);
20: VecGetArray(xx,&x);
21: tmp = a->solve_work;
23: ISGetIndices(isrow,&rout); r = rout;
24: ISGetIndices(iscol,&cout); c = cout;
26: /* copy the b into temp work space according to permutation */
27: for (i=0; i<n; i++) tmp[i] = b[c[i]];
29: /* forward solve the U^T */
30: for (i=0; i<n; i++) {
31: v = aa + adiag[i+1] + 1;
32: vi = aj + adiag[i+1] + 1;
33: nz = adiag[i] - adiag[i+1] - 1;
34: s1 = tmp[i];
35: s1 *= v[nz]; /* multiply by inverse of diagonal entry */
36: for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
37: tmp[i] = s1;
38: }
40: /* backward solve the L^T */
41: for (i=n-1; i>=0; i--) {
42: v = aa + ai[i];
43: vi = aj + ai[i];
44: nz = ai[i+1] - ai[i];
45: s1 = tmp[i];
46: for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
47: }
49: /* copy tmp into x according to permutation */
50: for (i=0; i<n; i++) x[r[i]] = tmp[i];
52: ISRestoreIndices(isrow,&rout);
53: ISRestoreIndices(iscol,&cout);
54: VecRestoreArrayRead(bb,&b);
55: VecRestoreArray(xx,&x);
57: PetscLogFlops(2.0*a->nz-A->cmap->n);
58: return(0);
59: }
63: PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
64: {
65: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
66: IS iscol=a->col,isrow=a->row;
67: PetscErrorCode ierr;
68: const PetscInt *r,*c,*rout,*cout;
69: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
70: PetscInt i,nz;
71: const MatScalar *aa=a->a,*v;
72: PetscScalar s1,*x,*t;
73: const PetscScalar *b;
76: VecGetArrayRead(bb,&b);
77: VecGetArray(xx,&x);
78: t = a->solve_work;
80: ISGetIndices(isrow,&rout); r = rout;
81: ISGetIndices(iscol,&cout); c = cout;
83: /* copy the b into temp work space according to permutation */
84: for (i=0; i<n; i++) t[i] = b[c[i]];
86: /* forward solve the U^T */
87: for (i=0; i<n; i++) {
89: v = aa + diag[i];
90: /* multiply by the inverse of the block diagonal */
91: s1 = (*v++)*t[i];
92: vi = aj + diag[i] + 1;
93: nz = ai[i+1] - diag[i] - 1;
94: while (nz--) {
95: t[*vi++] -= (*v++)*s1;
96: }
97: t[i] = s1;
98: }
99: /* backward solve the L^T */
100: for (i=n-1; i>=0; i--) {
101: v = aa + diag[i] - 1;
102: vi = aj + diag[i] - 1;
103: nz = diag[i] - ai[i];
104: s1 = t[i];
105: while (nz--) {
106: t[*vi--] -= (*v--)*s1;
107: }
108: }
110: /* copy t into x according to permutation */
111: for (i=0; i<n; i++) x[r[i]] = t[i];
113: ISRestoreIndices(isrow,&rout);
114: ISRestoreIndices(iscol,&cout);
115: VecRestoreArrayRead(bb,&b);
116: VecRestoreArray(xx,&x);
117: PetscLogFlops(2.0*(a->nz) - A->cmap->n);
118: return(0);
119: }
123: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
124: {
125: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
126: IS iscol=a->col,isrow=a->row;
127: PetscErrorCode ierr;
128: const PetscInt *r,*c,*rout,*cout;
129: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
130: PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
131: const MatScalar *aa=a->a,*v;
132: PetscScalar s1,s2,x1,x2,*x,*t;
133: const PetscScalar *b;
136: VecGetArrayRead(bb,&b);
137: VecGetArray(xx,&x);
138: t = a->solve_work;
140: ISGetIndices(isrow,&rout); r = rout;
141: ISGetIndices(iscol,&cout); c = cout;
143: /* copy the b into temp work space according to permutation */
144: ii = 0;
145: for (i=0; i<n; i++) {
146: ic = 2*c[i];
147: t[ii] = b[ic];
148: t[ii+1] = b[ic+1];
149: ii += 2;
150: }
152: /* forward solve the U^T */
153: idx = 0;
154: for (i=0; i<n; i++) {
156: v = aa + 4*diag[i];
157: /* multiply by the inverse of the block diagonal */
158: x1 = t[idx]; x2 = t[1+idx];
159: s1 = v[0]*x1 + v[1]*x2;
160: s2 = v[2]*x1 + v[3]*x2;
161: v += 4;
163: vi = aj + diag[i] + 1;
164: nz = ai[i+1] - diag[i] - 1;
165: while (nz--) {
166: oidx = 2*(*vi++);
167: t[oidx] -= v[0]*s1 + v[1]*s2;
168: t[oidx+1] -= v[2]*s1 + v[3]*s2;
169: v += 4;
170: }
171: t[idx] = s1;t[1+idx] = s2;
172: idx += 2;
173: }
174: /* backward solve the L^T */
175: for (i=n-1; i>=0; i--) {
176: v = aa + 4*diag[i] - 4;
177: vi = aj + diag[i] - 1;
178: nz = diag[i] - ai[i];
179: idt = 2*i;
180: s1 = t[idt]; s2 = t[1+idt];
181: while (nz--) {
182: idx = 2*(*vi--);
183: t[idx] -= v[0]*s1 + v[1]*s2;
184: t[idx+1] -= v[2]*s1 + v[3]*s2;
185: v -= 4;
186: }
187: }
189: /* copy t into x according to permutation */
190: ii = 0;
191: for (i=0; i<n; i++) {
192: ir = 2*r[i];
193: x[ir] = t[ii];
194: x[ir+1] = t[ii+1];
195: ii += 2;
196: }
198: ISRestoreIndices(isrow,&rout);
199: ISRestoreIndices(iscol,&cout);
200: VecRestoreArrayRead(bb,&b);
201: VecRestoreArray(xx,&x);
202: PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
203: return(0);
204: }
208: PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
209: {
210: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
211: PetscErrorCode ierr;
212: IS iscol=a->col,isrow=a->row;
213: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
214: const PetscInt *r,*c,*rout,*cout;
215: PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
216: const PetscInt bs =A->rmap->bs,bs2=a->bs2;
217: const MatScalar *aa=a->a,*v;
218: PetscScalar s1,s2,x1,x2,*x,*t;
219: const PetscScalar *b;
222: VecGetArrayRead(bb,&b);
223: VecGetArray(xx,&x);
224: t = a->solve_work;
226: ISGetIndices(isrow,&rout); r = rout;
227: ISGetIndices(iscol,&cout); c = cout;
229: /* copy b into temp work space according to permutation */
230: for (i=0; i<n; i++) {
231: ii = bs*i; ic = bs*c[i];
232: t[ii] = b[ic]; t[ii+1] = b[ic+1];
233: }
235: /* forward solve the U^T */
236: idx = 0;
237: for (i=0; i<n; i++) {
238: v = aa + bs2*diag[i];
239: /* multiply by the inverse of the block diagonal */
240: x1 = t[idx]; x2 = t[1+idx];
241: s1 = v[0]*x1 + v[1]*x2;
242: s2 = v[2]*x1 + v[3]*x2;
243: v -= bs2;
245: vi = aj + diag[i] - 1;
246: nz = diag[i] - diag[i+1] - 1;
247: for (j=0; j>-nz; j--) {
248: oidx = bs*vi[j];
249: t[oidx] -= v[0]*s1 + v[1]*s2;
250: t[oidx+1] -= v[2]*s1 + v[3]*s2;
251: v -= bs2;
252: }
253: t[idx] = s1;t[1+idx] = s2;
254: idx += bs;
255: }
256: /* backward solve the L^T */
257: for (i=n-1; i>=0; i--) {
258: v = aa + bs2*ai[i];
259: vi = aj + ai[i];
260: nz = ai[i+1] - ai[i];
261: idt = bs*i;
262: s1 = t[idt]; s2 = t[1+idt];
263: for (j=0; j<nz; j++) {
264: idx = bs*vi[j];
265: t[idx] -= v[0]*s1 + v[1]*s2;
266: t[idx+1] -= v[2]*s1 + v[3]*s2;
267: v += bs2;
268: }
269: }
271: /* copy t into x according to permutation */
272: for (i=0; i<n; i++) {
273: ii = bs*i; ir = bs*r[i];
274: x[ir] = t[ii]; x[ir+1] = t[ii+1];
275: }
277: ISRestoreIndices(isrow,&rout);
278: ISRestoreIndices(iscol,&cout);
279: VecRestoreArrayRead(bb,&b);
280: VecRestoreArray(xx,&x);
281: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
282: return(0);
283: }
287: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
288: {
289: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
290: IS iscol=a->col,isrow=a->row;
291: PetscErrorCode ierr;
292: const PetscInt *r,*c,*rout,*cout;
293: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
294: PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
295: const MatScalar *aa=a->a,*v;
296: PetscScalar s1,s2,s3,x1,x2,x3,*x,*t;
297: const PetscScalar *b;
300: VecGetArrayRead(bb,&b);
301: VecGetArray(xx,&x);
302: t = a->solve_work;
304: ISGetIndices(isrow,&rout); r = rout;
305: ISGetIndices(iscol,&cout); c = cout;
307: /* copy the b into temp work space according to permutation */
308: ii = 0;
309: for (i=0; i<n; i++) {
310: ic = 3*c[i];
311: t[ii] = b[ic];
312: t[ii+1] = b[ic+1];
313: t[ii+2] = b[ic+2];
314: ii += 3;
315: }
317: /* forward solve the U^T */
318: idx = 0;
319: for (i=0; i<n; i++) {
321: v = aa + 9*diag[i];
322: /* multiply by the inverse of the block diagonal */
323: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
324: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
325: s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
326: s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
327: v += 9;
329: vi = aj + diag[i] + 1;
330: nz = ai[i+1] - diag[i] - 1;
331: while (nz--) {
332: oidx = 3*(*vi++);
333: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
334: t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
335: t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
336: v += 9;
337: }
338: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;
339: idx += 3;
340: }
341: /* backward solve the L^T */
342: for (i=n-1; i>=0; i--) {
343: v = aa + 9*diag[i] - 9;
344: vi = aj + diag[i] - 1;
345: nz = diag[i] - ai[i];
346: idt = 3*i;
347: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
348: while (nz--) {
349: idx = 3*(*vi--);
350: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
351: t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
352: t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
353: v -= 9;
354: }
355: }
357: /* copy t into x according to permutation */
358: ii = 0;
359: for (i=0; i<n; i++) {
360: ir = 3*r[i];
361: x[ir] = t[ii];
362: x[ir+1] = t[ii+1];
363: x[ir+2] = t[ii+2];
364: ii += 3;
365: }
367: ISRestoreIndices(isrow,&rout);
368: ISRestoreIndices(iscol,&cout);
369: VecRestoreArrayRead(bb,&b);
370: VecRestoreArray(xx,&x);
371: PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
372: return(0);
373: }
377: PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
378: {
379: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
380: PetscErrorCode ierr;
381: IS iscol=a->col,isrow=a->row;
382: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
383: const PetscInt *r,*c,*rout,*cout;
384: PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
385: const PetscInt bs =A->rmap->bs,bs2=a->bs2;
386: const MatScalar *aa=a->a,*v;
387: PetscScalar s1,s2,s3,x1,x2,x3,*x,*t;
388: const PetscScalar *b;
391: VecGetArrayRead(bb,&b);
392: VecGetArray(xx,&x);
393: t = a->solve_work;
395: ISGetIndices(isrow,&rout); r = rout;
396: ISGetIndices(iscol,&cout); c = cout;
398: /* copy b into temp work space according to permutation */
399: for (i=0; i<n; i++) {
400: ii = bs*i; ic = bs*c[i];
401: t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2];
402: }
404: /* forward solve the U^T */
405: idx = 0;
406: for (i=0; i<n; i++) {
407: v = aa + bs2*diag[i];
408: /* multiply by the inverse of the block diagonal */
409: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
410: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
411: s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
412: s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
413: v -= bs2;
415: vi = aj + diag[i] - 1;
416: nz = diag[i] - diag[i+1] - 1;
417: for (j=0; j>-nz; j--) {
418: oidx = bs*vi[j];
419: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
420: t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
421: t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
422: v -= bs2;
423: }
424: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;
425: idx += bs;
426: }
427: /* backward solve the L^T */
428: for (i=n-1; i>=0; i--) {
429: v = aa + bs2*ai[i];
430: vi = aj + ai[i];
431: nz = ai[i+1] - ai[i];
432: idt = bs*i;
433: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
434: for (j=0; j<nz; j++) {
435: idx = bs*vi[j];
436: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
437: t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
438: t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
439: v += bs2;
440: }
441: }
443: /* copy t into x according to permutation */
444: for (i=0; i<n; i++) {
445: ii = bs*i; ir = bs*r[i];
446: x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];
447: }
449: ISRestoreIndices(isrow,&rout);
450: ISRestoreIndices(iscol,&cout);
451: VecRestoreArrayRead(bb,&b);
452: VecRestoreArray(xx,&x);
453: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
454: return(0);
455: }
459: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
460: {
461: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
462: IS iscol=a->col,isrow=a->row;
463: PetscErrorCode ierr;
464: const PetscInt *r,*c,*rout,*cout;
465: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
466: PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
467: const MatScalar *aa=a->a,*v;
468: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
469: const PetscScalar *b;
472: VecGetArrayRead(bb,&b);
473: VecGetArray(xx,&x);
474: t = a->solve_work;
476: ISGetIndices(isrow,&rout); r = rout;
477: ISGetIndices(iscol,&cout); c = cout;
479: /* copy the b into temp work space according to permutation */
480: ii = 0;
481: for (i=0; i<n; i++) {
482: ic = 4*c[i];
483: t[ii] = b[ic];
484: t[ii+1] = b[ic+1];
485: t[ii+2] = b[ic+2];
486: t[ii+3] = b[ic+3];
487: ii += 4;
488: }
490: /* forward solve the U^T */
491: idx = 0;
492: for (i=0; i<n; i++) {
494: v = aa + 16*diag[i];
495: /* multiply by the inverse of the block diagonal */
496: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx];
497: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
498: s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
499: s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
500: s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
501: v += 16;
503: vi = aj + diag[i] + 1;
504: nz = ai[i+1] - diag[i] - 1;
505: while (nz--) {
506: oidx = 4*(*vi++);
507: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
508: t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
509: t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
510: t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
511: v += 16;
512: }
513: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
514: idx += 4;
515: }
516: /* backward solve the L^T */
517: for (i=n-1; i>=0; i--) {
518: v = aa + 16*diag[i] - 16;
519: vi = aj + diag[i] - 1;
520: nz = diag[i] - ai[i];
521: idt = 4*i;
522: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
523: while (nz--) {
524: idx = 4*(*vi--);
525: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
526: t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
527: t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
528: t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
529: v -= 16;
530: }
531: }
533: /* copy t into x according to permutation */
534: ii = 0;
535: for (i=0; i<n; i++) {
536: ir = 4*r[i];
537: x[ir] = t[ii];
538: x[ir+1] = t[ii+1];
539: x[ir+2] = t[ii+2];
540: x[ir+3] = t[ii+3];
541: ii += 4;
542: }
544: ISRestoreIndices(isrow,&rout);
545: ISRestoreIndices(iscol,&cout);
546: VecRestoreArrayRead(bb,&b);
547: VecRestoreArray(xx,&x);
548: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
549: return(0);
550: }
554: PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
555: {
556: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
557: PetscErrorCode ierr;
558: IS iscol=a->col,isrow=a->row;
559: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
560: const PetscInt *r,*c,*rout,*cout;
561: PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
562: const PetscInt bs =A->rmap->bs,bs2=a->bs2;
563: const MatScalar *aa=a->a,*v;
564: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
565: const PetscScalar *b;
568: VecGetArrayRead(bb,&b);
569: VecGetArray(xx,&x);
570: t = a->solve_work;
572: ISGetIndices(isrow,&rout); r = rout;
573: ISGetIndices(iscol,&cout); c = cout;
575: /* copy b into temp work space according to permutation */
576: for (i=0; i<n; i++) {
577: ii = bs*i; ic = bs*c[i];
578: t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
579: }
581: /* forward solve the U^T */
582: idx = 0;
583: for (i=0; i<n; i++) {
584: v = aa + bs2*diag[i];
585: /* multiply by the inverse of the block diagonal */
586: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx];
587: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
588: s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
589: s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
590: s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
591: v -= bs2;
593: vi = aj + diag[i] - 1;
594: nz = diag[i] - diag[i+1] - 1;
595: for (j=0; j>-nz; j--) {
596: oidx = bs*vi[j];
597: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
598: t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
599: t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
600: t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
601: v -= bs2;
602: }
603: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4;
604: idx += bs;
605: }
606: /* backward solve the L^T */
607: for (i=n-1; i>=0; i--) {
608: v = aa + bs2*ai[i];
609: vi = aj + ai[i];
610: nz = ai[i+1] - ai[i];
611: idt = bs*i;
612: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt];
613: for (j=0; j<nz; j++) {
614: idx = bs*vi[j];
615: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
616: t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
617: t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
618: t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
619: v += bs2;
620: }
621: }
623: /* copy t into x according to permutation */
624: for (i=0; i<n; i++) {
625: ii = bs*i; ir = bs*r[i];
626: x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3];
627: }
629: ISRestoreIndices(isrow,&rout);
630: ISRestoreIndices(iscol,&cout);
631: VecRestoreArrayRead(bb,&b);
632: VecRestoreArray(xx,&x);
633: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
634: return(0);
635: }
639: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
640: {
641: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
642: IS iscol=a->col,isrow=a->row;
643: PetscErrorCode ierr;
644: const PetscInt *r,*c,*rout,*cout;
645: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
646: PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
647: const MatScalar *aa=a->a,*v;
648: PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
649: const PetscScalar *b;
652: VecGetArrayRead(bb,&b);
653: VecGetArray(xx,&x);
654: t = a->solve_work;
656: ISGetIndices(isrow,&rout); r = rout;
657: ISGetIndices(iscol,&cout); c = cout;
659: /* copy the b into temp work space according to permutation */
660: ii = 0;
661: for (i=0; i<n; i++) {
662: ic = 5*c[i];
663: t[ii] = b[ic];
664: t[ii+1] = b[ic+1];
665: t[ii+2] = b[ic+2];
666: t[ii+3] = b[ic+3];
667: t[ii+4] = b[ic+4];
668: ii += 5;
669: }
671: /* forward solve the U^T */
672: idx = 0;
673: for (i=0; i<n; i++) {
675: v = aa + 25*diag[i];
676: /* multiply by the inverse of the block diagonal */
677: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
678: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
679: s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
680: s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
681: s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
682: s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
683: v += 25;
685: vi = aj + diag[i] + 1;
686: nz = ai[i+1] - diag[i] - 1;
687: while (nz--) {
688: oidx = 5*(*vi++);
689: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
690: t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
691: t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
692: t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
693: t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
694: v += 25;
695: }
696: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
697: idx += 5;
698: }
699: /* backward solve the L^T */
700: for (i=n-1; i>=0; i--) {
701: v = aa + 25*diag[i] - 25;
702: vi = aj + diag[i] - 1;
703: nz = diag[i] - ai[i];
704: idt = 5*i;
705: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
706: while (nz--) {
707: idx = 5*(*vi--);
708: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
709: t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
710: t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
711: t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
712: t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
713: v -= 25;
714: }
715: }
717: /* copy t into x according to permutation */
718: ii = 0;
719: for (i=0; i<n; i++) {
720: ir = 5*r[i];
721: x[ir] = t[ii];
722: x[ir+1] = t[ii+1];
723: x[ir+2] = t[ii+2];
724: x[ir+3] = t[ii+3];
725: x[ir+4] = t[ii+4];
726: ii += 5;
727: }
729: ISRestoreIndices(isrow,&rout);
730: ISRestoreIndices(iscol,&cout);
731: VecRestoreArrayRead(bb,&b);
732: VecRestoreArray(xx,&x);
733: PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
734: return(0);
735: }
739: PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
740: {
741: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
742: PetscErrorCode ierr;
743: IS iscol=a->col,isrow=a->row;
744: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
745: const PetscInt *r,*c,*rout,*cout;
746: PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
747: const PetscInt bs =A->rmap->bs,bs2=a->bs2;
748: const MatScalar *aa=a->a,*v;
749: PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
750: const PetscScalar *b;
753: VecGetArrayRead(bb,&b);
754: VecGetArray(xx,&x);
755: t = a->solve_work;
757: ISGetIndices(isrow,&rout); r = rout;
758: ISGetIndices(iscol,&cout); c = cout;
760: /* copy b into temp work space according to permutation */
761: for (i=0; i<n; i++) {
762: ii = bs*i; ic = bs*c[i];
763: t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
764: t[ii+4] = b[ic+4];
765: }
767: /* forward solve the U^T */
768: idx = 0;
769: for (i=0; i<n; i++) {
770: v = aa + bs2*diag[i];
771: /* multiply by the inverse of the block diagonal */
772: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
773: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
774: s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
775: s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
776: s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
777: s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
778: v -= bs2;
780: vi = aj + diag[i] - 1;
781: nz = diag[i] - diag[i+1] - 1;
782: for (j=0; j>-nz; j--) {
783: oidx = bs*vi[j];
784: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
785: t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
786: t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
787: t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
788: t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
789: v -= bs2;
790: }
791: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5;
792: idx += bs;
793: }
794: /* backward solve the L^T */
795: for (i=n-1; i>=0; i--) {
796: v = aa + bs2*ai[i];
797: vi = aj + ai[i];
798: nz = ai[i+1] - ai[i];
799: idt = bs*i;
800: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt];
801: for (j=0; j<nz; j++) {
802: idx = bs*vi[j];
803: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
804: t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
805: t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
806: t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
807: t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
808: v += bs2;
809: }
810: }
812: /* copy t into x according to permutation */
813: for (i=0; i<n; i++) {
814: ii = bs*i; ir = bs*r[i];
815: x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3];
816: x[ir+4] = t[ii+4];
817: }
819: ISRestoreIndices(isrow,&rout);
820: ISRestoreIndices(iscol,&cout);
821: VecRestoreArrayRead(bb,&b);
822: VecRestoreArray(xx,&x);
823: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
824: return(0);
825: }
829: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
830: {
831: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
832: IS iscol=a->col,isrow=a->row;
833: PetscErrorCode ierr;
834: const PetscInt *r,*c,*rout,*cout;
835: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
836: PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
837: const MatScalar *aa=a->a,*v;
838: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
839: const PetscScalar *b;
842: VecGetArrayRead(bb,&b);
843: VecGetArray(xx,&x);
844: t = a->solve_work;
846: ISGetIndices(isrow,&rout); r = rout;
847: ISGetIndices(iscol,&cout); c = cout;
849: /* copy the b into temp work space according to permutation */
850: ii = 0;
851: for (i=0; i<n; i++) {
852: ic = 6*c[i];
853: t[ii] = b[ic];
854: t[ii+1] = b[ic+1];
855: t[ii+2] = b[ic+2];
856: t[ii+3] = b[ic+3];
857: t[ii+4] = b[ic+4];
858: t[ii+5] = b[ic+5];
859: ii += 6;
860: }
862: /* forward solve the U^T */
863: idx = 0;
864: for (i=0; i<n; i++) {
866: v = aa + 36*diag[i];
867: /* multiply by the inverse of the block diagonal */
868: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
869: x6 = t[5+idx];
870: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
871: s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
872: s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
873: s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
874: s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
875: s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
876: v += 36;
878: vi = aj + diag[i] + 1;
879: nz = ai[i+1] - diag[i] - 1;
880: while (nz--) {
881: oidx = 6*(*vi++);
882: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
883: t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
884: t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
885: t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
886: t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
887: t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
888: v += 36;
889: }
890: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
891: t[5+idx] = s6;
892: idx += 6;
893: }
894: /* backward solve the L^T */
895: for (i=n-1; i>=0; i--) {
896: v = aa + 36*diag[i] - 36;
897: vi = aj + diag[i] - 1;
898: nz = diag[i] - ai[i];
899: idt = 6*i;
900: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
901: s6 = t[5+idt];
902: while (nz--) {
903: idx = 6*(*vi--);
904: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
905: t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
906: t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
907: t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
908: t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
909: t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
910: v -= 36;
911: }
912: }
914: /* copy t into x according to permutation */
915: ii = 0;
916: for (i=0; i<n; i++) {
917: ir = 6*r[i];
918: x[ir] = t[ii];
919: x[ir+1] = t[ii+1];
920: x[ir+2] = t[ii+2];
921: x[ir+3] = t[ii+3];
922: x[ir+4] = t[ii+4];
923: x[ir+5] = t[ii+5];
924: ii += 6;
925: }
927: ISRestoreIndices(isrow,&rout);
928: ISRestoreIndices(iscol,&cout);
929: VecRestoreArrayRead(bb,&b);
930: VecRestoreArray(xx,&x);
931: PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
932: return(0);
933: }
937: PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
938: {
939: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
940: PetscErrorCode ierr;
941: IS iscol=a->col,isrow=a->row;
942: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
943: const PetscInt *r,*c,*rout,*cout;
944: PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
945: const PetscInt bs =A->rmap->bs,bs2=a->bs2;
946: const MatScalar *aa=a->a,*v;
947: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
948: const PetscScalar *b;
951: VecGetArrayRead(bb,&b);
952: VecGetArray(xx,&x);
953: t = a->solve_work;
955: ISGetIndices(isrow,&rout); r = rout;
956: ISGetIndices(iscol,&cout); c = cout;
958: /* copy b into temp work space according to permutation */
959: for (i=0; i<n; i++) {
960: ii = bs*i; ic = bs*c[i];
961: t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
962: t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5];
963: }
965: /* forward solve the U^T */
966: idx = 0;
967: for (i=0; i<n; i++) {
968: v = aa + bs2*diag[i];
969: /* multiply by the inverse of the block diagonal */
970: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
971: x6 = t[5+idx];
972: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
973: s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
974: s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
975: s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
976: s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
977: s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
978: v -= bs2;
980: vi = aj + diag[i] - 1;
981: nz = diag[i] - diag[i+1] - 1;
982: for (j=0; j>-nz; j--) {
983: oidx = bs*vi[j];
984: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
985: t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
986: t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
987: t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
988: t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
989: t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
990: v -= bs2;
991: }
992: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5;
993: t[5+idx] = s6;
994: idx += bs;
995: }
996: /* backward solve the L^T */
997: for (i=n-1; i>=0; i--) {
998: v = aa + bs2*ai[i];
999: vi = aj + ai[i];
1000: nz = ai[i+1] - ai[i];
1001: idt = bs*i;
1002: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt];
1003: s6 = t[5+idt];
1004: for (j=0; j<nz; j++) {
1005: idx = bs*vi[j];
1006: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
1007: t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
1008: t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1009: t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1010: t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1011: t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1012: v += bs2;
1013: }
1014: }
1016: /* copy t into x according to permutation */
1017: for (i=0; i<n; i++) {
1018: ii = bs*i; ir = bs*r[i];
1019: x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3];
1020: x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5];
1021: }
1023: ISRestoreIndices(isrow,&rout);
1024: ISRestoreIndices(iscol,&cout);
1025: VecRestoreArrayRead(bb,&b);
1026: VecRestoreArray(xx,&x);
1027: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1028: return(0);
1029: }
1033: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
1034: {
1035: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
1036: IS iscol=a->col,isrow=a->row;
1037: PetscErrorCode ierr;
1038: const PetscInt *r,*c,*rout,*cout;
1039: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1040: PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
1041: const MatScalar *aa=a->a,*v;
1042: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1043: const PetscScalar *b;
1046: VecGetArrayRead(bb,&b);
1047: VecGetArray(xx,&x);
1048: t = a->solve_work;
1050: ISGetIndices(isrow,&rout); r = rout;
1051: ISGetIndices(iscol,&cout); c = cout;
1053: /* copy the b into temp work space according to permutation */
1054: ii = 0;
1055: for (i=0; i<n; i++) {
1056: ic = 7*c[i];
1057: t[ii] = b[ic];
1058: t[ii+1] = b[ic+1];
1059: t[ii+2] = b[ic+2];
1060: t[ii+3] = b[ic+3];
1061: t[ii+4] = b[ic+4];
1062: t[ii+5] = b[ic+5];
1063: t[ii+6] = b[ic+6];
1064: ii += 7;
1065: }
1067: /* forward solve the U^T */
1068: idx = 0;
1069: for (i=0; i<n; i++) {
1071: v = aa + 49*diag[i];
1072: /* multiply by the inverse of the block diagonal */
1073: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1074: x6 = t[5+idx]; x7 = t[6+idx];
1075: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
1076: s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1077: s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1078: s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1079: s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1080: s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1081: s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1082: v += 49;
1084: vi = aj + diag[i] + 1;
1085: nz = ai[i+1] - diag[i] - 1;
1086: while (nz--) {
1087: oidx = 7*(*vi++);
1088: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
1089: t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1090: t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1091: t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1092: t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1093: t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1094: t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1095: v += 49;
1096: }
1097: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1098: t[5+idx] = s6;t[6+idx] = s7;
1099: idx += 7;
1100: }
1101: /* backward solve the L^T */
1102: for (i=n-1; i>=0; i--) {
1103: v = aa + 49*diag[i] - 49;
1104: vi = aj + diag[i] - 1;
1105: nz = diag[i] - ai[i];
1106: idt = 7*i;
1107: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1108: s6 = t[5+idt];s7 = t[6+idt];
1109: while (nz--) {
1110: idx = 7*(*vi--);
1111: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
1112: t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1113: t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1114: t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1115: t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1116: t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1117: t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1118: v -= 49;
1119: }
1120: }
1122: /* copy t into x according to permutation */
1123: ii = 0;
1124: for (i=0; i<n; i++) {
1125: ir = 7*r[i];
1126: x[ir] = t[ii];
1127: x[ir+1] = t[ii+1];
1128: x[ir+2] = t[ii+2];
1129: x[ir+3] = t[ii+3];
1130: x[ir+4] = t[ii+4];
1131: x[ir+5] = t[ii+5];
1132: x[ir+6] = t[ii+6];
1133: ii += 7;
1134: }
1136: ISRestoreIndices(isrow,&rout);
1137: ISRestoreIndices(iscol,&cout);
1138: VecRestoreArrayRead(bb,&b);
1139: VecRestoreArray(xx,&x);
1140: PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
1141: return(0);
1142: }
1145: PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1146: {
1147: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
1148: PetscErrorCode ierr;
1149: IS iscol=a->col,isrow=a->row;
1150: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1151: const PetscInt *r,*c,*rout,*cout;
1152: PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
1153: const PetscInt bs =A->rmap->bs,bs2=a->bs2;
1154: const MatScalar *aa=a->a,*v;
1155: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1156: const PetscScalar *b;
1159: VecGetArrayRead(bb,&b);
1160: VecGetArray(xx,&x);
1161: t = a->solve_work;
1163: ISGetIndices(isrow,&rout); r = rout;
1164: ISGetIndices(iscol,&cout); c = cout;
1166: /* copy b into temp work space according to permutation */
1167: for (i=0; i<n; i++) {
1168: ii = bs*i; ic = bs*c[i];
1169: t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1170: t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; t[ii+6] = b[ic+6];
1171: }
1173: /* forward solve the U^T */
1174: idx = 0;
1175: for (i=0; i<n; i++) {
1176: v = aa + bs2*diag[i];
1177: /* multiply by the inverse of the block diagonal */
1178: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1179: x6 = t[5+idx]; x7 = t[6+idx];
1180: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
1181: s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1182: s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1183: s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1184: s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1185: s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1186: s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1187: v -= bs2;
1189: vi = aj + diag[i] - 1;
1190: nz = diag[i] - diag[i+1] - 1;
1191: for (j=0; j>-nz; j--) {
1192: oidx = bs*vi[j];
1193: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
1194: t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1195: t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1196: t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1197: t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1198: t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1199: t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1200: v -= bs2;
1201: }
1202: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5;
1203: t[5+idx] = s6; t[6+idx] = s7;
1204: idx += bs;
1205: }
1206: /* backward solve the L^T */
1207: for (i=n-1; i>=0; i--) {
1208: v = aa + bs2*ai[i];
1209: vi = aj + ai[i];
1210: nz = ai[i+1] - ai[i];
1211: idt = bs*i;
1212: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt];
1213: s6 = t[5+idt]; s7 = t[6+idt];
1214: for (j=0; j<nz; j++) {
1215: idx = bs*vi[j];
1216: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
1217: t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1218: t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1219: t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1220: t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1221: t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1222: t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1223: v += bs2;
1224: }
1225: }
1227: /* copy t into x according to permutation */
1228: for (i=0; i<n; i++) {
1229: ii = bs*i; ir = bs*r[i];
1230: x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3];
1231: x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; x[ir+6] = t[ii+6];
1232: }
1234: ISRestoreIndices(isrow,&rout);
1235: ISRestoreIndices(iscol,&cout);
1236: VecRestoreArrayRead(bb,&b);
1237: VecRestoreArray(xx,&x);
1238: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1239: return(0);
1240: }
1242: /* ----------------------------------------------------------- */
1245: PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
1246: {
1247: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
1248: IS iscol=a->col,isrow=a->row;
1249: PetscErrorCode ierr;
1250: const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
1251: PetscInt i,nz,j;
1252: const PetscInt n =a->mbs,bs=A->rmap->bs,bs2=a->bs2;
1253: const MatScalar *aa=a->a,*v;
1254: PetscScalar *x,*t,*ls;
1255: const PetscScalar *b;
1258: VecGetArrayRead(bb,&b);
1259: VecGetArray(xx,&x);
1260: t = a->solve_work;
1262: ISGetIndices(isrow,&rout); r = rout;
1263: ISGetIndices(iscol,&cout); c = cout;
1265: /* copy the b into temp work space according to permutation */
1266: for (i=0; i<n; i++) {
1267: for (j=0; j<bs; j++) {
1268: t[i*bs+j] = b[c[i]*bs+j];
1269: }
1270: }
1273: /* forward solve the upper triangular transpose */
1274: ls = a->solve_work + A->cmap->n;
1275: for (i=0; i<n; i++) {
1276: PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
1277: PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1278: v = aa + bs2*(a->diag[i] + 1);
1279: vi = aj + a->diag[i] + 1;
1280: nz = ai[i+1] - a->diag[i] - 1;
1281: while (nz--) {
1282: PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
1283: v += bs2;
1284: }
1285: }
1287: /* backward solve the lower triangular transpose */
1288: for (i=n-1; i>=0; i--) {
1289: v = aa + bs2*ai[i];
1290: vi = aj + ai[i];
1291: nz = a->diag[i] - ai[i];
1292: while (nz--) {
1293: PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
1294: v += bs2;
1295: }
1296: }
1298: /* copy t into x according to permutation */
1299: for (i=0; i<n; i++) {
1300: for (j=0; j<bs; j++) {
1301: x[bs*r[i]+j] = t[bs*i+j];
1302: }
1303: }
1305: ISRestoreIndices(isrow,&rout);
1306: ISRestoreIndices(iscol,&cout);
1307: VecRestoreArrayRead(bb,&b);
1308: VecRestoreArray(xx,&x);
1309: PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
1310: return(0);
1311: }
1315: PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1316: {
1317: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
1318: IS iscol=a->col,isrow=a->row;
1319: PetscErrorCode ierr;
1320: const PetscInt *r,*c,*rout,*cout;
1321: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag;
1322: PetscInt i,j,nz;
1323: const PetscInt bs =A->rmap->bs,bs2=a->bs2;
1324: const MatScalar *aa=a->a,*v;
1325: PetscScalar *x,*t,*ls;
1326: const PetscScalar *b;
1329: VecGetArrayRead(bb,&b);
1330: VecGetArray(xx,&x);
1331: t = a->solve_work;
1333: ISGetIndices(isrow,&rout); r = rout;
1334: ISGetIndices(iscol,&cout); c = cout;
1336: /* copy the b into temp work space according to permutation */
1337: for (i=0; i<n; i++) {
1338: for (j=0; j<bs; j++) {
1339: t[i*bs+j] = b[c[i]*bs+j];
1340: }
1341: }
1344: /* forward solve the upper triangular transpose */
1345: ls = a->solve_work + A->cmap->n;
1346: for (i=0; i<n; i++) {
1347: PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
1348: PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs);
1349: v = aa + bs2*(diag[i] - 1);
1350: vi = aj + diag[i] - 1;
1351: nz = diag[i] - diag[i+1] - 1;
1352: for (j=0; j>-nz; j--) {
1353: PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
1354: v -= bs2;
1355: }
1356: }
1358: /* backward solve the lower triangular transpose */
1359: for (i=n-1; i>=0; i--) {
1360: v = aa + bs2*ai[i];
1361: vi = aj + ai[i];
1362: nz = ai[i+1] - ai[i];
1363: for (j=0; j<nz; j++) {
1364: PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
1365: v += bs2;
1366: }
1367: }
1369: /* copy t into x according to permutation */
1370: for (i=0; i<n; i++) {
1371: for (j=0; j<bs; j++) {
1372: x[bs*r[i]+j] = t[bs*i+j];
1373: }
1374: }
1376: ISRestoreIndices(isrow,&rout);
1377: ISRestoreIndices(iscol,&cout);
1378: VecRestoreArrayRead(bb,&b);
1379: VecRestoreArray(xx,&x);
1380: PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
1381: return(0);
1382: }