Actual source code: baijsolvtran.c
petsc-3.8.4 2018-03-24
1: #include <../src/mat/impls/baij/seq/baij.h>
2: #include <petsc/private/kernels/blockinvert.h>
4: PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
5: {
6: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
7: IS iscol = a->col,isrow = a->row;
8: PetscErrorCode ierr;
9: const PetscInt *rout,*cout,*r,*c,*adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
10: PetscInt i,n = a->mbs,j;
11: PetscInt nz;
12: PetscScalar *x,*tmp,s1;
13: const MatScalar *aa = a->a,*v;
14: const PetscScalar *b;
17: VecGetArrayRead(bb,&b);
18: VecGetArray(xx,&x);
19: tmp = a->solve_work;
21: ISGetIndices(isrow,&rout); r = rout;
22: ISGetIndices(iscol,&cout); c = cout;
24: /* copy the b into temp work space according to permutation */
25: for (i=0; i<n; i++) tmp[i] = b[c[i]];
27: /* forward solve the U^T */
28: for (i=0; i<n; i++) {
29: v = aa + adiag[i+1] + 1;
30: vi = aj + adiag[i+1] + 1;
31: nz = adiag[i] - adiag[i+1] - 1;
32: s1 = tmp[i];
33: s1 *= v[nz]; /* multiply by inverse of diagonal entry */
34: for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
35: tmp[i] = s1;
36: }
38: /* backward solve the L^T */
39: for (i=n-1; i>=0; i--) {
40: v = aa + ai[i];
41: vi = aj + ai[i];
42: nz = ai[i+1] - ai[i];
43: s1 = tmp[i];
44: for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
45: }
47: /* copy tmp into x according to permutation */
48: for (i=0; i<n; i++) x[r[i]] = tmp[i];
50: ISRestoreIndices(isrow,&rout);
51: ISRestoreIndices(iscol,&cout);
52: VecRestoreArrayRead(bb,&b);
53: VecRestoreArray(xx,&x);
55: PetscLogFlops(2.0*a->nz-A->cmap->n);
56: return(0);
57: }
59: PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
60: {
61: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
62: IS iscol=a->col,isrow=a->row;
63: PetscErrorCode ierr;
64: const PetscInt *r,*c,*rout,*cout;
65: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
66: PetscInt i,nz;
67: const MatScalar *aa=a->a,*v;
68: PetscScalar s1,*x,*t;
69: const PetscScalar *b;
72: VecGetArrayRead(bb,&b);
73: VecGetArray(xx,&x);
74: t = a->solve_work;
76: ISGetIndices(isrow,&rout); r = rout;
77: ISGetIndices(iscol,&cout); c = cout;
79: /* copy the b into temp work space according to permutation */
80: for (i=0; i<n; i++) t[i] = b[c[i]];
82: /* forward solve the U^T */
83: for (i=0; i<n; i++) {
85: v = aa + diag[i];
86: /* multiply by the inverse of the block diagonal */
87: s1 = (*v++)*t[i];
88: vi = aj + diag[i] + 1;
89: nz = ai[i+1] - diag[i] - 1;
90: while (nz--) {
91: t[*vi++] -= (*v++)*s1;
92: }
93: t[i] = s1;
94: }
95: /* backward solve the L^T */
96: for (i=n-1; i>=0; i--) {
97: v = aa + diag[i] - 1;
98: vi = aj + diag[i] - 1;
99: nz = diag[i] - ai[i];
100: s1 = t[i];
101: while (nz--) {
102: t[*vi--] -= (*v--)*s1;
103: }
104: }
106: /* copy t into x according to permutation */
107: for (i=0; i<n; i++) x[r[i]] = t[i];
109: ISRestoreIndices(isrow,&rout);
110: ISRestoreIndices(iscol,&cout);
111: VecRestoreArrayRead(bb,&b);
112: VecRestoreArray(xx,&x);
113: PetscLogFlops(2.0*(a->nz) - A->cmap->n);
114: return(0);
115: }
117: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
118: {
119: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
120: IS iscol=a->col,isrow=a->row;
121: PetscErrorCode ierr;
122: const PetscInt *r,*c,*rout,*cout;
123: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
124: PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
125: const MatScalar *aa=a->a,*v;
126: PetscScalar s1,s2,x1,x2,*x,*t;
127: const PetscScalar *b;
130: VecGetArrayRead(bb,&b);
131: VecGetArray(xx,&x);
132: t = a->solve_work;
134: ISGetIndices(isrow,&rout); r = rout;
135: ISGetIndices(iscol,&cout); c = cout;
137: /* copy the b into temp work space according to permutation */
138: ii = 0;
139: for (i=0; i<n; i++) {
140: ic = 2*c[i];
141: t[ii] = b[ic];
142: t[ii+1] = b[ic+1];
143: ii += 2;
144: }
146: /* forward solve the U^T */
147: idx = 0;
148: for (i=0; i<n; i++) {
150: v = aa + 4*diag[i];
151: /* multiply by the inverse of the block diagonal */
152: x1 = t[idx]; x2 = t[1+idx];
153: s1 = v[0]*x1 + v[1]*x2;
154: s2 = v[2]*x1 + v[3]*x2;
155: v += 4;
157: vi = aj + diag[i] + 1;
158: nz = ai[i+1] - diag[i] - 1;
159: while (nz--) {
160: oidx = 2*(*vi++);
161: t[oidx] -= v[0]*s1 + v[1]*s2;
162: t[oidx+1] -= v[2]*s1 + v[3]*s2;
163: v += 4;
164: }
165: t[idx] = s1;t[1+idx] = s2;
166: idx += 2;
167: }
168: /* backward solve the L^T */
169: for (i=n-1; i>=0; i--) {
170: v = aa + 4*diag[i] - 4;
171: vi = aj + diag[i] - 1;
172: nz = diag[i] - ai[i];
173: idt = 2*i;
174: s1 = t[idt]; s2 = t[1+idt];
175: while (nz--) {
176: idx = 2*(*vi--);
177: t[idx] -= v[0]*s1 + v[1]*s2;
178: t[idx+1] -= v[2]*s1 + v[3]*s2;
179: v -= 4;
180: }
181: }
183: /* copy t into x according to permutation */
184: ii = 0;
185: for (i=0; i<n; i++) {
186: ir = 2*r[i];
187: x[ir] = t[ii];
188: x[ir+1] = t[ii+1];
189: ii += 2;
190: }
192: ISRestoreIndices(isrow,&rout);
193: ISRestoreIndices(iscol,&cout);
194: VecRestoreArrayRead(bb,&b);
195: VecRestoreArray(xx,&x);
196: PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
197: return(0);
198: }
200: PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
201: {
202: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
203: PetscErrorCode ierr;
204: IS iscol=a->col,isrow=a->row;
205: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
206: const PetscInt *r,*c,*rout,*cout;
207: PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
208: const PetscInt bs =A->rmap->bs,bs2=a->bs2;
209: const MatScalar *aa=a->a,*v;
210: PetscScalar s1,s2,x1,x2,*x,*t;
211: const PetscScalar *b;
214: VecGetArrayRead(bb,&b);
215: VecGetArray(xx,&x);
216: t = a->solve_work;
218: ISGetIndices(isrow,&rout); r = rout;
219: ISGetIndices(iscol,&cout); c = cout;
221: /* copy b into temp work space according to permutation */
222: for (i=0; i<n; i++) {
223: ii = bs*i; ic = bs*c[i];
224: t[ii] = b[ic]; t[ii+1] = b[ic+1];
225: }
227: /* forward solve the U^T */
228: idx = 0;
229: for (i=0; i<n; i++) {
230: v = aa + bs2*diag[i];
231: /* multiply by the inverse of the block diagonal */
232: x1 = t[idx]; x2 = t[1+idx];
233: s1 = v[0]*x1 + v[1]*x2;
234: s2 = v[2]*x1 + v[3]*x2;
235: v -= bs2;
237: vi = aj + diag[i] - 1;
238: nz = diag[i] - diag[i+1] - 1;
239: for (j=0; j>-nz; j--) {
240: oidx = bs*vi[j];
241: t[oidx] -= v[0]*s1 + v[1]*s2;
242: t[oidx+1] -= v[2]*s1 + v[3]*s2;
243: v -= bs2;
244: }
245: t[idx] = s1;t[1+idx] = s2;
246: idx += bs;
247: }
248: /* backward solve the L^T */
249: for (i=n-1; i>=0; i--) {
250: v = aa + bs2*ai[i];
251: vi = aj + ai[i];
252: nz = ai[i+1] - ai[i];
253: idt = bs*i;
254: s1 = t[idt]; s2 = t[1+idt];
255: for (j=0; j<nz; j++) {
256: idx = bs*vi[j];
257: t[idx] -= v[0]*s1 + v[1]*s2;
258: t[idx+1] -= v[2]*s1 + v[3]*s2;
259: v += bs2;
260: }
261: }
263: /* copy t into x according to permutation */
264: for (i=0; i<n; i++) {
265: ii = bs*i; ir = bs*r[i];
266: x[ir] = t[ii]; x[ir+1] = t[ii+1];
267: }
269: ISRestoreIndices(isrow,&rout);
270: ISRestoreIndices(iscol,&cout);
271: VecRestoreArrayRead(bb,&b);
272: VecRestoreArray(xx,&x);
273: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
274: return(0);
275: }
277: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
278: {
279: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
280: IS iscol=a->col,isrow=a->row;
281: PetscErrorCode ierr;
282: const PetscInt *r,*c,*rout,*cout;
283: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
284: PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
285: const MatScalar *aa=a->a,*v;
286: PetscScalar s1,s2,s3,x1,x2,x3,*x,*t;
287: const PetscScalar *b;
290: VecGetArrayRead(bb,&b);
291: VecGetArray(xx,&x);
292: t = a->solve_work;
294: ISGetIndices(isrow,&rout); r = rout;
295: ISGetIndices(iscol,&cout); c = cout;
297: /* copy the b into temp work space according to permutation */
298: ii = 0;
299: for (i=0; i<n; i++) {
300: ic = 3*c[i];
301: t[ii] = b[ic];
302: t[ii+1] = b[ic+1];
303: t[ii+2] = b[ic+2];
304: ii += 3;
305: }
307: /* forward solve the U^T */
308: idx = 0;
309: for (i=0; i<n; i++) {
311: v = aa + 9*diag[i];
312: /* multiply by the inverse of the block diagonal */
313: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
314: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
315: s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
316: s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
317: v += 9;
319: vi = aj + diag[i] + 1;
320: nz = ai[i+1] - diag[i] - 1;
321: while (nz--) {
322: oidx = 3*(*vi++);
323: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
324: t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
325: t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
326: v += 9;
327: }
328: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;
329: idx += 3;
330: }
331: /* backward solve the L^T */
332: for (i=n-1; i>=0; i--) {
333: v = aa + 9*diag[i] - 9;
334: vi = aj + diag[i] - 1;
335: nz = diag[i] - ai[i];
336: idt = 3*i;
337: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
338: while (nz--) {
339: idx = 3*(*vi--);
340: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
341: t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
342: t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
343: v -= 9;
344: }
345: }
347: /* copy t into x according to permutation */
348: ii = 0;
349: for (i=0; i<n; i++) {
350: ir = 3*r[i];
351: x[ir] = t[ii];
352: x[ir+1] = t[ii+1];
353: x[ir+2] = t[ii+2];
354: ii += 3;
355: }
357: ISRestoreIndices(isrow,&rout);
358: ISRestoreIndices(iscol,&cout);
359: VecRestoreArrayRead(bb,&b);
360: VecRestoreArray(xx,&x);
361: PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
362: return(0);
363: }
365: PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
366: {
367: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
368: PetscErrorCode ierr;
369: IS iscol=a->col,isrow=a->row;
370: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
371: const PetscInt *r,*c,*rout,*cout;
372: PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
373: const PetscInt bs =A->rmap->bs,bs2=a->bs2;
374: const MatScalar *aa=a->a,*v;
375: PetscScalar s1,s2,s3,x1,x2,x3,*x,*t;
376: const PetscScalar *b;
379: VecGetArrayRead(bb,&b);
380: VecGetArray(xx,&x);
381: t = a->solve_work;
383: ISGetIndices(isrow,&rout); r = rout;
384: ISGetIndices(iscol,&cout); c = cout;
386: /* copy b into temp work space according to permutation */
387: for (i=0; i<n; i++) {
388: ii = bs*i; ic = bs*c[i];
389: t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2];
390: }
392: /* forward solve the U^T */
393: idx = 0;
394: for (i=0; i<n; i++) {
395: v = aa + bs2*diag[i];
396: /* multiply by the inverse of the block diagonal */
397: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
398: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
399: s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
400: s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
401: v -= bs2;
403: vi = aj + diag[i] - 1;
404: nz = diag[i] - diag[i+1] - 1;
405: for (j=0; j>-nz; j--) {
406: oidx = bs*vi[j];
407: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
408: t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
409: t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
410: v -= bs2;
411: }
412: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;
413: idx += bs;
414: }
415: /* backward solve the L^T */
416: for (i=n-1; i>=0; i--) {
417: v = aa + bs2*ai[i];
418: vi = aj + ai[i];
419: nz = ai[i+1] - ai[i];
420: idt = bs*i;
421: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
422: for (j=0; j<nz; j++) {
423: idx = bs*vi[j];
424: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
425: t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
426: t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
427: v += bs2;
428: }
429: }
431: /* copy t into x according to permutation */
432: for (i=0; i<n; i++) {
433: ii = bs*i; ir = bs*r[i];
434: x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];
435: }
437: ISRestoreIndices(isrow,&rout);
438: ISRestoreIndices(iscol,&cout);
439: VecRestoreArrayRead(bb,&b);
440: VecRestoreArray(xx,&x);
441: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
442: return(0);
443: }
445: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
446: {
447: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
448: IS iscol=a->col,isrow=a->row;
449: PetscErrorCode ierr;
450: const PetscInt *r,*c,*rout,*cout;
451: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
452: PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
453: const MatScalar *aa=a->a,*v;
454: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
455: const PetscScalar *b;
458: VecGetArrayRead(bb,&b);
459: VecGetArray(xx,&x);
460: t = a->solve_work;
462: ISGetIndices(isrow,&rout); r = rout;
463: ISGetIndices(iscol,&cout); c = cout;
465: /* copy the b into temp work space according to permutation */
466: ii = 0;
467: for (i=0; i<n; i++) {
468: ic = 4*c[i];
469: t[ii] = b[ic];
470: t[ii+1] = b[ic+1];
471: t[ii+2] = b[ic+2];
472: t[ii+3] = b[ic+3];
473: ii += 4;
474: }
476: /* forward solve the U^T */
477: idx = 0;
478: for (i=0; i<n; i++) {
480: v = aa + 16*diag[i];
481: /* multiply by the inverse of the block diagonal */
482: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx];
483: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
484: s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
485: s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
486: s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
487: v += 16;
489: vi = aj + diag[i] + 1;
490: nz = ai[i+1] - diag[i] - 1;
491: while (nz--) {
492: oidx = 4*(*vi++);
493: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
494: t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
495: t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
496: t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
497: v += 16;
498: }
499: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
500: idx += 4;
501: }
502: /* backward solve the L^T */
503: for (i=n-1; i>=0; i--) {
504: v = aa + 16*diag[i] - 16;
505: vi = aj + diag[i] - 1;
506: nz = diag[i] - ai[i];
507: idt = 4*i;
508: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
509: while (nz--) {
510: idx = 4*(*vi--);
511: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
512: t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
513: t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
514: t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
515: v -= 16;
516: }
517: }
519: /* copy t into x according to permutation */
520: ii = 0;
521: for (i=0; i<n; i++) {
522: ir = 4*r[i];
523: x[ir] = t[ii];
524: x[ir+1] = t[ii+1];
525: x[ir+2] = t[ii+2];
526: x[ir+3] = t[ii+3];
527: ii += 4;
528: }
530: ISRestoreIndices(isrow,&rout);
531: ISRestoreIndices(iscol,&cout);
532: VecRestoreArrayRead(bb,&b);
533: VecRestoreArray(xx,&x);
534: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
535: return(0);
536: }
538: PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
539: {
540: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
541: PetscErrorCode ierr;
542: IS iscol=a->col,isrow=a->row;
543: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
544: const PetscInt *r,*c,*rout,*cout;
545: PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
546: const PetscInt bs =A->rmap->bs,bs2=a->bs2;
547: const MatScalar *aa=a->a,*v;
548: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
549: const PetscScalar *b;
552: VecGetArrayRead(bb,&b);
553: VecGetArray(xx,&x);
554: t = a->solve_work;
556: ISGetIndices(isrow,&rout); r = rout;
557: ISGetIndices(iscol,&cout); c = cout;
559: /* copy b into temp work space according to permutation */
560: for (i=0; i<n; i++) {
561: ii = bs*i; ic = bs*c[i];
562: t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
563: }
565: /* forward solve the U^T */
566: idx = 0;
567: for (i=0; i<n; i++) {
568: v = aa + bs2*diag[i];
569: /* multiply by the inverse of the block diagonal */
570: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx];
571: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
572: s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
573: s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
574: s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
575: v -= bs2;
577: vi = aj + diag[i] - 1;
578: nz = diag[i] - diag[i+1] - 1;
579: for (j=0; j>-nz; j--) {
580: oidx = bs*vi[j];
581: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
582: t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
583: t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
584: t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
585: v -= bs2;
586: }
587: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4;
588: idx += bs;
589: }
590: /* backward solve the L^T */
591: for (i=n-1; i>=0; i--) {
592: v = aa + bs2*ai[i];
593: vi = aj + ai[i];
594: nz = ai[i+1] - ai[i];
595: idt = bs*i;
596: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt];
597: for (j=0; j<nz; j++) {
598: idx = bs*vi[j];
599: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
600: t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
601: t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
602: t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
603: v += bs2;
604: }
605: }
607: /* copy t into x according to permutation */
608: for (i=0; i<n; i++) {
609: ii = bs*i; ir = bs*r[i];
610: x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3];
611: }
613: ISRestoreIndices(isrow,&rout);
614: ISRestoreIndices(iscol,&cout);
615: VecRestoreArrayRead(bb,&b);
616: VecRestoreArray(xx,&x);
617: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
618: return(0);
619: }
621: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
622: {
623: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
624: IS iscol=a->col,isrow=a->row;
625: PetscErrorCode ierr;
626: const PetscInt *r,*c,*rout,*cout;
627: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
628: PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
629: const MatScalar *aa=a->a,*v;
630: PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
631: const PetscScalar *b;
634: VecGetArrayRead(bb,&b);
635: VecGetArray(xx,&x);
636: t = a->solve_work;
638: ISGetIndices(isrow,&rout); r = rout;
639: ISGetIndices(iscol,&cout); c = cout;
641: /* copy the b into temp work space according to permutation */
642: ii = 0;
643: for (i=0; i<n; i++) {
644: ic = 5*c[i];
645: t[ii] = b[ic];
646: t[ii+1] = b[ic+1];
647: t[ii+2] = b[ic+2];
648: t[ii+3] = b[ic+3];
649: t[ii+4] = b[ic+4];
650: ii += 5;
651: }
653: /* forward solve the U^T */
654: idx = 0;
655: for (i=0; i<n; i++) {
657: v = aa + 25*diag[i];
658: /* multiply by the inverse of the block diagonal */
659: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
660: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
661: s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
662: s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
663: s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
664: s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
665: v += 25;
667: vi = aj + diag[i] + 1;
668: nz = ai[i+1] - diag[i] - 1;
669: while (nz--) {
670: oidx = 5*(*vi++);
671: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
672: t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
673: t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
674: t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
675: t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
676: v += 25;
677: }
678: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
679: idx += 5;
680: }
681: /* backward solve the L^T */
682: for (i=n-1; i>=0; i--) {
683: v = aa + 25*diag[i] - 25;
684: vi = aj + diag[i] - 1;
685: nz = diag[i] - ai[i];
686: idt = 5*i;
687: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
688: while (nz--) {
689: idx = 5*(*vi--);
690: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
691: t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
692: t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
693: t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
694: t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
695: v -= 25;
696: }
697: }
699: /* copy t into x according to permutation */
700: ii = 0;
701: for (i=0; i<n; i++) {
702: ir = 5*r[i];
703: x[ir] = t[ii];
704: x[ir+1] = t[ii+1];
705: x[ir+2] = t[ii+2];
706: x[ir+3] = t[ii+3];
707: x[ir+4] = t[ii+4];
708: ii += 5;
709: }
711: ISRestoreIndices(isrow,&rout);
712: ISRestoreIndices(iscol,&cout);
713: VecRestoreArrayRead(bb,&b);
714: VecRestoreArray(xx,&x);
715: PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
716: return(0);
717: }
719: PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
720: {
721: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
722: PetscErrorCode ierr;
723: IS iscol=a->col,isrow=a->row;
724: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
725: const PetscInt *r,*c,*rout,*cout;
726: PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
727: const PetscInt bs =A->rmap->bs,bs2=a->bs2;
728: const MatScalar *aa=a->a,*v;
729: PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
730: const PetscScalar *b;
733: VecGetArrayRead(bb,&b);
734: VecGetArray(xx,&x);
735: t = a->solve_work;
737: ISGetIndices(isrow,&rout); r = rout;
738: ISGetIndices(iscol,&cout); c = cout;
740: /* copy b into temp work space according to permutation */
741: for (i=0; i<n; i++) {
742: ii = bs*i; ic = bs*c[i];
743: t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
744: t[ii+4] = b[ic+4];
745: }
747: /* forward solve the U^T */
748: idx = 0;
749: for (i=0; i<n; i++) {
750: v = aa + bs2*diag[i];
751: /* multiply by the inverse of the block diagonal */
752: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
753: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
754: s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
755: s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
756: s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
757: s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
758: v -= bs2;
760: vi = aj + diag[i] - 1;
761: nz = diag[i] - diag[i+1] - 1;
762: for (j=0; j>-nz; j--) {
763: oidx = bs*vi[j];
764: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
765: t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
766: t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
767: t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
768: t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
769: v -= bs2;
770: }
771: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5;
772: idx += bs;
773: }
774: /* backward solve the L^T */
775: for (i=n-1; i>=0; i--) {
776: v = aa + bs2*ai[i];
777: vi = aj + ai[i];
778: nz = ai[i+1] - ai[i];
779: idt = bs*i;
780: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt];
781: for (j=0; j<nz; j++) {
782: idx = bs*vi[j];
783: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
784: t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
785: t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
786: t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
787: t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
788: v += bs2;
789: }
790: }
792: /* copy t into x according to permutation */
793: for (i=0; i<n; i++) {
794: ii = bs*i; ir = bs*r[i];
795: x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3];
796: x[ir+4] = t[ii+4];
797: }
799: ISRestoreIndices(isrow,&rout);
800: ISRestoreIndices(iscol,&cout);
801: VecRestoreArrayRead(bb,&b);
802: VecRestoreArray(xx,&x);
803: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
804: return(0);
805: }
807: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
808: {
809: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
810: IS iscol=a->col,isrow=a->row;
811: PetscErrorCode ierr;
812: const PetscInt *r,*c,*rout,*cout;
813: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
814: PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
815: const MatScalar *aa=a->a,*v;
816: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
817: const PetscScalar *b;
820: VecGetArrayRead(bb,&b);
821: VecGetArray(xx,&x);
822: t = a->solve_work;
824: ISGetIndices(isrow,&rout); r = rout;
825: ISGetIndices(iscol,&cout); c = cout;
827: /* copy the b into temp work space according to permutation */
828: ii = 0;
829: for (i=0; i<n; i++) {
830: ic = 6*c[i];
831: t[ii] = b[ic];
832: t[ii+1] = b[ic+1];
833: t[ii+2] = b[ic+2];
834: t[ii+3] = b[ic+3];
835: t[ii+4] = b[ic+4];
836: t[ii+5] = b[ic+5];
837: ii += 6;
838: }
840: /* forward solve the U^T */
841: idx = 0;
842: for (i=0; i<n; i++) {
844: v = aa + 36*diag[i];
845: /* multiply by the inverse of the block diagonal */
846: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
847: x6 = t[5+idx];
848: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
849: s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
850: s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
851: s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
852: s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
853: s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
854: v += 36;
856: vi = aj + diag[i] + 1;
857: nz = ai[i+1] - diag[i] - 1;
858: while (nz--) {
859: oidx = 6*(*vi++);
860: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
861: t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
862: t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
863: t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
864: t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
865: t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
866: v += 36;
867: }
868: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
869: t[5+idx] = s6;
870: idx += 6;
871: }
872: /* backward solve the L^T */
873: for (i=n-1; i>=0; i--) {
874: v = aa + 36*diag[i] - 36;
875: vi = aj + diag[i] - 1;
876: nz = diag[i] - ai[i];
877: idt = 6*i;
878: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
879: s6 = t[5+idt];
880: while (nz--) {
881: idx = 6*(*vi--);
882: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
883: t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
884: t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
885: t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
886: t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
887: t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
888: v -= 36;
889: }
890: }
892: /* copy t into x according to permutation */
893: ii = 0;
894: for (i=0; i<n; i++) {
895: ir = 6*r[i];
896: x[ir] = t[ii];
897: x[ir+1] = t[ii+1];
898: x[ir+2] = t[ii+2];
899: x[ir+3] = t[ii+3];
900: x[ir+4] = t[ii+4];
901: x[ir+5] = t[ii+5];
902: ii += 6;
903: }
905: ISRestoreIndices(isrow,&rout);
906: ISRestoreIndices(iscol,&cout);
907: VecRestoreArrayRead(bb,&b);
908: VecRestoreArray(xx,&x);
909: PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
910: return(0);
911: }
913: PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
914: {
915: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
916: PetscErrorCode ierr;
917: IS iscol=a->col,isrow=a->row;
918: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
919: const PetscInt *r,*c,*rout,*cout;
920: PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
921: const PetscInt bs =A->rmap->bs,bs2=a->bs2;
922: const MatScalar *aa=a->a,*v;
923: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
924: const PetscScalar *b;
927: VecGetArrayRead(bb,&b);
928: VecGetArray(xx,&x);
929: t = a->solve_work;
931: ISGetIndices(isrow,&rout); r = rout;
932: ISGetIndices(iscol,&cout); c = cout;
934: /* copy b into temp work space according to permutation */
935: for (i=0; i<n; i++) {
936: ii = bs*i; ic = bs*c[i];
937: t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
938: t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5];
939: }
941: /* forward solve the U^T */
942: idx = 0;
943: for (i=0; i<n; i++) {
944: v = aa + bs2*diag[i];
945: /* multiply by the inverse of the block diagonal */
946: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
947: x6 = t[5+idx];
948: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
949: s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
950: s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
951: s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
952: s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
953: s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
954: v -= bs2;
956: vi = aj + diag[i] - 1;
957: nz = diag[i] - diag[i+1] - 1;
958: for (j=0; j>-nz; j--) {
959: oidx = bs*vi[j];
960: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
961: t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
962: t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
963: t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
964: t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
965: t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
966: v -= bs2;
967: }
968: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5;
969: t[5+idx] = s6;
970: idx += bs;
971: }
972: /* backward solve the L^T */
973: for (i=n-1; i>=0; i--) {
974: v = aa + bs2*ai[i];
975: vi = aj + ai[i];
976: nz = ai[i+1] - ai[i];
977: idt = bs*i;
978: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt];
979: s6 = t[5+idt];
980: for (j=0; j<nz; j++) {
981: idx = bs*vi[j];
982: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
983: t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
984: t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
985: t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
986: t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
987: t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
988: v += bs2;
989: }
990: }
992: /* copy t into x according to permutation */
993: for (i=0; i<n; i++) {
994: ii = bs*i; ir = bs*r[i];
995: x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3];
996: x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5];
997: }
999: ISRestoreIndices(isrow,&rout);
1000: ISRestoreIndices(iscol,&cout);
1001: VecRestoreArrayRead(bb,&b);
1002: VecRestoreArray(xx,&x);
1003: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1004: return(0);
1005: }
1007: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
1008: {
1009: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
1010: IS iscol=a->col,isrow=a->row;
1011: PetscErrorCode ierr;
1012: const PetscInt *r,*c,*rout,*cout;
1013: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1014: PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
1015: const MatScalar *aa=a->a,*v;
1016: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1017: const PetscScalar *b;
1020: VecGetArrayRead(bb,&b);
1021: VecGetArray(xx,&x);
1022: t = a->solve_work;
1024: ISGetIndices(isrow,&rout); r = rout;
1025: ISGetIndices(iscol,&cout); c = cout;
1027: /* copy the b into temp work space according to permutation */
1028: ii = 0;
1029: for (i=0; i<n; i++) {
1030: ic = 7*c[i];
1031: t[ii] = b[ic];
1032: t[ii+1] = b[ic+1];
1033: t[ii+2] = b[ic+2];
1034: t[ii+3] = b[ic+3];
1035: t[ii+4] = b[ic+4];
1036: t[ii+5] = b[ic+5];
1037: t[ii+6] = b[ic+6];
1038: ii += 7;
1039: }
1041: /* forward solve the U^T */
1042: idx = 0;
1043: for (i=0; i<n; i++) {
1045: v = aa + 49*diag[i];
1046: /* multiply by the inverse of the block diagonal */
1047: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1048: x6 = t[5+idx]; x7 = t[6+idx];
1049: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
1050: s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1051: s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1052: s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1053: s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1054: s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1055: s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1056: v += 49;
1058: vi = aj + diag[i] + 1;
1059: nz = ai[i+1] - diag[i] - 1;
1060: while (nz--) {
1061: oidx = 7*(*vi++);
1062: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
1063: t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1064: t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1065: t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1066: t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1067: t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1068: t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1069: v += 49;
1070: }
1071: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1072: t[5+idx] = s6;t[6+idx] = s7;
1073: idx += 7;
1074: }
1075: /* backward solve the L^T */
1076: for (i=n-1; i>=0; i--) {
1077: v = aa + 49*diag[i] - 49;
1078: vi = aj + diag[i] - 1;
1079: nz = diag[i] - ai[i];
1080: idt = 7*i;
1081: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1082: s6 = t[5+idt];s7 = t[6+idt];
1083: while (nz--) {
1084: idx = 7*(*vi--);
1085: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
1086: t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1087: t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1088: t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1089: t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1090: t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1091: t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1092: v -= 49;
1093: }
1094: }
1096: /* copy t into x according to permutation */
1097: ii = 0;
1098: for (i=0; i<n; i++) {
1099: ir = 7*r[i];
1100: x[ir] = t[ii];
1101: x[ir+1] = t[ii+1];
1102: x[ir+2] = t[ii+2];
1103: x[ir+3] = t[ii+3];
1104: x[ir+4] = t[ii+4];
1105: x[ir+5] = t[ii+5];
1106: x[ir+6] = t[ii+6];
1107: ii += 7;
1108: }
1110: ISRestoreIndices(isrow,&rout);
1111: ISRestoreIndices(iscol,&cout);
1112: VecRestoreArrayRead(bb,&b);
1113: VecRestoreArray(xx,&x);
1114: PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
1115: return(0);
1116: }
1117: PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1118: {
1119: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
1120: PetscErrorCode ierr;
1121: IS iscol=a->col,isrow=a->row;
1122: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1123: const PetscInt *r,*c,*rout,*cout;
1124: PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
1125: const PetscInt bs =A->rmap->bs,bs2=a->bs2;
1126: const MatScalar *aa=a->a,*v;
1127: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1128: const PetscScalar *b;
1131: VecGetArrayRead(bb,&b);
1132: VecGetArray(xx,&x);
1133: t = a->solve_work;
1135: ISGetIndices(isrow,&rout); r = rout;
1136: ISGetIndices(iscol,&cout); c = cout;
1138: /* copy b into temp work space according to permutation */
1139: for (i=0; i<n; i++) {
1140: ii = bs*i; ic = bs*c[i];
1141: t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1142: t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; t[ii+6] = b[ic+6];
1143: }
1145: /* forward solve the U^T */
1146: idx = 0;
1147: for (i=0; i<n; i++) {
1148: v = aa + bs2*diag[i];
1149: /* multiply by the inverse of the block diagonal */
1150: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1151: x6 = t[5+idx]; x7 = t[6+idx];
1152: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
1153: s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1154: s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1155: s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1156: s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1157: s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1158: s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1159: v -= bs2;
1161: vi = aj + diag[i] - 1;
1162: nz = diag[i] - diag[i+1] - 1;
1163: for (j=0; j>-nz; j--) {
1164: oidx = bs*vi[j];
1165: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
1166: t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1167: t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1168: t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1169: t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1170: t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1171: t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1172: v -= bs2;
1173: }
1174: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5;
1175: t[5+idx] = s6; t[6+idx] = s7;
1176: idx += bs;
1177: }
1178: /* backward solve the L^T */
1179: for (i=n-1; i>=0; i--) {
1180: v = aa + bs2*ai[i];
1181: vi = aj + ai[i];
1182: nz = ai[i+1] - ai[i];
1183: idt = bs*i;
1184: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt];
1185: s6 = t[5+idt]; s7 = t[6+idt];
1186: for (j=0; j<nz; j++) {
1187: idx = bs*vi[j];
1188: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
1189: t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1190: t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1191: t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1192: t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1193: t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1194: t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1195: v += bs2;
1196: }
1197: }
1199: /* copy t into x according to permutation */
1200: for (i=0; i<n; i++) {
1201: ii = bs*i; ir = bs*r[i];
1202: x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3];
1203: x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; x[ir+6] = t[ii+6];
1204: }
1206: ISRestoreIndices(isrow,&rout);
1207: ISRestoreIndices(iscol,&cout);
1208: VecRestoreArrayRead(bb,&b);
1209: VecRestoreArray(xx,&x);
1210: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1211: return(0);
1212: }
1214: /* ----------------------------------------------------------- */
1215: PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
1216: {
1217: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
1218: IS iscol=a->col,isrow=a->row;
1219: PetscErrorCode ierr;
1220: const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
1221: PetscInt i,nz,j;
1222: const PetscInt n =a->mbs,bs=A->rmap->bs,bs2=a->bs2;
1223: const MatScalar *aa=a->a,*v;
1224: PetscScalar *x,*t,*ls;
1225: const PetscScalar *b;
1228: VecGetArrayRead(bb,&b);
1229: VecGetArray(xx,&x);
1230: t = a->solve_work;
1232: ISGetIndices(isrow,&rout); r = rout;
1233: ISGetIndices(iscol,&cout); c = cout;
1235: /* copy the b into temp work space according to permutation */
1236: for (i=0; i<n; i++) {
1237: for (j=0; j<bs; j++) {
1238: t[i*bs+j] = b[c[i]*bs+j];
1239: }
1240: }
1243: /* forward solve the upper triangular transpose */
1244: ls = a->solve_work + A->cmap->n;
1245: for (i=0; i<n; i++) {
1246: PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
1247: PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1248: v = aa + bs2*(a->diag[i] + 1);
1249: vi = aj + a->diag[i] + 1;
1250: nz = ai[i+1] - a->diag[i] - 1;
1251: while (nz--) {
1252: PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
1253: v += bs2;
1254: }
1255: }
1257: /* backward solve the lower triangular transpose */
1258: for (i=n-1; i>=0; i--) {
1259: v = aa + bs2*ai[i];
1260: vi = aj + ai[i];
1261: nz = a->diag[i] - ai[i];
1262: while (nz--) {
1263: PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
1264: v += bs2;
1265: }
1266: }
1268: /* copy t into x according to permutation */
1269: for (i=0; i<n; i++) {
1270: for (j=0; j<bs; j++) {
1271: x[bs*r[i]+j] = t[bs*i+j];
1272: }
1273: }
1275: ISRestoreIndices(isrow,&rout);
1276: ISRestoreIndices(iscol,&cout);
1277: VecRestoreArrayRead(bb,&b);
1278: VecRestoreArray(xx,&x);
1279: PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
1280: return(0);
1281: }
1283: PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1284: {
1285: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
1286: IS iscol=a->col,isrow=a->row;
1287: PetscErrorCode ierr;
1288: const PetscInt *r,*c,*rout,*cout;
1289: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag;
1290: PetscInt i,j,nz;
1291: const PetscInt bs =A->rmap->bs,bs2=a->bs2;
1292: const MatScalar *aa=a->a,*v;
1293: PetscScalar *x,*t,*ls;
1294: const PetscScalar *b;
1297: VecGetArrayRead(bb,&b);
1298: VecGetArray(xx,&x);
1299: t = a->solve_work;
1301: ISGetIndices(isrow,&rout); r = rout;
1302: ISGetIndices(iscol,&cout); c = cout;
1304: /* copy the b into temp work space according to permutation */
1305: for (i=0; i<n; i++) {
1306: for (j=0; j<bs; j++) {
1307: t[i*bs+j] = b[c[i]*bs+j];
1308: }
1309: }
1312: /* forward solve the upper triangular transpose */
1313: ls = a->solve_work + A->cmap->n;
1314: for (i=0; i<n; i++) {
1315: PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
1316: PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs);
1317: v = aa + bs2*(diag[i] - 1);
1318: vi = aj + diag[i] - 1;
1319: nz = diag[i] - diag[i+1] - 1;
1320: for (j=0; j>-nz; j--) {
1321: PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
1322: v -= bs2;
1323: }
1324: }
1326: /* backward solve the lower triangular transpose */
1327: for (i=n-1; i>=0; i--) {
1328: v = aa + bs2*ai[i];
1329: vi = aj + ai[i];
1330: nz = ai[i+1] - ai[i];
1331: for (j=0; j<nz; j++) {
1332: PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
1333: v += bs2;
1334: }
1335: }
1337: /* copy t into x according to permutation */
1338: for (i=0; i<n; i++) {
1339: for (j=0; j<bs; j++) {
1340: x[bs*r[i]+j] = t[bs*i+j];
1341: }
1342: }
1344: ISRestoreIndices(isrow,&rout);
1345: ISRestoreIndices(iscol,&cout);
1346: VecRestoreArrayRead(bb,&b);
1347: VecRestoreArray(xx,&x);
1348: PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
1349: return(0);
1350: }