Actual source code: baijsolvtrannat.c

petsc-3.4.5 2014-06-29
  1: #include <../src/mat/impls/baij/seq/baij.h>

  5: PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
  6: {
  7:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
  8:   PetscErrorCode    ierr;
  9:   const PetscInt    *adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
 10:   PetscInt          i,n = a->mbs,j;
 11:   PetscInt          nz;
 12:   PetscScalar       *x,*tmp,s1;
 13:   const MatScalar   *aa = a->a,*v;
 14:   const PetscScalar *b;

 17:   VecGetArrayRead(bb,&b);
 18:   VecGetArray(xx,&x);
 19:   tmp  = a->solve_work;


 22:   /* copy the b into temp work space according to permutation */
 23:   for (i=0; i<n; i++) tmp[i] = b[i];

 25:   /* forward solve the U^T */
 26:   for (i=0; i<n; i++) {
 27:     v   = aa + adiag[i+1] + 1;
 28:     vi  = aj + adiag[i+1] + 1;
 29:     nz  = adiag[i] - adiag[i+1] - 1;
 30:     s1  = tmp[i];
 31:     s1 *= v[nz];  /* multiply by inverse of diagonal entry */
 32:     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
 33:     tmp[i] = s1;
 34:   }

 36:   /* backward solve the L^T */
 37:   for (i=n-1; i>=0; i--) {
 38:     v  = aa + ai[i];
 39:     vi = aj + ai[i];
 40:     nz = ai[i+1] - ai[i];
 41:     s1 = tmp[i];
 42:     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
 43:   }

 45:   /* copy tmp into x according to permutation */
 46:   for (i=0; i<n; i++) x[i] = tmp[i];

 48:   VecRestoreArrayRead(bb,&b);
 49:   VecRestoreArray(xx,&x);

 51:   PetscLogFlops(2.0*a->nz-A->cmap->n);
 52:   return(0);
 53: }

 57: PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
 58: {
 59:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
 60:   PetscErrorCode  ierr;
 61:   PetscInt        i,nz;
 62:   const PetscInt  *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
 63:   const MatScalar *aa   =a->a,*v;
 64:   PetscScalar     s1,*x;

 67:   VecCopy(bb,xx);
 68:   VecGetArray(xx,&x);

 70:   /* forward solve the U^T */
 71:   for (i=0; i<n; i++) {

 73:     v = aa + diag[i];
 74:     /* multiply by the inverse of the block diagonal */
 75:     s1 = (*v++)*x[i];
 76:     vi = aj + diag[i] + 1;
 77:     nz = ai[i+1] - diag[i] - 1;
 78:     while (nz--) {
 79:       x[*vi++] -= (*v++)*s1;
 80:     }
 81:     x[i] = s1;
 82:   }
 83:   /* backward solve the L^T */
 84:   for (i=n-1; i>=0; i--) {
 85:     v  = aa + diag[i] - 1;
 86:     vi = aj + diag[i] - 1;
 87:     nz = diag[i] - ai[i];
 88:     s1 = x[i];
 89:     while (nz--) {
 90:       x[*vi--] -=  (*v--)*s1;
 91:     }
 92:   }
 93:   VecRestoreArray(xx,&x);
 94:   PetscLogFlops(2.0*(a->nz) - A->cmap->n);
 95:   return(0);
 96: }

100: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
101: {
102:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
103:   PetscErrorCode  ierr;
104:   PetscInt        i,nz,idx,idt,oidx;
105:   const PetscInt  *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
106:   const MatScalar *aa   =a->a,*v;
107:   PetscScalar     s1,s2,x1,x2,*x;

110:   VecCopy(bb,xx);
111:   VecGetArray(xx,&x);

113:   /* forward solve the U^T */
114:   idx = 0;
115:   for (i=0; i<n; i++) {

117:     v = aa + 4*diag[i];
118:     /* multiply by the inverse of the block diagonal */
119:     x1 = x[idx];   x2 = x[1+idx];
120:     s1 = v[0]*x1  +  v[1]*x2;
121:     s2 = v[2]*x1  +  v[3]*x2;
122:     v += 4;

124:     vi = aj + diag[i] + 1;
125:     nz = ai[i+1] - diag[i] - 1;
126:     while (nz--) {
127:       oidx       = 2*(*vi++);
128:       x[oidx]   -= v[0]*s1  +  v[1]*s2;
129:       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
130:       v         += 4;
131:     }
132:     x[idx] = s1;x[1+idx] = s2;
133:     idx   += 2;
134:   }
135:   /* backward solve the L^T */
136:   for (i=n-1; i>=0; i--) {
137:     v   = aa + 4*diag[i] - 4;
138:     vi  = aj + diag[i] - 1;
139:     nz  = diag[i] - ai[i];
140:     idt = 2*i;
141:     s1  = x[idt];  s2 = x[1+idt];
142:     while (nz--) {
143:       idx       = 2*(*vi--);
144:       x[idx]   -=  v[0]*s1 +  v[1]*s2;
145:       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
146:       v        -= 4;
147:     }
148:   }
149:   VecRestoreArray(xx,&x);
150:   PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);
151:   return(0);
152: }

156: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
157: {
158:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
159:   PetscErrorCode  ierr;
160:   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
161:   PetscInt        nz,idx,idt,j,i,oidx;
162:   const PetscInt  bs =A->rmap->bs,bs2=a->bs2;
163:   const MatScalar *aa=a->a,*v;
164:   PetscScalar     s1,s2,x1,x2,*x;

167:   VecCopy(bb,xx);
168:   VecGetArray(xx,&x);

170:   /* forward solve the U^T */
171:   idx = 0;
172:   for (i=0; i<n; i++) {
173:     v = aa + bs2*diag[i];
174:     /* multiply by the inverse of the block diagonal */
175:     x1 = x[idx];   x2 = x[1+idx];
176:     s1 = v[0]*x1  +  v[1]*x2;
177:     s2 = v[2]*x1  +  v[3]*x2;
178:     v -= bs2;

180:     vi = aj + diag[i] - 1;
181:     nz = diag[i] - diag[i+1] - 1;
182:     for (j=0; j>-nz; j--) {
183:       oidx       = bs*vi[j];
184:       x[oidx]   -= v[0]*s1  +  v[1]*s2;
185:       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
186:       v         -= bs2;
187:     }
188:     x[idx] = s1;x[1+idx] = s2;
189:     idx   += bs;
190:   }
191:   /* backward solve the L^T */
192:   for (i=n-1; i>=0; i--) {
193:     v   = aa + bs2*ai[i];
194:     vi  = aj + ai[i];
195:     nz  = ai[i+1] - ai[i];
196:     idt = bs*i;
197:     s1  = x[idt];  s2 = x[1+idt];
198:     for (j=0; j<nz; j++) {
199:       idx       = bs*vi[j];
200:       x[idx]   -=  v[0]*s1 +  v[1]*s2;
201:       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
202:       v        += bs2;
203:     }
204:   }
205:   VecRestoreArray(xx,&x);
206:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
207:   return(0);
208: }

212: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
213: {
214:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
215:   PetscErrorCode  ierr;
216:   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
217:   PetscInt        i,nz,idx,idt,oidx;
218:   const MatScalar *aa=a->a,*v;
219:   PetscScalar     s1,s2,s3,x1,x2,x3,*x;

222:   VecCopy(bb,xx);
223:   VecGetArray(xx,&x);

225:   /* forward solve the U^T */
226:   idx = 0;
227:   for (i=0; i<n; i++) {

229:     v = aa + 9*diag[i];
230:     /* multiply by the inverse of the block diagonal */
231:     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
232:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
233:     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
234:     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
235:     v += 9;

237:     vi = aj + diag[i] + 1;
238:     nz = ai[i+1] - diag[i] - 1;
239:     while (nz--) {
240:       oidx       = 3*(*vi++);
241:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
242:       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
243:       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
244:       v         += 9;
245:     }
246:     x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;
247:     idx   += 3;
248:   }
249:   /* backward solve the L^T */
250:   for (i=n-1; i>=0; i--) {
251:     v   = aa + 9*diag[i] - 9;
252:     vi  = aj + diag[i] - 1;
253:     nz  = diag[i] - ai[i];
254:     idt = 3*i;
255:     s1  = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
256:     while (nz--) {
257:       idx       = 3*(*vi--);
258:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
259:       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
260:       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
261:       v        -= 9;
262:     }
263:   }
264:   VecRestoreArray(xx,&x);
265:   PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);
266:   return(0);
267: }

271: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
272: {
273:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
274:   PetscErrorCode  ierr;
275:   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
276:   PetscInt        nz,idx,idt,j,i,oidx;
277:   const PetscInt  bs =A->rmap->bs,bs2=a->bs2;
278:   const MatScalar *aa=a->a,*v;
279:   PetscScalar     s1,s2,s3,x1,x2,x3,*x;

282:   VecCopy(bb,xx);
283:   VecGetArray(xx,&x);

285:   /* forward solve the U^T */
286:   idx = 0;
287:   for (i=0; i<n; i++) {
288:     v = aa + bs2*diag[i];
289:     /* multiply by the inverse of the block diagonal */
290:     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];
291:     s1 = v[0]*x1  +  v[1]*x2  + v[2]*x3;
292:     s2 = v[3]*x1  +  v[4]*x2  + v[5]*x3;
293:     s3 = v[6]*x1  +  v[7]*x2  + v[8]*x3;
294:     v -= bs2;

296:     vi = aj + diag[i] - 1;
297:     nz = diag[i] - diag[i+1] - 1;
298:     for (j=0; j>-nz; j--) {
299:       oidx       = bs*vi[j];
300:       x[oidx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
301:       x[oidx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
302:       x[oidx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
303:       v         -= bs2;
304:     }
305:     x[idx] = s1;x[1+idx] = s2;  x[2+idx] = s3;
306:     idx   += bs;
307:   }
308:   /* backward solve the L^T */
309:   for (i=n-1; i>=0; i--) {
310:     v   = aa + bs2*ai[i];
311:     vi  = aj + ai[i];
312:     nz  = ai[i+1] - ai[i];
313:     idt = bs*i;
314:     s1  = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];
315:     for (j=0; j<nz; j++) {
316:       idx       = bs*vi[j];
317:       x[idx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
318:       x[idx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
319:       x[idx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
320:       v        += bs2;
321:     }
322:   }
323:   VecRestoreArray(xx,&x);
324:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
325:   return(0);
326: }

330: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
331: {
332:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
333:   PetscErrorCode  ierr;
334:   const PetscInt  *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
335:   PetscInt        i,nz,idx,idt,oidx;
336:   const MatScalar *aa=a->a,*v;
337:   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4,*x;

340:   VecCopy(bb,xx);
341:   VecGetArray(xx,&x);

343:   /* forward solve the U^T */
344:   idx = 0;
345:   for (i=0; i<n; i++) {

347:     v = aa + 16*diag[i];
348:     /* multiply by the inverse of the block diagonal */
349:     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
350:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
351:     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
352:     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
353:     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
354:     v += 16;

356:     vi = aj + diag[i] + 1;
357:     nz = ai[i+1] - diag[i] - 1;
358:     while (nz--) {
359:       oidx       = 4*(*vi++);
360:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
361:       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
362:       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
363:       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
364:       v         += 16;
365:     }
366:     x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
367:     idx   += 4;
368:   }
369:   /* backward solve the L^T */
370:   for (i=n-1; i>=0; i--) {
371:     v   = aa + 16*diag[i] - 16;
372:     vi  = aj + diag[i] - 1;
373:     nz  = diag[i] - ai[i];
374:     idt = 4*i;
375:     s1  = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
376:     while (nz--) {
377:       idx       = 4*(*vi--);
378:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
379:       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
380:       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
381:       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
382:       v        -= 16;
383:     }
384:   }
385:   VecRestoreArray(xx,&x);
386:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
387:   return(0);
388: }

392: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
393: {
394:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
395:   PetscErrorCode  ierr;
396:   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
397:   PetscInt        nz,idx,idt,j,i,oidx;
398:   const PetscInt  bs =A->rmap->bs,bs2=a->bs2;
399:   const MatScalar *aa=a->a,*v;
400:   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4,*x;

403:   VecCopy(bb,xx);
404:   VecGetArray(xx,&x);

406:   /* forward solve the U^T */
407:   idx = 0;
408:   for (i=0; i<n; i++) {
409:     v = aa + bs2*diag[i];
410:     /* multiply by the inverse of the block diagonal */
411:     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
412:     s1 =  v[0]*x1  +  v[1]*x2  + v[2]*x3  + v[3]*x4;
413:     s2 =  v[4]*x1  +  v[5]*x2  + v[6]*x3  + v[7]*x4;
414:     s3 =  v[8]*x1  +  v[9]*x2  + v[10]*x3 + v[11]*x4;
415:     s4 =  v[12]*x1 +  v[13]*x2 + v[14]*x3 + v[15]*x4;
416:     v -= bs2;

418:     vi = aj + diag[i] - 1;
419:     nz = diag[i] - diag[i+1] - 1;
420:     for (j=0; j>-nz; j--) {
421:       oidx       = bs*vi[j];
422:       x[oidx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
423:       x[oidx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
424:       x[oidx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
425:       x[oidx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
426:       v         -= bs2;
427:     }
428:     x[idx] = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4;
429:     idx   += bs;
430:   }
431:   /* backward solve the L^T */
432:   for (i=n-1; i>=0; i--) {
433:     v   = aa + bs2*ai[i];
434:     vi  = aj + ai[i];
435:     nz  = ai[i+1] - ai[i];
436:     idt = bs*i;
437:     s1  = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];
438:     for (j=0; j<nz; j++) {
439:       idx       = bs*vi[j];
440:       x[idx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
441:       x[idx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
442:       x[idx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
443:       x[idx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
444:       v        += bs2;
445:     }
446:   }
447:   VecRestoreArray(xx,&x);
448:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
449:   return(0);
450: }

454: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
455: {
456:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
457:   PetscErrorCode  ierr;
458:   const PetscInt  *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
459:   PetscInt        i,nz,idx,idt,oidx;
460:   const MatScalar *aa=a->a,*v;
461:   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;

464:   VecCopy(bb,xx);
465:   VecGetArray(xx,&x);

467:   /* forward solve the U^T */
468:   idx = 0;
469:   for (i=0; i<n; i++) {

471:     v = aa + 25*diag[i];
472:     /* multiply by the inverse of the block diagonal */
473:     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
474:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
475:     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
476:     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
477:     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
478:     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
479:     v += 25;

481:     vi = aj + diag[i] + 1;
482:     nz = ai[i+1] - diag[i] - 1;
483:     while (nz--) {
484:       oidx       = 5*(*vi++);
485:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
486:       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
487:       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
488:       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
489:       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
490:       v         += 25;
491:     }
492:     x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
493:     idx   += 5;
494:   }
495:   /* backward solve the L^T */
496:   for (i=n-1; i>=0; i--) {
497:     v   = aa + 25*diag[i] - 25;
498:     vi  = aj + diag[i] - 1;
499:     nz  = diag[i] - ai[i];
500:     idt = 5*i;
501:     s1  = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
502:     while (nz--) {
503:       idx       = 5*(*vi--);
504:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
505:       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
506:       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
507:       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
508:       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
509:       v        -= 25;
510:     }
511:   }
512:   VecRestoreArray(xx,&x);
513:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
514:   return(0);
515: }

519: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
520: {
521:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
522:   PetscErrorCode  ierr;
523:   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
524:   PetscInt        nz,idx,idt,j,i,oidx;
525:   const PetscInt  bs =A->rmap->bs,bs2=a->bs2;
526:   const MatScalar *aa=a->a,*v;
527:   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;

530:   VecCopy(bb,xx);
531:   VecGetArray(xx,&x);

533:   /* forward solve the U^T */
534:   idx = 0;
535:   for (i=0; i<n; i++) {
536:     v = aa + bs2*diag[i];
537:     /* multiply by the inverse of the block diagonal */
538:     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
539:     x5 = x[4+idx];
540:     s1 =  v[0]*x1   +  v[1]*x2   + v[2]*x3   + v[3]*x4   + v[4]*x5;
541:     s2 =  v[5]*x1   +  v[6]*x2   + v[7]*x3   + v[8]*x4   + v[9]*x5;
542:     s3 =  v[10]*x1  +  v[11]*x2  + v[12]*x3  + v[13]*x4  + v[14]*x5;
543:     s4 =  v[15]*x1  +  v[16]*x2  + v[17]*x3  + v[18]*x4  + v[19]*x5;
544:     s5 =  v[20]*x1  +  v[21]*x2  + v[22]*x3  + v[23]*x4   + v[24]*x5;
545:     v -= bs2;

547:     vi = aj + diag[i] - 1;
548:     nz = diag[i] - diag[i+1] - 1;
549:     for (j=0; j>-nz; j--) {
550:       oidx       = bs*vi[j];
551:       x[oidx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
552:       x[oidx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
553:       x[oidx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
554:       x[oidx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
555:       x[oidx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
556:       v         -= bs2;
557:     }
558:     x[idx] = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
559:     idx   += bs;
560:   }
561:   /* backward solve the L^T */
562:   for (i=n-1; i>=0; i--) {
563:     v   = aa + bs2*ai[i];
564:     vi  = aj + ai[i];
565:     nz  = ai[i+1] - ai[i];
566:     idt = bs*i;
567:     s1  = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
568:     for (j=0; j<nz; j++) {
569:       idx       = bs*vi[j];
570:       x[idx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
571:       x[idx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
572:       x[idx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
573:       x[idx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
574:       x[idx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
575:       v        += bs2;
576:     }
577:   }
578:   VecRestoreArray(xx,&x);
579:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
580:   return(0);
581: }

585: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
586: {
587:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
588:   PetscErrorCode  ierr;
589:   const PetscInt  *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
590:   PetscInt        i,nz,idx,idt,oidx;
591:   const MatScalar *aa=a->a,*v;
592:   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;

595:   VecCopy(bb,xx);
596:   VecGetArray(xx,&x);

598:   /* forward solve the U^T */
599:   idx = 0;
600:   for (i=0; i<n; i++) {

602:     v = aa + 36*diag[i];
603:     /* multiply by the inverse of the block diagonal */
604:     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
605:     x6 = x[5+idx];
606:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
607:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
608:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
609:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
610:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
611:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
612:     v += 36;

614:     vi = aj + diag[i] + 1;
615:     nz = ai[i+1] - diag[i] - 1;
616:     while (nz--) {
617:       oidx       = 6*(*vi++);
618:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
619:       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
620:       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
621:       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
622:       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
623:       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
624:       v         += 36;
625:     }
626:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
627:     x[5+idx] = s6;
628:     idx     += 6;
629:   }
630:   /* backward solve the L^T */
631:   for (i=n-1; i>=0; i--) {
632:     v   = aa + 36*diag[i] - 36;
633:     vi  = aj + diag[i] - 1;
634:     nz  = diag[i] - ai[i];
635:     idt = 6*i;
636:     s1  = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
637:     s6  = x[5+idt];
638:     while (nz--) {
639:       idx       = 6*(*vi--);
640:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
641:       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
642:       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
643:       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
644:       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
645:       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
646:       v        -= 36;
647:     }
648:   }
649:   VecRestoreArray(xx,&x);
650:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
651:   return(0);
652: }

656: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
657: {
658:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
659:   PetscErrorCode  ierr;
660:   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
661:   PetscInt        nz,idx,idt,j,i,oidx;
662:   const PetscInt  bs =A->rmap->bs,bs2=a->bs2;
663:   const MatScalar *aa=a->a,*v;
664:   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;

667:   VecCopy(bb,xx);
668:   VecGetArray(xx,&x);

670:   /* forward solve the U^T */
671:   idx = 0;
672:   for (i=0; i<n; i++) {
673:     v = aa + bs2*diag[i];
674:     /* multiply by the inverse of the block diagonal */
675:     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
676:     x5 = x[4+idx]; x6 = x[5+idx];
677:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
678:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
679:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
680:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
681:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
682:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
683:     v -= bs2;

685:     vi = aj + diag[i] - 1;
686:     nz = diag[i] - diag[i+1] - 1;
687:     for (j=0; j>-nz; j--) {
688:       oidx       = bs*vi[j];
689:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
690:       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
691:       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
692:       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
693:       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
694:       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
695:       v         -= bs2;
696:     }
697:     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
698:     x[5+idx] = s6;
699:     idx     += bs;
700:   }
701:   /* backward solve the L^T */
702:   for (i=n-1; i>=0; i--) {
703:     v   = aa + bs2*ai[i];
704:     vi  = aj + ai[i];
705:     nz  = ai[i+1] - ai[i];
706:     idt = bs*i;
707:     s1  = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
708:     s6  = x[5+idt];
709:     for (j=0; j<nz; j++) {
710:       idx       = bs*vi[j];
711:       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
712:       x[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
713:       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
714:       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
715:       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
716:       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
717:       v        += bs2;
718:     }
719:   }
720:   VecRestoreArray(xx,&x);
721:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
722:   return(0);
723: }

727: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
728: {
729:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
730:   PetscErrorCode  ierr;
731:   const PetscInt  *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
732:   PetscInt        i,nz,idx,idt,oidx;
733:   const MatScalar *aa=a->a,*v;
734:   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;

737:   VecCopy(bb,xx);
738:   VecGetArray(xx,&x);

740:   /* forward solve the U^T */
741:   idx = 0;
742:   for (i=0; i<n; i++) {

744:     v = aa + 49*diag[i];
745:     /* multiply by the inverse of the block diagonal */
746:     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
747:     x6 = x[5+idx]; x7 = x[6+idx];
748:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
749:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
750:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
751:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
752:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
753:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
754:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
755:     v += 49;

757:     vi = aj + diag[i] + 1;
758:     nz = ai[i+1] - diag[i] - 1;
759:     while (nz--) {
760:       oidx       = 7*(*vi++);
761:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
762:       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
763:       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
764:       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
765:       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
766:       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
767:       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
768:       v         += 49;
769:     }
770:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
771:     x[5+idx] = s6;x[6+idx] = s7;
772:     idx     += 7;
773:   }
774:   /* backward solve the L^T */
775:   for (i=n-1; i>=0; i--) {
776:     v   = aa + 49*diag[i] - 49;
777:     vi  = aj + diag[i] - 1;
778:     nz  = diag[i] - ai[i];
779:     idt = 7*i;
780:     s1  = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
781:     s6  = x[5+idt];s7 = x[6+idt];
782:     while (nz--) {
783:       idx       = 7*(*vi--);
784:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
785:       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
786:       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
787:       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
788:       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
789:       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
790:       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
791:       v        -= 49;
792:     }
793:   }
794:   VecRestoreArray(xx,&x);
795:   PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
796:   return(0);
797: }
800: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
801: {
802:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
803:   PetscErrorCode  ierr;
804:   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
805:   PetscInt        nz,idx,idt,j,i,oidx;
806:   const PetscInt  bs =A->rmap->bs,bs2=a->bs2;
807:   const MatScalar *aa=a->a,*v;
808:   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;

811:   VecCopy(bb,xx);
812:   VecGetArray(xx,&x);

814:   /* forward solve the U^T */
815:   idx = 0;
816:   for (i=0; i<n; i++) {
817:     v = aa + bs2*diag[i];
818:     /* multiply by the inverse of the block diagonal */
819:     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
820:     x5 = x[4+idx]; x6 = x[5+idx];  x7 = x[6+idx];
821:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
822:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
823:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
824:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
825:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
826:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
827:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
828:     v -= bs2;
829:     vi = aj + diag[i] - 1;
830:     nz = diag[i] - diag[i+1] - 1;
831:     for (j=0; j>-nz; j--) {
832:       oidx       = bs*vi[j];
833:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
834:       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
835:       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
836:       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
837:       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
838:       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
839:       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
840:       v         -= bs2;
841:     }
842:     x[idx]   = s1;  x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
843:     x[5+idx] = s6;  x[6+idx] = s7;
844:     idx     += bs;
845:   }
846:   /* backward solve the L^T */
847:   for (i=n-1; i>=0; i--) {
848:     v   = aa + bs2*ai[i];
849:     vi  = aj + ai[i];
850:     nz  = ai[i+1] - ai[i];
851:     idt = bs*i;
852:     s1  = x[idt];    s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
853:     s6  = x[5+idt];  s7 = x[6+idt];
854:     for (j=0; j<nz; j++) {
855:       idx       = bs*vi[j];
856:       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
857:       x[idx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
858:       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
859:       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
860:       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
861:       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
862:       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
863:       v        += bs2;
864:     }
865:   }
866:   VecRestoreArray(xx,&x);
867:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
868:   return(0);
869: }