Actual source code: baijsolvtrannat.c

petsc-3.9.4 2018-09-11
Report Typos and Errors
  1:  #include <../src/mat/impls/baij/seq/baij.h>

  3: PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
  4: {
  5:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
  6:   PetscErrorCode    ierr;
  7:   const PetscInt    *adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
  8:   PetscInt          i,n = a->mbs,j;
  9:   PetscInt          nz;
 10:   PetscScalar       *x,*tmp,s1;
 11:   const MatScalar   *aa = a->a,*v;
 12:   const PetscScalar *b;

 15:   VecGetArrayRead(bb,&b);
 16:   VecGetArray(xx,&x);
 17:   tmp  = a->solve_work;


 20:   /* copy the b into temp work space according to permutation */
 21:   for (i=0; i<n; i++) tmp[i] = b[i];

 23:   /* forward solve the U^T */
 24:   for (i=0; i<n; i++) {
 25:     v   = aa + adiag[i+1] + 1;
 26:     vi  = aj + adiag[i+1] + 1;
 27:     nz  = adiag[i] - adiag[i+1] - 1;
 28:     s1  = tmp[i];
 29:     s1 *= v[nz];  /* multiply by inverse of diagonal entry */
 30:     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
 31:     tmp[i] = s1;
 32:   }

 34:   /* backward solve the L^T */
 35:   for (i=n-1; i>=0; i--) {
 36:     v  = aa + ai[i];
 37:     vi = aj + ai[i];
 38:     nz = ai[i+1] - ai[i];
 39:     s1 = tmp[i];
 40:     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
 41:   }

 43:   /* copy tmp into x according to permutation */
 44:   for (i=0; i<n; i++) x[i] = tmp[i];

 46:   VecRestoreArrayRead(bb,&b);
 47:   VecRestoreArray(xx,&x);

 49:   PetscLogFlops(2.0*a->nz-A->cmap->n);
 50:   return(0);
 51: }

 53: PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
 54: {
 55:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
 56:   PetscErrorCode  ierr;
 57:   PetscInt        i,nz;
 58:   const PetscInt  *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
 59:   const MatScalar *aa   =a->a,*v;
 60:   PetscScalar     s1,*x;

 63:   VecCopy(bb,xx);
 64:   VecGetArray(xx,&x);

 66:   /* forward solve the U^T */
 67:   for (i=0; i<n; i++) {

 69:     v = aa + diag[i];
 70:     /* multiply by the inverse of the block diagonal */
 71:     s1 = (*v++)*x[i];
 72:     vi = aj + diag[i] + 1;
 73:     nz = ai[i+1] - diag[i] - 1;
 74:     while (nz--) {
 75:       x[*vi++] -= (*v++)*s1;
 76:     }
 77:     x[i] = s1;
 78:   }
 79:   /* backward solve the L^T */
 80:   for (i=n-1; i>=0; i--) {
 81:     v  = aa + diag[i] - 1;
 82:     vi = aj + diag[i] - 1;
 83:     nz = diag[i] - ai[i];
 84:     s1 = x[i];
 85:     while (nz--) {
 86:       x[*vi--] -=  (*v--)*s1;
 87:     }
 88:   }
 89:   VecRestoreArray(xx,&x);
 90:   PetscLogFlops(2.0*(a->nz) - A->cmap->n);
 91:   return(0);
 92: }

 94: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
 95: {
 96:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
 97:   PetscErrorCode  ierr;
 98:   PetscInt        i,nz,idx,idt,oidx;
 99:   const PetscInt  *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
100:   const MatScalar *aa   =a->a,*v;
101:   PetscScalar     s1,s2,x1,x2,*x;

104:   VecCopy(bb,xx);
105:   VecGetArray(xx,&x);

107:   /* forward solve the U^T */
108:   idx = 0;
109:   for (i=0; i<n; i++) {

111:     v = aa + 4*diag[i];
112:     /* multiply by the inverse of the block diagonal */
113:     x1 = x[idx];   x2 = x[1+idx];
114:     s1 = v[0]*x1  +  v[1]*x2;
115:     s2 = v[2]*x1  +  v[3]*x2;
116:     v += 4;

118:     vi = aj + diag[i] + 1;
119:     nz = ai[i+1] - diag[i] - 1;
120:     while (nz--) {
121:       oidx       = 2*(*vi++);
122:       x[oidx]   -= v[0]*s1  +  v[1]*s2;
123:       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
124:       v         += 4;
125:     }
126:     x[idx] = s1;x[1+idx] = s2;
127:     idx   += 2;
128:   }
129:   /* backward solve the L^T */
130:   for (i=n-1; i>=0; i--) {
131:     v   = aa + 4*diag[i] - 4;
132:     vi  = aj + diag[i] - 1;
133:     nz  = diag[i] - ai[i];
134:     idt = 2*i;
135:     s1  = x[idt];  s2 = x[1+idt];
136:     while (nz--) {
137:       idx       = 2*(*vi--);
138:       x[idx]   -=  v[0]*s1 +  v[1]*s2;
139:       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
140:       v        -= 4;
141:     }
142:   }
143:   VecRestoreArray(xx,&x);
144:   PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);
145:   return(0);
146: }

148: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
149: {
150:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
151:   PetscErrorCode  ierr;
152:   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
153:   PetscInt        nz,idx,idt,j,i,oidx;
154:   const PetscInt  bs =A->rmap->bs,bs2=a->bs2;
155:   const MatScalar *aa=a->a,*v;
156:   PetscScalar     s1,s2,x1,x2,*x;

159:   VecCopy(bb,xx);
160:   VecGetArray(xx,&x);

162:   /* forward solve the U^T */
163:   idx = 0;
164:   for (i=0; i<n; i++) {
165:     v = aa + bs2*diag[i];
166:     /* multiply by the inverse of the block diagonal */
167:     x1 = x[idx];   x2 = x[1+idx];
168:     s1 = v[0]*x1  +  v[1]*x2;
169:     s2 = v[2]*x1  +  v[3]*x2;
170:     v -= bs2;

172:     vi = aj + diag[i] - 1;
173:     nz = diag[i] - diag[i+1] - 1;
174:     for (j=0; j>-nz; j--) {
175:       oidx       = bs*vi[j];
176:       x[oidx]   -= v[0]*s1  +  v[1]*s2;
177:       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
178:       v         -= bs2;
179:     }
180:     x[idx] = s1;x[1+idx] = s2;
181:     idx   += bs;
182:   }
183:   /* backward solve the L^T */
184:   for (i=n-1; i>=0; i--) {
185:     v   = aa + bs2*ai[i];
186:     vi  = aj + ai[i];
187:     nz  = ai[i+1] - ai[i];
188:     idt = bs*i;
189:     s1  = x[idt];  s2 = x[1+idt];
190:     for (j=0; j<nz; j++) {
191:       idx       = bs*vi[j];
192:       x[idx]   -=  v[0]*s1 +  v[1]*s2;
193:       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
194:       v        += bs2;
195:     }
196:   }
197:   VecRestoreArray(xx,&x);
198:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
199:   return(0);
200: }

202: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
203: {
204:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
205:   PetscErrorCode  ierr;
206:   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
207:   PetscInt        i,nz,idx,idt,oidx;
208:   const MatScalar *aa=a->a,*v;
209:   PetscScalar     s1,s2,s3,x1,x2,x3,*x;

212:   VecCopy(bb,xx);
213:   VecGetArray(xx,&x);

215:   /* forward solve the U^T */
216:   idx = 0;
217:   for (i=0; i<n; i++) {

219:     v = aa + 9*diag[i];
220:     /* multiply by the inverse of the block diagonal */
221:     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
222:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
223:     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
224:     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
225:     v += 9;

227:     vi = aj + diag[i] + 1;
228:     nz = ai[i+1] - diag[i] - 1;
229:     while (nz--) {
230:       oidx       = 3*(*vi++);
231:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
232:       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
233:       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
234:       v         += 9;
235:     }
236:     x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;
237:     idx   += 3;
238:   }
239:   /* backward solve the L^T */
240:   for (i=n-1; i>=0; i--) {
241:     v   = aa + 9*diag[i] - 9;
242:     vi  = aj + diag[i] - 1;
243:     nz  = diag[i] - ai[i];
244:     idt = 3*i;
245:     s1  = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
246:     while (nz--) {
247:       idx       = 3*(*vi--);
248:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
249:       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
250:       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
251:       v        -= 9;
252:     }
253:   }
254:   VecRestoreArray(xx,&x);
255:   PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);
256:   return(0);
257: }

259: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
260: {
261:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
262:   PetscErrorCode  ierr;
263:   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
264:   PetscInt        nz,idx,idt,j,i,oidx;
265:   const PetscInt  bs =A->rmap->bs,bs2=a->bs2;
266:   const MatScalar *aa=a->a,*v;
267:   PetscScalar     s1,s2,s3,x1,x2,x3,*x;

270:   VecCopy(bb,xx);
271:   VecGetArray(xx,&x);

273:   /* forward solve the U^T */
274:   idx = 0;
275:   for (i=0; i<n; i++) {
276:     v = aa + bs2*diag[i];
277:     /* multiply by the inverse of the block diagonal */
278:     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];
279:     s1 = v[0]*x1  +  v[1]*x2  + v[2]*x3;
280:     s2 = v[3]*x1  +  v[4]*x2  + v[5]*x3;
281:     s3 = v[6]*x1  +  v[7]*x2  + v[8]*x3;
282:     v -= bs2;

284:     vi = aj + diag[i] - 1;
285:     nz = diag[i] - diag[i+1] - 1;
286:     for (j=0; j>-nz; j--) {
287:       oidx       = bs*vi[j];
288:       x[oidx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
289:       x[oidx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
290:       x[oidx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
291:       v         -= bs2;
292:     }
293:     x[idx] = s1;x[1+idx] = s2;  x[2+idx] = s3;
294:     idx   += bs;
295:   }
296:   /* backward solve the L^T */
297:   for (i=n-1; i>=0; i--) {
298:     v   = aa + bs2*ai[i];
299:     vi  = aj + ai[i];
300:     nz  = ai[i+1] - ai[i];
301:     idt = bs*i;
302:     s1  = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];
303:     for (j=0; j<nz; j++) {
304:       idx       = bs*vi[j];
305:       x[idx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
306:       x[idx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
307:       x[idx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
308:       v        += bs2;
309:     }
310:   }
311:   VecRestoreArray(xx,&x);
312:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
313:   return(0);
314: }

316: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
317: {
318:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
319:   PetscErrorCode  ierr;
320:   const PetscInt  *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
321:   PetscInt        i,nz,idx,idt,oidx;
322:   const MatScalar *aa=a->a,*v;
323:   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4,*x;

326:   VecCopy(bb,xx);
327:   VecGetArray(xx,&x);

329:   /* forward solve the U^T */
330:   idx = 0;
331:   for (i=0; i<n; i++) {

333:     v = aa + 16*diag[i];
334:     /* multiply by the inverse of the block diagonal */
335:     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
336:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
337:     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
338:     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
339:     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
340:     v += 16;

342:     vi = aj + diag[i] + 1;
343:     nz = ai[i+1] - diag[i] - 1;
344:     while (nz--) {
345:       oidx       = 4*(*vi++);
346:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
347:       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
348:       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
349:       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
350:       v         += 16;
351:     }
352:     x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
353:     idx   += 4;
354:   }
355:   /* backward solve the L^T */
356:   for (i=n-1; i>=0; i--) {
357:     v   = aa + 16*diag[i] - 16;
358:     vi  = aj + diag[i] - 1;
359:     nz  = diag[i] - ai[i];
360:     idt = 4*i;
361:     s1  = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
362:     while (nz--) {
363:       idx       = 4*(*vi--);
364:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
365:       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
366:       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
367:       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
368:       v        -= 16;
369:     }
370:   }
371:   VecRestoreArray(xx,&x);
372:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
373:   return(0);
374: }

376: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
377: {
378:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
379:   PetscErrorCode  ierr;
380:   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
381:   PetscInt        nz,idx,idt,j,i,oidx;
382:   const PetscInt  bs =A->rmap->bs,bs2=a->bs2;
383:   const MatScalar *aa=a->a,*v;
384:   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4,*x;

387:   VecCopy(bb,xx);
388:   VecGetArray(xx,&x);

390:   /* forward solve the U^T */
391:   idx = 0;
392:   for (i=0; i<n; i++) {
393:     v = aa + bs2*diag[i];
394:     /* multiply by the inverse of the block diagonal */
395:     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
396:     s1 =  v[0]*x1  +  v[1]*x2  + v[2]*x3  + v[3]*x4;
397:     s2 =  v[4]*x1  +  v[5]*x2  + v[6]*x3  + v[7]*x4;
398:     s3 =  v[8]*x1  +  v[9]*x2  + v[10]*x3 + v[11]*x4;
399:     s4 =  v[12]*x1 +  v[13]*x2 + v[14]*x3 + v[15]*x4;
400:     v -= bs2;

402:     vi = aj + diag[i] - 1;
403:     nz = diag[i] - diag[i+1] - 1;
404:     for (j=0; j>-nz; j--) {
405:       oidx       = bs*vi[j];
406:       x[oidx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
407:       x[oidx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
408:       x[oidx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
409:       x[oidx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
410:       v         -= bs2;
411:     }
412:     x[idx] = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4;
413:     idx   += bs;
414:   }
415:   /* backward solve the L^T */
416:   for (i=n-1; i>=0; i--) {
417:     v   = aa + bs2*ai[i];
418:     vi  = aj + ai[i];
419:     nz  = ai[i+1] - ai[i];
420:     idt = bs*i;
421:     s1  = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];
422:     for (j=0; j<nz; j++) {
423:       idx       = bs*vi[j];
424:       x[idx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
425:       x[idx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
426:       x[idx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
427:       x[idx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
428:       v        += bs2;
429:     }
430:   }
431:   VecRestoreArray(xx,&x);
432:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
433:   return(0);
434: }

436: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
437: {
438:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
439:   PetscErrorCode  ierr;
440:   const PetscInt  *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
441:   PetscInt        i,nz,idx,idt,oidx;
442:   const MatScalar *aa=a->a,*v;
443:   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;

446:   VecCopy(bb,xx);
447:   VecGetArray(xx,&x);

449:   /* forward solve the U^T */
450:   idx = 0;
451:   for (i=0; i<n; i++) {

453:     v = aa + 25*diag[i];
454:     /* multiply by the inverse of the block diagonal */
455:     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
456:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
457:     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
458:     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
459:     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
460:     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
461:     v += 25;

463:     vi = aj + diag[i] + 1;
464:     nz = ai[i+1] - diag[i] - 1;
465:     while (nz--) {
466:       oidx       = 5*(*vi++);
467:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
468:       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
469:       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
470:       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
471:       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
472:       v         += 25;
473:     }
474:     x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
475:     idx   += 5;
476:   }
477:   /* backward solve the L^T */
478:   for (i=n-1; i>=0; i--) {
479:     v   = aa + 25*diag[i] - 25;
480:     vi  = aj + diag[i] - 1;
481:     nz  = diag[i] - ai[i];
482:     idt = 5*i;
483:     s1  = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
484:     while (nz--) {
485:       idx       = 5*(*vi--);
486:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
487:       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
488:       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
489:       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
490:       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
491:       v        -= 25;
492:     }
493:   }
494:   VecRestoreArray(xx,&x);
495:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
496:   return(0);
497: }

499: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
500: {
501:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
502:   PetscErrorCode  ierr;
503:   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
504:   PetscInt        nz,idx,idt,j,i,oidx;
505:   const PetscInt  bs =A->rmap->bs,bs2=a->bs2;
506:   const MatScalar *aa=a->a,*v;
507:   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;

510:   VecCopy(bb,xx);
511:   VecGetArray(xx,&x);

513:   /* forward solve the U^T */
514:   idx = 0;
515:   for (i=0; i<n; i++) {
516:     v = aa + bs2*diag[i];
517:     /* multiply by the inverse of the block diagonal */
518:     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
519:     x5 = x[4+idx];
520:     s1 =  v[0]*x1   +  v[1]*x2   + v[2]*x3   + v[3]*x4   + v[4]*x5;
521:     s2 =  v[5]*x1   +  v[6]*x2   + v[7]*x3   + v[8]*x4   + v[9]*x5;
522:     s3 =  v[10]*x1  +  v[11]*x2  + v[12]*x3  + v[13]*x4  + v[14]*x5;
523:     s4 =  v[15]*x1  +  v[16]*x2  + v[17]*x3  + v[18]*x4  + v[19]*x5;
524:     s5 =  v[20]*x1  +  v[21]*x2  + v[22]*x3  + v[23]*x4   + v[24]*x5;
525:     v -= bs2;

527:     vi = aj + diag[i] - 1;
528:     nz = diag[i] - diag[i+1] - 1;
529:     for (j=0; j>-nz; j--) {
530:       oidx       = bs*vi[j];
531:       x[oidx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
532:       x[oidx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
533:       x[oidx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
534:       x[oidx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
535:       x[oidx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
536:       v         -= bs2;
537:     }
538:     x[idx] = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
539:     idx   += bs;
540:   }
541:   /* backward solve the L^T */
542:   for (i=n-1; i>=0; i--) {
543:     v   = aa + bs2*ai[i];
544:     vi  = aj + ai[i];
545:     nz  = ai[i+1] - ai[i];
546:     idt = bs*i;
547:     s1  = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
548:     for (j=0; j<nz; j++) {
549:       idx       = bs*vi[j];
550:       x[idx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
551:       x[idx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
552:       x[idx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
553:       x[idx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
554:       x[idx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
555:       v        += bs2;
556:     }
557:   }
558:   VecRestoreArray(xx,&x);
559:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
560:   return(0);
561: }

563: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
564: {
565:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
566:   PetscErrorCode  ierr;
567:   const PetscInt  *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
568:   PetscInt        i,nz,idx,idt,oidx;
569:   const MatScalar *aa=a->a,*v;
570:   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;

573:   VecCopy(bb,xx);
574:   VecGetArray(xx,&x);

576:   /* forward solve the U^T */
577:   idx = 0;
578:   for (i=0; i<n; i++) {

580:     v = aa + 36*diag[i];
581:     /* multiply by the inverse of the block diagonal */
582:     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
583:     x6 = x[5+idx];
584:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
585:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
586:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
587:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
588:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
589:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
590:     v += 36;

592:     vi = aj + diag[i] + 1;
593:     nz = ai[i+1] - diag[i] - 1;
594:     while (nz--) {
595:       oidx       = 6*(*vi++);
596:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
597:       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
598:       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
599:       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
600:       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
601:       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
602:       v         += 36;
603:     }
604:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
605:     x[5+idx] = s6;
606:     idx     += 6;
607:   }
608:   /* backward solve the L^T */
609:   for (i=n-1; i>=0; i--) {
610:     v   = aa + 36*diag[i] - 36;
611:     vi  = aj + diag[i] - 1;
612:     nz  = diag[i] - ai[i];
613:     idt = 6*i;
614:     s1  = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
615:     s6  = x[5+idt];
616:     while (nz--) {
617:       idx       = 6*(*vi--);
618:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
619:       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
620:       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
621:       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
622:       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
623:       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
624:       v        -= 36;
625:     }
626:   }
627:   VecRestoreArray(xx,&x);
628:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
629:   return(0);
630: }

632: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
633: {
634:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
635:   PetscErrorCode  ierr;
636:   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
637:   PetscInt        nz,idx,idt,j,i,oidx;
638:   const PetscInt  bs =A->rmap->bs,bs2=a->bs2;
639:   const MatScalar *aa=a->a,*v;
640:   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;

643:   VecCopy(bb,xx);
644:   VecGetArray(xx,&x);

646:   /* forward solve the U^T */
647:   idx = 0;
648:   for (i=0; i<n; i++) {
649:     v = aa + bs2*diag[i];
650:     /* multiply by the inverse of the block diagonal */
651:     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
652:     x5 = x[4+idx]; x6 = x[5+idx];
653:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
654:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
655:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
656:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
657:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
658:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
659:     v -= bs2;

661:     vi = aj + diag[i] - 1;
662:     nz = diag[i] - diag[i+1] - 1;
663:     for (j=0; j>-nz; j--) {
664:       oidx       = bs*vi[j];
665:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
666:       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
667:       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
668:       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
669:       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
670:       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
671:       v         -= bs2;
672:     }
673:     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
674:     x[5+idx] = s6;
675:     idx     += bs;
676:   }
677:   /* backward solve the L^T */
678:   for (i=n-1; i>=0; i--) {
679:     v   = aa + bs2*ai[i];
680:     vi  = aj + ai[i];
681:     nz  = ai[i+1] - ai[i];
682:     idt = bs*i;
683:     s1  = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
684:     s6  = x[5+idt];
685:     for (j=0; j<nz; j++) {
686:       idx       = bs*vi[j];
687:       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
688:       x[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
689:       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
690:       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
691:       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
692:       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
693:       v        += bs2;
694:     }
695:   }
696:   VecRestoreArray(xx,&x);
697:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
698:   return(0);
699: }

701: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
702: {
703:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
704:   PetscErrorCode  ierr;
705:   const PetscInt  *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
706:   PetscInt        i,nz,idx,idt,oidx;
707:   const MatScalar *aa=a->a,*v;
708:   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;

711:   VecCopy(bb,xx);
712:   VecGetArray(xx,&x);

714:   /* forward solve the U^T */
715:   idx = 0;
716:   for (i=0; i<n; i++) {

718:     v = aa + 49*diag[i];
719:     /* multiply by the inverse of the block diagonal */
720:     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
721:     x6 = x[5+idx]; x7 = x[6+idx];
722:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
723:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
724:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
725:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
726:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
727:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
728:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
729:     v += 49;

731:     vi = aj + diag[i] + 1;
732:     nz = ai[i+1] - diag[i] - 1;
733:     while (nz--) {
734:       oidx       = 7*(*vi++);
735:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
736:       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
737:       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
738:       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
739:       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
740:       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
741:       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
742:       v         += 49;
743:     }
744:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
745:     x[5+idx] = s6;x[6+idx] = s7;
746:     idx     += 7;
747:   }
748:   /* backward solve the L^T */
749:   for (i=n-1; i>=0; i--) {
750:     v   = aa + 49*diag[i] - 49;
751:     vi  = aj + diag[i] - 1;
752:     nz  = diag[i] - ai[i];
753:     idt = 7*i;
754:     s1  = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
755:     s6  = x[5+idt];s7 = x[6+idt];
756:     while (nz--) {
757:       idx       = 7*(*vi--);
758:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
759:       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
760:       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
761:       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
762:       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
763:       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
764:       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
765:       v        -= 49;
766:     }
767:   }
768:   VecRestoreArray(xx,&x);
769:   PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
770:   return(0);
771: }
772: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
773: {
774:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
775:   PetscErrorCode  ierr;
776:   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
777:   PetscInt        nz,idx,idt,j,i,oidx;
778:   const PetscInt  bs =A->rmap->bs,bs2=a->bs2;
779:   const MatScalar *aa=a->a,*v;
780:   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;

783:   VecCopy(bb,xx);
784:   VecGetArray(xx,&x);

786:   /* forward solve the U^T */
787:   idx = 0;
788:   for (i=0; i<n; i++) {
789:     v = aa + bs2*diag[i];
790:     /* multiply by the inverse of the block diagonal */
791:     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
792:     x5 = x[4+idx]; x6 = x[5+idx];  x7 = x[6+idx];
793:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
794:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
795:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
796:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
797:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
798:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
799:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
800:     v -= bs2;
801:     vi = aj + diag[i] - 1;
802:     nz = diag[i] - diag[i+1] - 1;
803:     for (j=0; j>-nz; j--) {
804:       oidx       = bs*vi[j];
805:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
806:       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
807:       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
808:       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
809:       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
810:       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
811:       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
812:       v         -= bs2;
813:     }
814:     x[idx]   = s1;  x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
815:     x[5+idx] = s6;  x[6+idx] = s7;
816:     idx     += bs;
817:   }
818:   /* backward solve the L^T */
819:   for (i=n-1; i>=0; i--) {
820:     v   = aa + bs2*ai[i];
821:     vi  = aj + ai[i];
822:     nz  = ai[i+1] - ai[i];
823:     idt = bs*i;
824:     s1  = x[idt];    s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
825:     s6  = x[5+idt];  s7 = x[6+idt];
826:     for (j=0; j<nz; j++) {
827:       idx       = bs*vi[j];
828:       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
829:       x[idx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
830:       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
831:       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
832:       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
833:       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
834:       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
835:       v        += bs2;
836:     }
837:   }
838:   VecRestoreArray(xx,&x);
839:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
840:   return(0);
841: }