Actual source code: baijsolvtran.c

petsc-3.6.1 2015-08-06
Report Typos and Errors
  1: #include <../src/mat/impls/baij/seq/baij.h>
  2: #include <petsc/private/kernels/blockinvert.h>

  6: PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
  7: {
  8:   Mat_SeqBAIJ       *a    = (Mat_SeqBAIJ*)A->data;
  9:   IS                iscol = a->col,isrow = a->row;
 10:   PetscErrorCode    ierr;
 11:   const PetscInt    *rout,*cout,*r,*c,*adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
 12:   PetscInt          i,n = a->mbs,j;
 13:   PetscInt          nz;
 14:   PetscScalar       *x,*tmp,s1;
 15:   const MatScalar   *aa = a->a,*v;
 16:   const PetscScalar *b;

 19:   VecGetArrayRead(bb,&b);
 20:   VecGetArray(xx,&x);
 21:   tmp  = a->solve_work;

 23:   ISGetIndices(isrow,&rout); r = rout;
 24:   ISGetIndices(iscol,&cout); c = cout;

 26:   /* copy the b into temp work space according to permutation */
 27:   for (i=0; i<n; i++) tmp[i] = b[c[i]];

 29:   /* forward solve the U^T */
 30:   for (i=0; i<n; i++) {
 31:     v   = aa + adiag[i+1] + 1;
 32:     vi  = aj + adiag[i+1] + 1;
 33:     nz  = adiag[i] - adiag[i+1] - 1;
 34:     s1  = tmp[i];
 35:     s1 *= v[nz];  /* multiply by inverse of diagonal entry */
 36:     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
 37:     tmp[i] = s1;
 38:   }

 40:   /* backward solve the L^T */
 41:   for (i=n-1; i>=0; i--) {
 42:     v  = aa + ai[i];
 43:     vi = aj + ai[i];
 44:     nz = ai[i+1] - ai[i];
 45:     s1 = tmp[i];
 46:     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
 47:   }

 49:   /* copy tmp into x according to permutation */
 50:   for (i=0; i<n; i++) x[r[i]] = tmp[i];

 52:   ISRestoreIndices(isrow,&rout);
 53:   ISRestoreIndices(iscol,&cout);
 54:   VecRestoreArrayRead(bb,&b);
 55:   VecRestoreArray(xx,&x);

 57:   PetscLogFlops(2.0*a->nz-A->cmap->n);
 58:   return(0);
 59: }

 63: PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
 64: {
 65:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
 66:   IS                iscol=a->col,isrow=a->row;
 67:   PetscErrorCode    ierr;
 68:   const PetscInt    *r,*c,*rout,*cout;
 69:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
 70:   PetscInt          i,nz;
 71:   const MatScalar   *aa=a->a,*v;
 72:   PetscScalar       s1,*x,*t;
 73:   const PetscScalar *b;

 76:   VecGetArrayRead(bb,&b);
 77:   VecGetArray(xx,&x);
 78:   t    = a->solve_work;

 80:   ISGetIndices(isrow,&rout); r = rout;
 81:   ISGetIndices(iscol,&cout); c = cout;

 83:   /* copy the b into temp work space according to permutation */
 84:   for (i=0; i<n; i++) t[i] = b[c[i]];

 86:   /* forward solve the U^T */
 87:   for (i=0; i<n; i++) {

 89:     v = aa + diag[i];
 90:     /* multiply by the inverse of the block diagonal */
 91:     s1 = (*v++)*t[i];
 92:     vi = aj + diag[i] + 1;
 93:     nz = ai[i+1] - diag[i] - 1;
 94:     while (nz--) {
 95:       t[*vi++] -= (*v++)*s1;
 96:     }
 97:     t[i] = s1;
 98:   }
 99:   /* backward solve the L^T */
100:   for (i=n-1; i>=0; i--) {
101:     v  = aa + diag[i] - 1;
102:     vi = aj + diag[i] - 1;
103:     nz = diag[i] - ai[i];
104:     s1 = t[i];
105:     while (nz--) {
106:       t[*vi--] -=  (*v--)*s1;
107:     }
108:   }

110:   /* copy t into x according to permutation */
111:   for (i=0; i<n; i++) x[r[i]] = t[i];

113:   ISRestoreIndices(isrow,&rout);
114:   ISRestoreIndices(iscol,&cout);
115:   VecRestoreArrayRead(bb,&b);
116:   VecRestoreArray(xx,&x);
117:   PetscLogFlops(2.0*(a->nz) - A->cmap->n);
118:   return(0);
119: }

123: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
124: {
125:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
126:   IS                iscol=a->col,isrow=a->row;
127:   PetscErrorCode    ierr;
128:   const PetscInt    *r,*c,*rout,*cout;
129:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
130:   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
131:   const MatScalar   *aa=a->a,*v;
132:   PetscScalar       s1,s2,x1,x2,*x,*t;
133:   const PetscScalar *b;

136:   VecGetArrayRead(bb,&b);
137:   VecGetArray(xx,&x);
138:   t    = a->solve_work;

140:   ISGetIndices(isrow,&rout); r = rout;
141:   ISGetIndices(iscol,&cout); c = cout;

143:   /* copy the b into temp work space according to permutation */
144:   ii = 0;
145:   for (i=0; i<n; i++) {
146:     ic      = 2*c[i];
147:     t[ii]   = b[ic];
148:     t[ii+1] = b[ic+1];
149:     ii     += 2;
150:   }

152:   /* forward solve the U^T */
153:   idx = 0;
154:   for (i=0; i<n; i++) {

156:     v = aa + 4*diag[i];
157:     /* multiply by the inverse of the block diagonal */
158:     x1 = t[idx];   x2 = t[1+idx];
159:     s1 = v[0]*x1  +  v[1]*x2;
160:     s2 = v[2]*x1  +  v[3]*x2;
161:     v += 4;

163:     vi = aj + diag[i] + 1;
164:     nz = ai[i+1] - diag[i] - 1;
165:     while (nz--) {
166:       oidx       = 2*(*vi++);
167:       t[oidx]   -= v[0]*s1  +  v[1]*s2;
168:       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
169:       v         += 4;
170:     }
171:     t[idx] = s1;t[1+idx] = s2;
172:     idx   += 2;
173:   }
174:   /* backward solve the L^T */
175:   for (i=n-1; i>=0; i--) {
176:     v   = aa + 4*diag[i] - 4;
177:     vi  = aj + diag[i] - 1;
178:     nz  = diag[i] - ai[i];
179:     idt = 2*i;
180:     s1  = t[idt];  s2 = t[1+idt];
181:     while (nz--) {
182:       idx       = 2*(*vi--);
183:       t[idx]   -=  v[0]*s1 +  v[1]*s2;
184:       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
185:       v        -= 4;
186:     }
187:   }

189:   /* copy t into x according to permutation */
190:   ii = 0;
191:   for (i=0; i<n; i++) {
192:     ir      = 2*r[i];
193:     x[ir]   = t[ii];
194:     x[ir+1] = t[ii+1];
195:     ii     += 2;
196:   }

198:   ISRestoreIndices(isrow,&rout);
199:   ISRestoreIndices(iscol,&cout);
200:   VecRestoreArrayRead(bb,&b);
201:   VecRestoreArray(xx,&x);
202:   PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
203:   return(0);
204: }

208: PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
209: {
210:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
211:   PetscErrorCode    ierr;
212:   IS                iscol=a->col,isrow=a->row;
213:   const PetscInt    n    =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
214:   const PetscInt    *r,*c,*rout,*cout;
215:   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
216:   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
217:   const MatScalar   *aa=a->a,*v;
218:   PetscScalar       s1,s2,x1,x2,*x,*t;
219:   const PetscScalar *b;

222:   VecGetArrayRead(bb,&b);
223:   VecGetArray(xx,&x);
224:   t    = a->solve_work;

226:   ISGetIndices(isrow,&rout); r = rout;
227:   ISGetIndices(iscol,&cout); c = cout;

229:   /* copy b into temp work space according to permutation */
230:   for (i=0; i<n; i++) {
231:     ii    = bs*i; ic = bs*c[i];
232:     t[ii] = b[ic]; t[ii+1] = b[ic+1];
233:   }

235:   /* forward solve the U^T */
236:   idx = 0;
237:   for (i=0; i<n; i++) {
238:     v = aa + bs2*diag[i];
239:     /* multiply by the inverse of the block diagonal */
240:     x1 = t[idx];   x2 = t[1+idx];
241:     s1 = v[0]*x1  +  v[1]*x2;
242:     s2 = v[2]*x1  +  v[3]*x2;
243:     v -= bs2;

245:     vi = aj + diag[i] - 1;
246:     nz = diag[i] - diag[i+1] - 1;
247:     for (j=0; j>-nz; j--) {
248:       oidx       = bs*vi[j];
249:       t[oidx]   -= v[0]*s1  +  v[1]*s2;
250:       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
251:       v         -= bs2;
252:     }
253:     t[idx] = s1;t[1+idx] = s2;
254:     idx   += bs;
255:   }
256:   /* backward solve the L^T */
257:   for (i=n-1; i>=0; i--) {
258:     v   = aa + bs2*ai[i];
259:     vi  = aj + ai[i];
260:     nz  = ai[i+1] - ai[i];
261:     idt = bs*i;
262:     s1  = t[idt];  s2 = t[1+idt];
263:     for (j=0; j<nz; j++) {
264:       idx       = bs*vi[j];
265:       t[idx]   -=  v[0]*s1 +  v[1]*s2;
266:       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
267:       v        += bs2;
268:     }
269:   }

271:   /* copy t into x according to permutation */
272:   for (i=0; i<n; i++) {
273:     ii    = bs*i;  ir = bs*r[i];
274:     x[ir] = t[ii];  x[ir+1] = t[ii+1];
275:   }

277:   ISRestoreIndices(isrow,&rout);
278:   ISRestoreIndices(iscol,&cout);
279:   VecRestoreArrayRead(bb,&b);
280:   VecRestoreArray(xx,&x);
281:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
282:   return(0);
283: }

287: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
288: {
289:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
290:   IS                iscol=a->col,isrow=a->row;
291:   PetscErrorCode    ierr;
292:   const PetscInt    *r,*c,*rout,*cout;
293:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
294:   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
295:   const MatScalar   *aa=a->a,*v;
296:   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
297:   const PetscScalar *b;

300:   VecGetArrayRead(bb,&b);
301:   VecGetArray(xx,&x);
302:   t    = a->solve_work;

304:   ISGetIndices(isrow,&rout); r = rout;
305:   ISGetIndices(iscol,&cout); c = cout;

307:   /* copy the b into temp work space according to permutation */
308:   ii = 0;
309:   for (i=0; i<n; i++) {
310:     ic      = 3*c[i];
311:     t[ii]   = b[ic];
312:     t[ii+1] = b[ic+1];
313:     t[ii+2] = b[ic+2];
314:     ii     += 3;
315:   }

317:   /* forward solve the U^T */
318:   idx = 0;
319:   for (i=0; i<n; i++) {

321:     v = aa + 9*diag[i];
322:     /* multiply by the inverse of the block diagonal */
323:     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
324:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
325:     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
326:     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
327:     v += 9;

329:     vi = aj + diag[i] + 1;
330:     nz = ai[i+1] - diag[i] - 1;
331:     while (nz--) {
332:       oidx       = 3*(*vi++);
333:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
334:       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
335:       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
336:       v         += 9;
337:     }
338:     t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;
339:     idx   += 3;
340:   }
341:   /* backward solve the L^T */
342:   for (i=n-1; i>=0; i--) {
343:     v   = aa + 9*diag[i] - 9;
344:     vi  = aj + diag[i] - 1;
345:     nz  = diag[i] - ai[i];
346:     idt = 3*i;
347:     s1  = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
348:     while (nz--) {
349:       idx       = 3*(*vi--);
350:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
351:       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
352:       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
353:       v        -= 9;
354:     }
355:   }

357:   /* copy t into x according to permutation */
358:   ii = 0;
359:   for (i=0; i<n; i++) {
360:     ir      = 3*r[i];
361:     x[ir]   = t[ii];
362:     x[ir+1] = t[ii+1];
363:     x[ir+2] = t[ii+2];
364:     ii     += 3;
365:   }

367:   ISRestoreIndices(isrow,&rout);
368:   ISRestoreIndices(iscol,&cout);
369:   VecRestoreArrayRead(bb,&b);
370:   VecRestoreArray(xx,&x);
371:   PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
372:   return(0);
373: }

377: PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
378: {
379:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
380:   PetscErrorCode    ierr;
381:   IS                iscol=a->col,isrow=a->row;
382:   const PetscInt    n    =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
383:   const PetscInt    *r,*c,*rout,*cout;
384:   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
385:   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
386:   const MatScalar   *aa=a->a,*v;
387:   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
388:   const PetscScalar *b;

391:   VecGetArrayRead(bb,&b);
392:   VecGetArray(xx,&x);
393:   t    = a->solve_work;

395:   ISGetIndices(isrow,&rout); r = rout;
396:   ISGetIndices(iscol,&cout); c = cout;

398:   /* copy b into temp work space according to permutation */
399:   for (i=0; i<n; i++) {
400:     ii    = bs*i; ic = bs*c[i];
401:     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2];
402:   }

404:   /* forward solve the U^T */
405:   idx = 0;
406:   for (i=0; i<n; i++) {
407:     v = aa + bs2*diag[i];
408:     /* multiply by the inverse of the block diagonal */
409:     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
410:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
411:     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
412:     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
413:     v -= bs2;

415:     vi = aj + diag[i] - 1;
416:     nz = diag[i] - diag[i+1] - 1;
417:     for (j=0; j>-nz; j--) {
418:       oidx       = bs*vi[j];
419:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
420:       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
421:       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
422:       v         -= bs2;
423:     }
424:     t[idx] = s1;t[1+idx] = s2;  t[2+idx] = s3;
425:     idx   += bs;
426:   }
427:   /* backward solve the L^T */
428:   for (i=n-1; i>=0; i--) {
429:     v   = aa + bs2*ai[i];
430:     vi  = aj + ai[i];
431:     nz  = ai[i+1] - ai[i];
432:     idt = bs*i;
433:     s1  = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];
434:     for (j=0; j<nz; j++) {
435:       idx       = bs*vi[j];
436:       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
437:       t[idx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
438:       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
439:       v        += bs2;
440:     }
441:   }

443:   /* copy t into x according to permutation */
444:   for (i=0; i<n; i++) {
445:     ii    = bs*i;  ir = bs*r[i];
446:     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];
447:   }

449:   ISRestoreIndices(isrow,&rout);
450:   ISRestoreIndices(iscol,&cout);
451:   VecRestoreArrayRead(bb,&b);
452:   VecRestoreArray(xx,&x);
453:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
454:   return(0);
455: }

459: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
460: {
461:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
462:   IS                iscol=a->col,isrow=a->row;
463:   PetscErrorCode    ierr;
464:   const PetscInt    *r,*c,*rout,*cout;
465:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
466:   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
467:   const MatScalar   *aa=a->a,*v;
468:   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
469:   const PetscScalar *b;

472:   VecGetArrayRead(bb,&b);
473:   VecGetArray(xx,&x);
474:   t    = a->solve_work;

476:   ISGetIndices(isrow,&rout); r = rout;
477:   ISGetIndices(iscol,&cout); c = cout;

479:   /* copy the b into temp work space according to permutation */
480:   ii = 0;
481:   for (i=0; i<n; i++) {
482:     ic      = 4*c[i];
483:     t[ii]   = b[ic];
484:     t[ii+1] = b[ic+1];
485:     t[ii+2] = b[ic+2];
486:     t[ii+3] = b[ic+3];
487:     ii     += 4;
488:   }

490:   /* forward solve the U^T */
491:   idx = 0;
492:   for (i=0; i<n; i++) {

494:     v = aa + 16*diag[i];
495:     /* multiply by the inverse of the block diagonal */
496:     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
497:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
498:     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
499:     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
500:     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
501:     v += 16;

503:     vi = aj + diag[i] + 1;
504:     nz = ai[i+1] - diag[i] - 1;
505:     while (nz--) {
506:       oidx       = 4*(*vi++);
507:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
508:       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
509:       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
510:       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
511:       v         += 16;
512:     }
513:     t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
514:     idx   += 4;
515:   }
516:   /* backward solve the L^T */
517:   for (i=n-1; i>=0; i--) {
518:     v   = aa + 16*diag[i] - 16;
519:     vi  = aj + diag[i] - 1;
520:     nz  = diag[i] - ai[i];
521:     idt = 4*i;
522:     s1  = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
523:     while (nz--) {
524:       idx       = 4*(*vi--);
525:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
526:       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
527:       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
528:       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
529:       v        -= 16;
530:     }
531:   }

533:   /* copy t into x according to permutation */
534:   ii = 0;
535:   for (i=0; i<n; i++) {
536:     ir      = 4*r[i];
537:     x[ir]   = t[ii];
538:     x[ir+1] = t[ii+1];
539:     x[ir+2] = t[ii+2];
540:     x[ir+3] = t[ii+3];
541:     ii     += 4;
542:   }

544:   ISRestoreIndices(isrow,&rout);
545:   ISRestoreIndices(iscol,&cout);
546:   VecRestoreArrayRead(bb,&b);
547:   VecRestoreArray(xx,&x);
548:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
549:   return(0);
550: }

554: PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
555: {
556:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
557:   PetscErrorCode    ierr;
558:   IS                iscol=a->col,isrow=a->row;
559:   const PetscInt    n    =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
560:   const PetscInt    *r,*c,*rout,*cout;
561:   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
562:   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
563:   const MatScalar   *aa=a->a,*v;
564:   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
565:   const PetscScalar *b;

568:   VecGetArrayRead(bb,&b);
569:   VecGetArray(xx,&x);
570:   t    = a->solve_work;

572:   ISGetIndices(isrow,&rout); r = rout;
573:   ISGetIndices(iscol,&cout); c = cout;

575:   /* copy b into temp work space according to permutation */
576:   for (i=0; i<n; i++) {
577:     ii    = bs*i; ic = bs*c[i];
578:     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
579:   }

581:   /* forward solve the U^T */
582:   idx = 0;
583:   for (i=0; i<n; i++) {
584:     v = aa + bs2*diag[i];
585:     /* multiply by the inverse of the block diagonal */
586:     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];  x4 = t[3+idx];
587:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
588:     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
589:     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
590:     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
591:     v -= bs2;

593:     vi = aj + diag[i] - 1;
594:     nz = diag[i] - diag[i+1] - 1;
595:     for (j=0; j>-nz; j--) {
596:       oidx       = bs*vi[j];
597:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
598:       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
599:       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
600:       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
601:       v         -= bs2;
602:     }
603:     t[idx] = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4;
604:     idx   += bs;
605:   }
606:   /* backward solve the L^T */
607:   for (i=n-1; i>=0; i--) {
608:     v   = aa + bs2*ai[i];
609:     vi  = aj + ai[i];
610:     nz  = ai[i+1] - ai[i];
611:     idt = bs*i;
612:     s1  = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt];
613:     for (j=0; j<nz; j++) {
614:       idx       = bs*vi[j];
615:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3  +  v[3]*s4;
616:       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3  +  v[7]*s4;
617:       t[idx+2] -=  v[8]*s1 +  v[9]*s2 +  v[10]*s3 + v[11]*s4;
618:       t[idx+3] -= v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
619:       v        += bs2;
620:     }
621:   }

623:   /* copy t into x according to permutation */
624:   for (i=0; i<n; i++) {
625:     ii    = bs*i;  ir = bs*r[i];
626:     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
627:   }

629:   ISRestoreIndices(isrow,&rout);
630:   ISRestoreIndices(iscol,&cout);
631:   VecRestoreArrayRead(bb,&b);
632:   VecRestoreArray(xx,&x);
633:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
634:   return(0);
635: }

639: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
640: {
641:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
642:   IS                iscol=a->col,isrow=a->row;
643:   PetscErrorCode    ierr;
644:   const PetscInt    *r,*c,*rout,*cout;
645:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
646:   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
647:   const MatScalar   *aa=a->a,*v;
648:   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
649:   const PetscScalar *b;

652:   VecGetArrayRead(bb,&b);
653:   VecGetArray(xx,&x);
654:   t    = a->solve_work;

656:   ISGetIndices(isrow,&rout); r = rout;
657:   ISGetIndices(iscol,&cout); c = cout;

659:   /* copy the b into temp work space according to permutation */
660:   ii = 0;
661:   for (i=0; i<n; i++) {
662:     ic      = 5*c[i];
663:     t[ii]   = b[ic];
664:     t[ii+1] = b[ic+1];
665:     t[ii+2] = b[ic+2];
666:     t[ii+3] = b[ic+3];
667:     t[ii+4] = b[ic+4];
668:     ii     += 5;
669:   }

671:   /* forward solve the U^T */
672:   idx = 0;
673:   for (i=0; i<n; i++) {

675:     v = aa + 25*diag[i];
676:     /* multiply by the inverse of the block diagonal */
677:     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
678:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
679:     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
680:     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
681:     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
682:     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
683:     v += 25;

685:     vi = aj + diag[i] + 1;
686:     nz = ai[i+1] - diag[i] - 1;
687:     while (nz--) {
688:       oidx       = 5*(*vi++);
689:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
690:       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
691:       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
692:       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
693:       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
694:       v         += 25;
695:     }
696:     t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
697:     idx   += 5;
698:   }
699:   /* backward solve the L^T */
700:   for (i=n-1; i>=0; i--) {
701:     v   = aa + 25*diag[i] - 25;
702:     vi  = aj + diag[i] - 1;
703:     nz  = diag[i] - ai[i];
704:     idt = 5*i;
705:     s1  = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
706:     while (nz--) {
707:       idx       = 5*(*vi--);
708:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
709:       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
710:       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
711:       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
712:       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
713:       v        -= 25;
714:     }
715:   }

717:   /* copy t into x according to permutation */
718:   ii = 0;
719:   for (i=0; i<n; i++) {
720:     ir      = 5*r[i];
721:     x[ir]   = t[ii];
722:     x[ir+1] = t[ii+1];
723:     x[ir+2] = t[ii+2];
724:     x[ir+3] = t[ii+3];
725:     x[ir+4] = t[ii+4];
726:     ii     += 5;
727:   }

729:   ISRestoreIndices(isrow,&rout);
730:   ISRestoreIndices(iscol,&cout);
731:   VecRestoreArrayRead(bb,&b);
732:   VecRestoreArray(xx,&x);
733:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
734:   return(0);
735: }

739: PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
740: {
741:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
742:   PetscErrorCode    ierr;
743:   IS                iscol=a->col,isrow=a->row;
744:   const PetscInt    n    =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
745:   const PetscInt    *r,*c,*rout,*cout;
746:   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
747:   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
748:   const MatScalar   *aa=a->a,*v;
749:   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
750:   const PetscScalar *b;

753:   VecGetArrayRead(bb,&b);
754:   VecGetArray(xx,&x);
755:   t    = a->solve_work;

757:   ISGetIndices(isrow,&rout); r = rout;
758:   ISGetIndices(iscol,&cout); c = cout;

760:   /* copy b into temp work space according to permutation */
761:   for (i=0; i<n; i++) {
762:     ii      = bs*i; ic = bs*c[i];
763:     t[ii]   = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
764:     t[ii+4] = b[ic+4];
765:   }

767:   /* forward solve the U^T */
768:   idx = 0;
769:   for (i=0; i<n; i++) {
770:     v = aa + bs2*diag[i];
771:     /* multiply by the inverse of the block diagonal */
772:     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
773:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
774:     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
775:     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
776:     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
777:     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
778:     v -= bs2;

780:     vi = aj + diag[i] - 1;
781:     nz = diag[i] - diag[i+1] - 1;
782:     for (j=0; j>-nz; j--) {
783:       oidx       = bs*vi[j];
784:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
785:       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
786:       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
787:       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
788:       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
789:       v         -= bs2;
790:     }
791:     t[idx] = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
792:     idx   += bs;
793:   }
794:   /* backward solve the L^T */
795:   for (i=n-1; i>=0; i--) {
796:     v   = aa + bs2*ai[i];
797:     vi  = aj + ai[i];
798:     nz  = ai[i+1] - ai[i];
799:     idt = bs*i;
800:     s1  = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
801:     for (j=0; j<nz; j++) {
802:       idx       = bs*vi[j];
803:       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
804:       t[idx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
805:       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
806:       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
807:       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
808:       v        += bs2;
809:     }
810:   }

812:   /* copy t into x according to permutation */
813:   for (i=0; i<n; i++) {
814:     ii      = bs*i;  ir = bs*r[i];
815:     x[ir]   = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
816:     x[ir+4] = t[ii+4];
817:   }

819:   ISRestoreIndices(isrow,&rout);
820:   ISRestoreIndices(iscol,&cout);
821:   VecRestoreArrayRead(bb,&b);
822:   VecRestoreArray(xx,&x);
823:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
824:   return(0);
825: }

829: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
830: {
831:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
832:   IS                iscol=a->col,isrow=a->row;
833:   PetscErrorCode    ierr;
834:   const PetscInt    *r,*c,*rout,*cout;
835:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
836:   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
837:   const MatScalar   *aa=a->a,*v;
838:   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
839:   const PetscScalar *b;

842:   VecGetArrayRead(bb,&b);
843:   VecGetArray(xx,&x);
844:   t    = a->solve_work;

846:   ISGetIndices(isrow,&rout); r = rout;
847:   ISGetIndices(iscol,&cout); c = cout;

849:   /* copy the b into temp work space according to permutation */
850:   ii = 0;
851:   for (i=0; i<n; i++) {
852:     ic      = 6*c[i];
853:     t[ii]   = b[ic];
854:     t[ii+1] = b[ic+1];
855:     t[ii+2] = b[ic+2];
856:     t[ii+3] = b[ic+3];
857:     t[ii+4] = b[ic+4];
858:     t[ii+5] = b[ic+5];
859:     ii     += 6;
860:   }

862:   /* forward solve the U^T */
863:   idx = 0;
864:   for (i=0; i<n; i++) {

866:     v = aa + 36*diag[i];
867:     /* multiply by the inverse of the block diagonal */
868:     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
869:     x6 = t[5+idx];
870:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
871:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
872:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
873:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
874:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
875:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
876:     v += 36;

878:     vi = aj + diag[i] + 1;
879:     nz = ai[i+1] - diag[i] - 1;
880:     while (nz--) {
881:       oidx       = 6*(*vi++);
882:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
883:       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
884:       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
885:       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
886:       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
887:       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
888:       v         += 36;
889:     }
890:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
891:     t[5+idx] = s6;
892:     idx     += 6;
893:   }
894:   /* backward solve the L^T */
895:   for (i=n-1; i>=0; i--) {
896:     v   = aa + 36*diag[i] - 36;
897:     vi  = aj + diag[i] - 1;
898:     nz  = diag[i] - ai[i];
899:     idt = 6*i;
900:     s1  = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
901:     s6  = t[5+idt];
902:     while (nz--) {
903:       idx       = 6*(*vi--);
904:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
905:       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
906:       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
907:       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
908:       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
909:       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
910:       v        -= 36;
911:     }
912:   }

914:   /* copy t into x according to permutation */
915:   ii = 0;
916:   for (i=0; i<n; i++) {
917:     ir      = 6*r[i];
918:     x[ir]   = t[ii];
919:     x[ir+1] = t[ii+1];
920:     x[ir+2] = t[ii+2];
921:     x[ir+3] = t[ii+3];
922:     x[ir+4] = t[ii+4];
923:     x[ir+5] = t[ii+5];
924:     ii     += 6;
925:   }

927:   ISRestoreIndices(isrow,&rout);
928:   ISRestoreIndices(iscol,&cout);
929:   VecRestoreArrayRead(bb,&b);
930:   VecRestoreArray(xx,&x);
931:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
932:   return(0);
933: }

937: PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
938: {
939:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
940:   PetscErrorCode    ierr;
941:   IS                iscol=a->col,isrow=a->row;
942:   const PetscInt    n    =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
943:   const PetscInt    *r,*c,*rout,*cout;
944:   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
945:   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
946:   const MatScalar   *aa=a->a,*v;
947:   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
948:   const PetscScalar *b;

951:   VecGetArrayRead(bb,&b);
952:   VecGetArray(xx,&x);
953:   t    = a->solve_work;

955:   ISGetIndices(isrow,&rout); r = rout;
956:   ISGetIndices(iscol,&cout); c = cout;

958:   /* copy b into temp work space according to permutation */
959:   for (i=0; i<n; i++) {
960:     ii      = bs*i; ic = bs*c[i];
961:     t[ii]   = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
962:     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];
963:   }

965:   /* forward solve the U^T */
966:   idx = 0;
967:   for (i=0; i<n; i++) {
968:     v = aa + bs2*diag[i];
969:     /* multiply by the inverse of the block diagonal */
970:     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
971:     x6 = t[5+idx];
972:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
973:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
974:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
975:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
976:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
977:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
978:     v -= bs2;

980:     vi = aj + diag[i] - 1;
981:     nz = diag[i] - diag[i+1] - 1;
982:     for (j=0; j>-nz; j--) {
983:       oidx       = bs*vi[j];
984:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
985:       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
986:       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
987:       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
988:       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
989:       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
990:       v         -= bs2;
991:     }
992:     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
993:     t[5+idx] = s6;
994:     idx     += bs;
995:   }
996:   /* backward solve the L^T */
997:   for (i=n-1; i>=0; i--) {
998:     v   = aa + bs2*ai[i];
999:     vi  = aj + ai[i];
1000:     nz  = ai[i+1] - ai[i];
1001:     idt = bs*i;
1002:     s1  = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1003:     s6  = t[5+idt];
1004:     for (j=0; j<nz; j++) {
1005:       idx       = bs*vi[j];
1006:       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1007:       t[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1008:       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1009:       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1010:       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1011:       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1012:       v        += bs2;
1013:     }
1014:   }

1016:   /* copy t into x according to permutation */
1017:   for (i=0; i<n; i++) {
1018:     ii      = bs*i;  ir = bs*r[i];
1019:     x[ir]   = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1020:     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];
1021:   }

1023:   ISRestoreIndices(isrow,&rout);
1024:   ISRestoreIndices(iscol,&cout);
1025:   VecRestoreArrayRead(bb,&b);
1026:   VecRestoreArray(xx,&x);
1027:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1028:   return(0);
1029: }

1033: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
1034: {
1035:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1036:   IS                iscol=a->col,isrow=a->row;
1037:   PetscErrorCode    ierr;
1038:   const PetscInt    *r,*c,*rout,*cout;
1039:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1040:   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1041:   const MatScalar   *aa=a->a,*v;
1042:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1043:   const PetscScalar *b;

1046:   VecGetArrayRead(bb,&b);
1047:   VecGetArray(xx,&x);
1048:   t    = a->solve_work;

1050:   ISGetIndices(isrow,&rout); r = rout;
1051:   ISGetIndices(iscol,&cout); c = cout;

1053:   /* copy the b into temp work space according to permutation */
1054:   ii = 0;
1055:   for (i=0; i<n; i++) {
1056:     ic      = 7*c[i];
1057:     t[ii]   = b[ic];
1058:     t[ii+1] = b[ic+1];
1059:     t[ii+2] = b[ic+2];
1060:     t[ii+3] = b[ic+3];
1061:     t[ii+4] = b[ic+4];
1062:     t[ii+5] = b[ic+5];
1063:     t[ii+6] = b[ic+6];
1064:     ii     += 7;
1065:   }

1067:   /* forward solve the U^T */
1068:   idx = 0;
1069:   for (i=0; i<n; i++) {

1071:     v = aa + 49*diag[i];
1072:     /* multiply by the inverse of the block diagonal */
1073:     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1074:     x6 = t[5+idx]; x7 = t[6+idx];
1075:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1076:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1077:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1078:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1079:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1080:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1081:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1082:     v += 49;

1084:     vi = aj + diag[i] + 1;
1085:     nz = ai[i+1] - diag[i] - 1;
1086:     while (nz--) {
1087:       oidx       = 7*(*vi++);
1088:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1089:       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1090:       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1091:       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1092:       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1093:       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1094:       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1095:       v         += 49;
1096:     }
1097:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1098:     t[5+idx] = s6;t[6+idx] = s7;
1099:     idx     += 7;
1100:   }
1101:   /* backward solve the L^T */
1102:   for (i=n-1; i>=0; i--) {
1103:     v   = aa + 49*diag[i] - 49;
1104:     vi  = aj + diag[i] - 1;
1105:     nz  = diag[i] - ai[i];
1106:     idt = 7*i;
1107:     s1  = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1108:     s6  = t[5+idt];s7 = t[6+idt];
1109:     while (nz--) {
1110:       idx       = 7*(*vi--);
1111:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1112:       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1113:       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1114:       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1115:       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1116:       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1117:       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1118:       v        -= 49;
1119:     }
1120:   }

1122:   /* copy t into x according to permutation */
1123:   ii = 0;
1124:   for (i=0; i<n; i++) {
1125:     ir      = 7*r[i];
1126:     x[ir]   = t[ii];
1127:     x[ir+1] = t[ii+1];
1128:     x[ir+2] = t[ii+2];
1129:     x[ir+3] = t[ii+3];
1130:     x[ir+4] = t[ii+4];
1131:     x[ir+5] = t[ii+5];
1132:     x[ir+6] = t[ii+6];
1133:     ii     += 7;
1134:   }

1136:   ISRestoreIndices(isrow,&rout);
1137:   ISRestoreIndices(iscol,&cout);
1138:   VecRestoreArrayRead(bb,&b);
1139:   VecRestoreArray(xx,&x);
1140:   PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
1141:   return(0);
1142: }
1145: PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1146: {
1147:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
1148:   PetscErrorCode    ierr;
1149:   IS                iscol=a->col,isrow=a->row;
1150:   const PetscInt    n    =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1151:   const PetscInt    *r,*c,*rout,*cout;
1152:   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1153:   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
1154:   const MatScalar   *aa=a->a,*v;
1155:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1156:   const PetscScalar *b;

1159:   VecGetArrayRead(bb,&b);
1160:   VecGetArray(xx,&x);
1161:   t    = a->solve_work;

1163:   ISGetIndices(isrow,&rout); r = rout;
1164:   ISGetIndices(iscol,&cout); c = cout;

1166:   /* copy b into temp work space according to permutation */
1167:   for (i=0; i<n; i++) {
1168:     ii      = bs*i; ic = bs*c[i];
1169:     t[ii]   = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1170:     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];  t[ii+6] = b[ic+6];
1171:   }

1173:   /* forward solve the U^T */
1174:   idx = 0;
1175:   for (i=0; i<n; i++) {
1176:     v = aa + bs2*diag[i];
1177:     /* multiply by the inverse of the block diagonal */
1178:     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1179:     x6 = t[5+idx]; x7 = t[6+idx];
1180:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1181:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1182:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1183:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1184:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1185:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1186:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1187:     v -= bs2;

1189:     vi = aj + diag[i] - 1;
1190:     nz = diag[i] - diag[i+1] - 1;
1191:     for (j=0; j>-nz; j--) {
1192:       oidx       = bs*vi[j];
1193:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1194:       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1195:       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1196:       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1197:       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1198:       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1199:       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1200:       v         -= bs2;
1201:     }
1202:     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1203:     t[5+idx] = s6;  t[6+idx] = s7;
1204:     idx     += bs;
1205:   }
1206:   /* backward solve the L^T */
1207:   for (i=n-1; i>=0; i--) {
1208:     v   = aa + bs2*ai[i];
1209:     vi  = aj + ai[i];
1210:     nz  = ai[i+1] - ai[i];
1211:     idt = bs*i;
1212:     s1  = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1213:     s6  = t[5+idt];  s7 = t[6+idt];
1214:     for (j=0; j<nz; j++) {
1215:       idx       = bs*vi[j];
1216:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1217:       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1218:       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1219:       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1220:       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1221:       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1222:       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1223:       v        += bs2;
1224:     }
1225:   }

1227:   /* copy t into x according to permutation */
1228:   for (i=0; i<n; i++) {
1229:     ii      = bs*i;  ir = bs*r[i];
1230:     x[ir]   = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1231:     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];  x[ir+6] = t[ii+6];
1232:   }

1234:   ISRestoreIndices(isrow,&rout);
1235:   ISRestoreIndices(iscol,&cout);
1236:   VecRestoreArrayRead(bb,&b);
1237:   VecRestoreArray(xx,&x);
1238:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1239:   return(0);
1240: }

1242: /* ----------------------------------------------------------- */
1245: PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
1246: {
1247:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1248:   IS                iscol=a->col,isrow=a->row;
1249:   PetscErrorCode    ierr;
1250:   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
1251:   PetscInt          i,nz,j;
1252:   const PetscInt    n  =a->mbs,bs=A->rmap->bs,bs2=a->bs2;
1253:   const MatScalar   *aa=a->a,*v;
1254:   PetscScalar       *x,*t,*ls;
1255:   const PetscScalar *b;

1258:   VecGetArrayRead(bb,&b);
1259:   VecGetArray(xx,&x);
1260:   t    = a->solve_work;

1262:   ISGetIndices(isrow,&rout); r = rout;
1263:   ISGetIndices(iscol,&cout); c = cout;

1265:   /* copy the b into temp work space according to permutation */
1266:   for (i=0; i<n; i++) {
1267:     for (j=0; j<bs; j++) {
1268:       t[i*bs+j] = b[c[i]*bs+j];
1269:     }
1270:   }


1273:   /* forward solve the upper triangular transpose */
1274:   ls = a->solve_work + A->cmap->n;
1275:   for (i=0; i<n; i++) {
1276:     PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
1277:     PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1278:     v  = aa + bs2*(a->diag[i] + 1);
1279:     vi = aj + a->diag[i] + 1;
1280:     nz = ai[i+1] - a->diag[i] - 1;
1281:     while (nz--) {
1282:       PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
1283:       v += bs2;
1284:     }
1285:   }

1287:   /* backward solve the lower triangular transpose */
1288:   for (i=n-1; i>=0; i--) {
1289:     v  = aa + bs2*ai[i];
1290:     vi = aj + ai[i];
1291:     nz = a->diag[i] - ai[i];
1292:     while (nz--) {
1293:       PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
1294:       v += bs2;
1295:     }
1296:   }

1298:   /* copy t into x according to permutation */
1299:   for (i=0; i<n; i++) {
1300:     for (j=0; j<bs; j++) {
1301:       x[bs*r[i]+j]   = t[bs*i+j];
1302:     }
1303:   }

1305:   ISRestoreIndices(isrow,&rout);
1306:   ISRestoreIndices(iscol,&cout);
1307:   VecRestoreArrayRead(bb,&b);
1308:   VecRestoreArray(xx,&x);
1309:   PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
1310:   return(0);
1311: }

1315: PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1316: {
1317:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1318:   IS                iscol=a->col,isrow=a->row;
1319:   PetscErrorCode    ierr;
1320:   const PetscInt    *r,*c,*rout,*cout;
1321:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag;
1322:   PetscInt          i,j,nz;
1323:   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
1324:   const MatScalar   *aa=a->a,*v;
1325:   PetscScalar       *x,*t,*ls;
1326:   const PetscScalar *b;

1329:   VecGetArrayRead(bb,&b);
1330:   VecGetArray(xx,&x);
1331:   t    = a->solve_work;

1333:   ISGetIndices(isrow,&rout); r = rout;
1334:   ISGetIndices(iscol,&cout); c = cout;

1336:   /* copy the b into temp work space according to permutation */
1337:   for (i=0; i<n; i++) {
1338:     for (j=0; j<bs; j++) {
1339:       t[i*bs+j] = b[c[i]*bs+j];
1340:     }
1341:   }


1344:   /* forward solve the upper triangular transpose */
1345:   ls = a->solve_work + A->cmap->n;
1346:   for (i=0; i<n; i++) {
1347:     PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
1348:     PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs);
1349:     v  = aa + bs2*(diag[i] - 1);
1350:     vi = aj + diag[i] - 1;
1351:     nz = diag[i] - diag[i+1] - 1;
1352:     for (j=0; j>-nz; j--) {
1353:       PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
1354:       v -= bs2;
1355:     }
1356:   }

1358:   /* backward solve the lower triangular transpose */
1359:   for (i=n-1; i>=0; i--) {
1360:     v  = aa + bs2*ai[i];
1361:     vi = aj + ai[i];
1362:     nz = ai[i+1] - ai[i];
1363:     for (j=0; j<nz; j++) {
1364:       PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
1365:       v += bs2;
1366:     }
1367:   }

1369:   /* copy t into x according to permutation */
1370:   for (i=0; i<n; i++) {
1371:     for (j=0; j<bs; j++) {
1372:       x[bs*r[i]+j]   = t[bs*i+j];
1373:     }
1374:   }

1376:   ISRestoreIndices(isrow,&rout);
1377:   ISRestoreIndices(iscol,&cout);
1378:   VecRestoreArrayRead(bb,&b);
1379:   VecRestoreArray(xx,&x);
1380:   PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
1381:   return(0);
1382: }