Actual source code: baijsolvtran.c

petsc-3.8.4 2018-03-24
Report Typos and Errors
  1:  #include <../src/mat/impls/baij/seq/baij.h>
  2:  #include <petsc/private/kernels/blockinvert.h>

  4: PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
  5: {
  6:   Mat_SeqBAIJ       *a    = (Mat_SeqBAIJ*)A->data;
  7:   IS                iscol = a->col,isrow = a->row;
  8:   PetscErrorCode    ierr;
  9:   const PetscInt    *rout,*cout,*r,*c,*adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
 10:   PetscInt          i,n = a->mbs,j;
 11:   PetscInt          nz;
 12:   PetscScalar       *x,*tmp,s1;
 13:   const MatScalar   *aa = a->a,*v;
 14:   const PetscScalar *b;

 17:   VecGetArrayRead(bb,&b);
 18:   VecGetArray(xx,&x);
 19:   tmp  = a->solve_work;

 21:   ISGetIndices(isrow,&rout); r = rout;
 22:   ISGetIndices(iscol,&cout); c = cout;

 24:   /* copy the b into temp work space according to permutation */
 25:   for (i=0; i<n; i++) tmp[i] = b[c[i]];

 27:   /* forward solve the U^T */
 28:   for (i=0; i<n; i++) {
 29:     v   = aa + adiag[i+1] + 1;
 30:     vi  = aj + adiag[i+1] + 1;
 31:     nz  = adiag[i] - adiag[i+1] - 1;
 32:     s1  = tmp[i];
 33:     s1 *= v[nz];  /* multiply by inverse of diagonal entry */
 34:     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
 35:     tmp[i] = s1;
 36:   }

 38:   /* backward solve the L^T */
 39:   for (i=n-1; i>=0; i--) {
 40:     v  = aa + ai[i];
 41:     vi = aj + ai[i];
 42:     nz = ai[i+1] - ai[i];
 43:     s1 = tmp[i];
 44:     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
 45:   }

 47:   /* copy tmp into x according to permutation */
 48:   for (i=0; i<n; i++) x[r[i]] = tmp[i];

 50:   ISRestoreIndices(isrow,&rout);
 51:   ISRestoreIndices(iscol,&cout);
 52:   VecRestoreArrayRead(bb,&b);
 53:   VecRestoreArray(xx,&x);

 55:   PetscLogFlops(2.0*a->nz-A->cmap->n);
 56:   return(0);
 57: }

 59: PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
 60: {
 61:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
 62:   IS                iscol=a->col,isrow=a->row;
 63:   PetscErrorCode    ierr;
 64:   const PetscInt    *r,*c,*rout,*cout;
 65:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
 66:   PetscInt          i,nz;
 67:   const MatScalar   *aa=a->a,*v;
 68:   PetscScalar       s1,*x,*t;
 69:   const PetscScalar *b;

 72:   VecGetArrayRead(bb,&b);
 73:   VecGetArray(xx,&x);
 74:   t    = a->solve_work;

 76:   ISGetIndices(isrow,&rout); r = rout;
 77:   ISGetIndices(iscol,&cout); c = cout;

 79:   /* copy the b into temp work space according to permutation */
 80:   for (i=0; i<n; i++) t[i] = b[c[i]];

 82:   /* forward solve the U^T */
 83:   for (i=0; i<n; i++) {

 85:     v = aa + diag[i];
 86:     /* multiply by the inverse of the block diagonal */
 87:     s1 = (*v++)*t[i];
 88:     vi = aj + diag[i] + 1;
 89:     nz = ai[i+1] - diag[i] - 1;
 90:     while (nz--) {
 91:       t[*vi++] -= (*v++)*s1;
 92:     }
 93:     t[i] = s1;
 94:   }
 95:   /* backward solve the L^T */
 96:   for (i=n-1; i>=0; i--) {
 97:     v  = aa + diag[i] - 1;
 98:     vi = aj + diag[i] - 1;
 99:     nz = diag[i] - ai[i];
100:     s1 = t[i];
101:     while (nz--) {
102:       t[*vi--] -=  (*v--)*s1;
103:     }
104:   }

106:   /* copy t into x according to permutation */
107:   for (i=0; i<n; i++) x[r[i]] = t[i];

109:   ISRestoreIndices(isrow,&rout);
110:   ISRestoreIndices(iscol,&cout);
111:   VecRestoreArrayRead(bb,&b);
112:   VecRestoreArray(xx,&x);
113:   PetscLogFlops(2.0*(a->nz) - A->cmap->n);
114:   return(0);
115: }

117: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
118: {
119:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
120:   IS                iscol=a->col,isrow=a->row;
121:   PetscErrorCode    ierr;
122:   const PetscInt    *r,*c,*rout,*cout;
123:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
124:   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
125:   const MatScalar   *aa=a->a,*v;
126:   PetscScalar       s1,s2,x1,x2,*x,*t;
127:   const PetscScalar *b;

130:   VecGetArrayRead(bb,&b);
131:   VecGetArray(xx,&x);
132:   t    = a->solve_work;

134:   ISGetIndices(isrow,&rout); r = rout;
135:   ISGetIndices(iscol,&cout); c = cout;

137:   /* copy the b into temp work space according to permutation */
138:   ii = 0;
139:   for (i=0; i<n; i++) {
140:     ic      = 2*c[i];
141:     t[ii]   = b[ic];
142:     t[ii+1] = b[ic+1];
143:     ii     += 2;
144:   }

146:   /* forward solve the U^T */
147:   idx = 0;
148:   for (i=0; i<n; i++) {

150:     v = aa + 4*diag[i];
151:     /* multiply by the inverse of the block diagonal */
152:     x1 = t[idx];   x2 = t[1+idx];
153:     s1 = v[0]*x1  +  v[1]*x2;
154:     s2 = v[2]*x1  +  v[3]*x2;
155:     v += 4;

157:     vi = aj + diag[i] + 1;
158:     nz = ai[i+1] - diag[i] - 1;
159:     while (nz--) {
160:       oidx       = 2*(*vi++);
161:       t[oidx]   -= v[0]*s1  +  v[1]*s2;
162:       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
163:       v         += 4;
164:     }
165:     t[idx] = s1;t[1+idx] = s2;
166:     idx   += 2;
167:   }
168:   /* backward solve the L^T */
169:   for (i=n-1; i>=0; i--) {
170:     v   = aa + 4*diag[i] - 4;
171:     vi  = aj + diag[i] - 1;
172:     nz  = diag[i] - ai[i];
173:     idt = 2*i;
174:     s1  = t[idt];  s2 = t[1+idt];
175:     while (nz--) {
176:       idx       = 2*(*vi--);
177:       t[idx]   -=  v[0]*s1 +  v[1]*s2;
178:       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
179:       v        -= 4;
180:     }
181:   }

183:   /* copy t into x according to permutation */
184:   ii = 0;
185:   for (i=0; i<n; i++) {
186:     ir      = 2*r[i];
187:     x[ir]   = t[ii];
188:     x[ir+1] = t[ii+1];
189:     ii     += 2;
190:   }

192:   ISRestoreIndices(isrow,&rout);
193:   ISRestoreIndices(iscol,&cout);
194:   VecRestoreArrayRead(bb,&b);
195:   VecRestoreArray(xx,&x);
196:   PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
197:   return(0);
198: }

200: PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
201: {
202:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
203:   PetscErrorCode    ierr;
204:   IS                iscol=a->col,isrow=a->row;
205:   const PetscInt    n    =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
206:   const PetscInt    *r,*c,*rout,*cout;
207:   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
208:   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
209:   const MatScalar   *aa=a->a,*v;
210:   PetscScalar       s1,s2,x1,x2,*x,*t;
211:   const PetscScalar *b;

214:   VecGetArrayRead(bb,&b);
215:   VecGetArray(xx,&x);
216:   t    = a->solve_work;

218:   ISGetIndices(isrow,&rout); r = rout;
219:   ISGetIndices(iscol,&cout); c = cout;

221:   /* copy b into temp work space according to permutation */
222:   for (i=0; i<n; i++) {
223:     ii    = bs*i; ic = bs*c[i];
224:     t[ii] = b[ic]; t[ii+1] = b[ic+1];
225:   }

227:   /* forward solve the U^T */
228:   idx = 0;
229:   for (i=0; i<n; i++) {
230:     v = aa + bs2*diag[i];
231:     /* multiply by the inverse of the block diagonal */
232:     x1 = t[idx];   x2 = t[1+idx];
233:     s1 = v[0]*x1  +  v[1]*x2;
234:     s2 = v[2]*x1  +  v[3]*x2;
235:     v -= bs2;

237:     vi = aj + diag[i] - 1;
238:     nz = diag[i] - diag[i+1] - 1;
239:     for (j=0; j>-nz; j--) {
240:       oidx       = bs*vi[j];
241:       t[oidx]   -= v[0]*s1  +  v[1]*s2;
242:       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
243:       v         -= bs2;
244:     }
245:     t[idx] = s1;t[1+idx] = s2;
246:     idx   += bs;
247:   }
248:   /* backward solve the L^T */
249:   for (i=n-1; i>=0; i--) {
250:     v   = aa + bs2*ai[i];
251:     vi  = aj + ai[i];
252:     nz  = ai[i+1] - ai[i];
253:     idt = bs*i;
254:     s1  = t[idt];  s2 = t[1+idt];
255:     for (j=0; j<nz; j++) {
256:       idx       = bs*vi[j];
257:       t[idx]   -=  v[0]*s1 +  v[1]*s2;
258:       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
259:       v        += bs2;
260:     }
261:   }

263:   /* copy t into x according to permutation */
264:   for (i=0; i<n; i++) {
265:     ii    = bs*i;  ir = bs*r[i];
266:     x[ir] = t[ii];  x[ir+1] = t[ii+1];
267:   }

269:   ISRestoreIndices(isrow,&rout);
270:   ISRestoreIndices(iscol,&cout);
271:   VecRestoreArrayRead(bb,&b);
272:   VecRestoreArray(xx,&x);
273:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
274:   return(0);
275: }

277: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
278: {
279:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
280:   IS                iscol=a->col,isrow=a->row;
281:   PetscErrorCode    ierr;
282:   const PetscInt    *r,*c,*rout,*cout;
283:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
284:   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
285:   const MatScalar   *aa=a->a,*v;
286:   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
287:   const PetscScalar *b;

290:   VecGetArrayRead(bb,&b);
291:   VecGetArray(xx,&x);
292:   t    = a->solve_work;

294:   ISGetIndices(isrow,&rout); r = rout;
295:   ISGetIndices(iscol,&cout); c = cout;

297:   /* copy the b into temp work space according to permutation */
298:   ii = 0;
299:   for (i=0; i<n; i++) {
300:     ic      = 3*c[i];
301:     t[ii]   = b[ic];
302:     t[ii+1] = b[ic+1];
303:     t[ii+2] = b[ic+2];
304:     ii     += 3;
305:   }

307:   /* forward solve the U^T */
308:   idx = 0;
309:   for (i=0; i<n; i++) {

311:     v = aa + 9*diag[i];
312:     /* multiply by the inverse of the block diagonal */
313:     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
314:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
315:     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
316:     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
317:     v += 9;

319:     vi = aj + diag[i] + 1;
320:     nz = ai[i+1] - diag[i] - 1;
321:     while (nz--) {
322:       oidx       = 3*(*vi++);
323:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
324:       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
325:       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
326:       v         += 9;
327:     }
328:     t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;
329:     idx   += 3;
330:   }
331:   /* backward solve the L^T */
332:   for (i=n-1; i>=0; i--) {
333:     v   = aa + 9*diag[i] - 9;
334:     vi  = aj + diag[i] - 1;
335:     nz  = diag[i] - ai[i];
336:     idt = 3*i;
337:     s1  = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
338:     while (nz--) {
339:       idx       = 3*(*vi--);
340:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
341:       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
342:       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
343:       v        -= 9;
344:     }
345:   }

347:   /* copy t into x according to permutation */
348:   ii = 0;
349:   for (i=0; i<n; i++) {
350:     ir      = 3*r[i];
351:     x[ir]   = t[ii];
352:     x[ir+1] = t[ii+1];
353:     x[ir+2] = t[ii+2];
354:     ii     += 3;
355:   }

357:   ISRestoreIndices(isrow,&rout);
358:   ISRestoreIndices(iscol,&cout);
359:   VecRestoreArrayRead(bb,&b);
360:   VecRestoreArray(xx,&x);
361:   PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
362:   return(0);
363: }

365: PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
366: {
367:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
368:   PetscErrorCode    ierr;
369:   IS                iscol=a->col,isrow=a->row;
370:   const PetscInt    n    =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
371:   const PetscInt    *r,*c,*rout,*cout;
372:   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
373:   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
374:   const MatScalar   *aa=a->a,*v;
375:   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
376:   const PetscScalar *b;

379:   VecGetArrayRead(bb,&b);
380:   VecGetArray(xx,&x);
381:   t    = a->solve_work;

383:   ISGetIndices(isrow,&rout); r = rout;
384:   ISGetIndices(iscol,&cout); c = cout;

386:   /* copy b into temp work space according to permutation */
387:   for (i=0; i<n; i++) {
388:     ii    = bs*i; ic = bs*c[i];
389:     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2];
390:   }

392:   /* forward solve the U^T */
393:   idx = 0;
394:   for (i=0; i<n; i++) {
395:     v = aa + bs2*diag[i];
396:     /* multiply by the inverse of the block diagonal */
397:     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
398:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
399:     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
400:     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
401:     v -= bs2;

403:     vi = aj + diag[i] - 1;
404:     nz = diag[i] - diag[i+1] - 1;
405:     for (j=0; j>-nz; j--) {
406:       oidx       = bs*vi[j];
407:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
408:       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
409:       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
410:       v         -= bs2;
411:     }
412:     t[idx] = s1;t[1+idx] = s2;  t[2+idx] = s3;
413:     idx   += bs;
414:   }
415:   /* backward solve the L^T */
416:   for (i=n-1; i>=0; i--) {
417:     v   = aa + bs2*ai[i];
418:     vi  = aj + ai[i];
419:     nz  = ai[i+1] - ai[i];
420:     idt = bs*i;
421:     s1  = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];
422:     for (j=0; j<nz; j++) {
423:       idx       = bs*vi[j];
424:       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
425:       t[idx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
426:       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
427:       v        += bs2;
428:     }
429:   }

431:   /* copy t into x according to permutation */
432:   for (i=0; i<n; i++) {
433:     ii    = bs*i;  ir = bs*r[i];
434:     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];
435:   }

437:   ISRestoreIndices(isrow,&rout);
438:   ISRestoreIndices(iscol,&cout);
439:   VecRestoreArrayRead(bb,&b);
440:   VecRestoreArray(xx,&x);
441:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
442:   return(0);
443: }

445: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
446: {
447:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
448:   IS                iscol=a->col,isrow=a->row;
449:   PetscErrorCode    ierr;
450:   const PetscInt    *r,*c,*rout,*cout;
451:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
452:   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
453:   const MatScalar   *aa=a->a,*v;
454:   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
455:   const PetscScalar *b;

458:   VecGetArrayRead(bb,&b);
459:   VecGetArray(xx,&x);
460:   t    = a->solve_work;

462:   ISGetIndices(isrow,&rout); r = rout;
463:   ISGetIndices(iscol,&cout); c = cout;

465:   /* copy the b into temp work space according to permutation */
466:   ii = 0;
467:   for (i=0; i<n; i++) {
468:     ic      = 4*c[i];
469:     t[ii]   = b[ic];
470:     t[ii+1] = b[ic+1];
471:     t[ii+2] = b[ic+2];
472:     t[ii+3] = b[ic+3];
473:     ii     += 4;
474:   }

476:   /* forward solve the U^T */
477:   idx = 0;
478:   for (i=0; i<n; i++) {

480:     v = aa + 16*diag[i];
481:     /* multiply by the inverse of the block diagonal */
482:     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
483:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
484:     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
485:     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
486:     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
487:     v += 16;

489:     vi = aj + diag[i] + 1;
490:     nz = ai[i+1] - diag[i] - 1;
491:     while (nz--) {
492:       oidx       = 4*(*vi++);
493:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
494:       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
495:       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
496:       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
497:       v         += 16;
498:     }
499:     t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
500:     idx   += 4;
501:   }
502:   /* backward solve the L^T */
503:   for (i=n-1; i>=0; i--) {
504:     v   = aa + 16*diag[i] - 16;
505:     vi  = aj + diag[i] - 1;
506:     nz  = diag[i] - ai[i];
507:     idt = 4*i;
508:     s1  = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
509:     while (nz--) {
510:       idx       = 4*(*vi--);
511:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
512:       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
513:       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
514:       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
515:       v        -= 16;
516:     }
517:   }

519:   /* copy t into x according to permutation */
520:   ii = 0;
521:   for (i=0; i<n; i++) {
522:     ir      = 4*r[i];
523:     x[ir]   = t[ii];
524:     x[ir+1] = t[ii+1];
525:     x[ir+2] = t[ii+2];
526:     x[ir+3] = t[ii+3];
527:     ii     += 4;
528:   }

530:   ISRestoreIndices(isrow,&rout);
531:   ISRestoreIndices(iscol,&cout);
532:   VecRestoreArrayRead(bb,&b);
533:   VecRestoreArray(xx,&x);
534:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
535:   return(0);
536: }

538: PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
539: {
540:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
541:   PetscErrorCode    ierr;
542:   IS                iscol=a->col,isrow=a->row;
543:   const PetscInt    n    =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
544:   const PetscInt    *r,*c,*rout,*cout;
545:   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
546:   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
547:   const MatScalar   *aa=a->a,*v;
548:   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
549:   const PetscScalar *b;

552:   VecGetArrayRead(bb,&b);
553:   VecGetArray(xx,&x);
554:   t    = a->solve_work;

556:   ISGetIndices(isrow,&rout); r = rout;
557:   ISGetIndices(iscol,&cout); c = cout;

559:   /* copy b into temp work space according to permutation */
560:   for (i=0; i<n; i++) {
561:     ii    = bs*i; ic = bs*c[i];
562:     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
563:   }

565:   /* forward solve the U^T */
566:   idx = 0;
567:   for (i=0; i<n; i++) {
568:     v = aa + bs2*diag[i];
569:     /* multiply by the inverse of the block diagonal */
570:     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];  x4 = t[3+idx];
571:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
572:     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
573:     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
574:     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
575:     v -= bs2;

577:     vi = aj + diag[i] - 1;
578:     nz = diag[i] - diag[i+1] - 1;
579:     for (j=0; j>-nz; j--) {
580:       oidx       = bs*vi[j];
581:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
582:       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
583:       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
584:       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
585:       v         -= bs2;
586:     }
587:     t[idx] = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4;
588:     idx   += bs;
589:   }
590:   /* backward solve the L^T */
591:   for (i=n-1; i>=0; i--) {
592:     v   = aa + bs2*ai[i];
593:     vi  = aj + ai[i];
594:     nz  = ai[i+1] - ai[i];
595:     idt = bs*i;
596:     s1  = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt];
597:     for (j=0; j<nz; j++) {
598:       idx       = bs*vi[j];
599:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3  +  v[3]*s4;
600:       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3  +  v[7]*s4;
601:       t[idx+2] -=  v[8]*s1 +  v[9]*s2 +  v[10]*s3 + v[11]*s4;
602:       t[idx+3] -= v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
603:       v        += bs2;
604:     }
605:   }

607:   /* copy t into x according to permutation */
608:   for (i=0; i<n; i++) {
609:     ii    = bs*i;  ir = bs*r[i];
610:     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
611:   }

613:   ISRestoreIndices(isrow,&rout);
614:   ISRestoreIndices(iscol,&cout);
615:   VecRestoreArrayRead(bb,&b);
616:   VecRestoreArray(xx,&x);
617:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
618:   return(0);
619: }

621: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
622: {
623:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
624:   IS                iscol=a->col,isrow=a->row;
625:   PetscErrorCode    ierr;
626:   const PetscInt    *r,*c,*rout,*cout;
627:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
628:   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
629:   const MatScalar   *aa=a->a,*v;
630:   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
631:   const PetscScalar *b;

634:   VecGetArrayRead(bb,&b);
635:   VecGetArray(xx,&x);
636:   t    = a->solve_work;

638:   ISGetIndices(isrow,&rout); r = rout;
639:   ISGetIndices(iscol,&cout); c = cout;

641:   /* copy the b into temp work space according to permutation */
642:   ii = 0;
643:   for (i=0; i<n; i++) {
644:     ic      = 5*c[i];
645:     t[ii]   = b[ic];
646:     t[ii+1] = b[ic+1];
647:     t[ii+2] = b[ic+2];
648:     t[ii+3] = b[ic+3];
649:     t[ii+4] = b[ic+4];
650:     ii     += 5;
651:   }

653:   /* forward solve the U^T */
654:   idx = 0;
655:   for (i=0; i<n; i++) {

657:     v = aa + 25*diag[i];
658:     /* multiply by the inverse of the block diagonal */
659:     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
660:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
661:     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
662:     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
663:     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
664:     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
665:     v += 25;

667:     vi = aj + diag[i] + 1;
668:     nz = ai[i+1] - diag[i] - 1;
669:     while (nz--) {
670:       oidx       = 5*(*vi++);
671:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
672:       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
673:       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
674:       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
675:       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
676:       v         += 25;
677:     }
678:     t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
679:     idx   += 5;
680:   }
681:   /* backward solve the L^T */
682:   for (i=n-1; i>=0; i--) {
683:     v   = aa + 25*diag[i] - 25;
684:     vi  = aj + diag[i] - 1;
685:     nz  = diag[i] - ai[i];
686:     idt = 5*i;
687:     s1  = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
688:     while (nz--) {
689:       idx       = 5*(*vi--);
690:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
691:       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
692:       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
693:       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
694:       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
695:       v        -= 25;
696:     }
697:   }

699:   /* copy t into x according to permutation */
700:   ii = 0;
701:   for (i=0; i<n; i++) {
702:     ir      = 5*r[i];
703:     x[ir]   = t[ii];
704:     x[ir+1] = t[ii+1];
705:     x[ir+2] = t[ii+2];
706:     x[ir+3] = t[ii+3];
707:     x[ir+4] = t[ii+4];
708:     ii     += 5;
709:   }

711:   ISRestoreIndices(isrow,&rout);
712:   ISRestoreIndices(iscol,&cout);
713:   VecRestoreArrayRead(bb,&b);
714:   VecRestoreArray(xx,&x);
715:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
716:   return(0);
717: }

719: PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
720: {
721:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
722:   PetscErrorCode    ierr;
723:   IS                iscol=a->col,isrow=a->row;
724:   const PetscInt    n    =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
725:   const PetscInt    *r,*c,*rout,*cout;
726:   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
727:   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
728:   const MatScalar   *aa=a->a,*v;
729:   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
730:   const PetscScalar *b;

733:   VecGetArrayRead(bb,&b);
734:   VecGetArray(xx,&x);
735:   t    = a->solve_work;

737:   ISGetIndices(isrow,&rout); r = rout;
738:   ISGetIndices(iscol,&cout); c = cout;

740:   /* copy b into temp work space according to permutation */
741:   for (i=0; i<n; i++) {
742:     ii      = bs*i; ic = bs*c[i];
743:     t[ii]   = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
744:     t[ii+4] = b[ic+4];
745:   }

747:   /* forward solve the U^T */
748:   idx = 0;
749:   for (i=0; i<n; i++) {
750:     v = aa + bs2*diag[i];
751:     /* multiply by the inverse of the block diagonal */
752:     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
753:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
754:     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
755:     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
756:     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
757:     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
758:     v -= bs2;

760:     vi = aj + diag[i] - 1;
761:     nz = diag[i] - diag[i+1] - 1;
762:     for (j=0; j>-nz; j--) {
763:       oidx       = bs*vi[j];
764:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
765:       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
766:       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
767:       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
768:       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
769:       v         -= bs2;
770:     }
771:     t[idx] = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
772:     idx   += bs;
773:   }
774:   /* backward solve the L^T */
775:   for (i=n-1; i>=0; i--) {
776:     v   = aa + bs2*ai[i];
777:     vi  = aj + ai[i];
778:     nz  = ai[i+1] - ai[i];
779:     idt = bs*i;
780:     s1  = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
781:     for (j=0; j<nz; j++) {
782:       idx       = bs*vi[j];
783:       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
784:       t[idx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
785:       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
786:       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
787:       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
788:       v        += bs2;
789:     }
790:   }

792:   /* copy t into x according to permutation */
793:   for (i=0; i<n; i++) {
794:     ii      = bs*i;  ir = bs*r[i];
795:     x[ir]   = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
796:     x[ir+4] = t[ii+4];
797:   }

799:   ISRestoreIndices(isrow,&rout);
800:   ISRestoreIndices(iscol,&cout);
801:   VecRestoreArrayRead(bb,&b);
802:   VecRestoreArray(xx,&x);
803:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
804:   return(0);
805: }

807: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
808: {
809:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
810:   IS                iscol=a->col,isrow=a->row;
811:   PetscErrorCode    ierr;
812:   const PetscInt    *r,*c,*rout,*cout;
813:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
814:   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
815:   const MatScalar   *aa=a->a,*v;
816:   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
817:   const PetscScalar *b;

820:   VecGetArrayRead(bb,&b);
821:   VecGetArray(xx,&x);
822:   t    = a->solve_work;

824:   ISGetIndices(isrow,&rout); r = rout;
825:   ISGetIndices(iscol,&cout); c = cout;

827:   /* copy the b into temp work space according to permutation */
828:   ii = 0;
829:   for (i=0; i<n; i++) {
830:     ic      = 6*c[i];
831:     t[ii]   = b[ic];
832:     t[ii+1] = b[ic+1];
833:     t[ii+2] = b[ic+2];
834:     t[ii+3] = b[ic+3];
835:     t[ii+4] = b[ic+4];
836:     t[ii+5] = b[ic+5];
837:     ii     += 6;
838:   }

840:   /* forward solve the U^T */
841:   idx = 0;
842:   for (i=0; i<n; i++) {

844:     v = aa + 36*diag[i];
845:     /* multiply by the inverse of the block diagonal */
846:     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
847:     x6 = t[5+idx];
848:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
849:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
850:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
851:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
852:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
853:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
854:     v += 36;

856:     vi = aj + diag[i] + 1;
857:     nz = ai[i+1] - diag[i] - 1;
858:     while (nz--) {
859:       oidx       = 6*(*vi++);
860:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
861:       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
862:       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
863:       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
864:       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
865:       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
866:       v         += 36;
867:     }
868:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
869:     t[5+idx] = s6;
870:     idx     += 6;
871:   }
872:   /* backward solve the L^T */
873:   for (i=n-1; i>=0; i--) {
874:     v   = aa + 36*diag[i] - 36;
875:     vi  = aj + diag[i] - 1;
876:     nz  = diag[i] - ai[i];
877:     idt = 6*i;
878:     s1  = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
879:     s6  = t[5+idt];
880:     while (nz--) {
881:       idx       = 6*(*vi--);
882:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
883:       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
884:       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
885:       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
886:       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
887:       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
888:       v        -= 36;
889:     }
890:   }

892:   /* copy t into x according to permutation */
893:   ii = 0;
894:   for (i=0; i<n; i++) {
895:     ir      = 6*r[i];
896:     x[ir]   = t[ii];
897:     x[ir+1] = t[ii+1];
898:     x[ir+2] = t[ii+2];
899:     x[ir+3] = t[ii+3];
900:     x[ir+4] = t[ii+4];
901:     x[ir+5] = t[ii+5];
902:     ii     += 6;
903:   }

905:   ISRestoreIndices(isrow,&rout);
906:   ISRestoreIndices(iscol,&cout);
907:   VecRestoreArrayRead(bb,&b);
908:   VecRestoreArray(xx,&x);
909:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
910:   return(0);
911: }

913: PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
914: {
915:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
916:   PetscErrorCode    ierr;
917:   IS                iscol=a->col,isrow=a->row;
918:   const PetscInt    n    =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
919:   const PetscInt    *r,*c,*rout,*cout;
920:   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
921:   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
922:   const MatScalar   *aa=a->a,*v;
923:   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
924:   const PetscScalar *b;

927:   VecGetArrayRead(bb,&b);
928:   VecGetArray(xx,&x);
929:   t    = a->solve_work;

931:   ISGetIndices(isrow,&rout); r = rout;
932:   ISGetIndices(iscol,&cout); c = cout;

934:   /* copy b into temp work space according to permutation */
935:   for (i=0; i<n; i++) {
936:     ii      = bs*i; ic = bs*c[i];
937:     t[ii]   = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
938:     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];
939:   }

941:   /* forward solve the U^T */
942:   idx = 0;
943:   for (i=0; i<n; i++) {
944:     v = aa + bs2*diag[i];
945:     /* multiply by the inverse of the block diagonal */
946:     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
947:     x6 = t[5+idx];
948:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
949:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
950:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
951:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
952:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
953:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
954:     v -= bs2;

956:     vi = aj + diag[i] - 1;
957:     nz = diag[i] - diag[i+1] - 1;
958:     for (j=0; j>-nz; j--) {
959:       oidx       = bs*vi[j];
960:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
961:       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
962:       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
963:       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
964:       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
965:       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
966:       v         -= bs2;
967:     }
968:     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
969:     t[5+idx] = s6;
970:     idx     += bs;
971:   }
972:   /* backward solve the L^T */
973:   for (i=n-1; i>=0; i--) {
974:     v   = aa + bs2*ai[i];
975:     vi  = aj + ai[i];
976:     nz  = ai[i+1] - ai[i];
977:     idt = bs*i;
978:     s1  = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
979:     s6  = t[5+idt];
980:     for (j=0; j<nz; j++) {
981:       idx       = bs*vi[j];
982:       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
983:       t[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
984:       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
985:       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
986:       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
987:       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
988:       v        += bs2;
989:     }
990:   }

992:   /* copy t into x according to permutation */
993:   for (i=0; i<n; i++) {
994:     ii      = bs*i;  ir = bs*r[i];
995:     x[ir]   = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
996:     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];
997:   }

999:   ISRestoreIndices(isrow,&rout);
1000:   ISRestoreIndices(iscol,&cout);
1001:   VecRestoreArrayRead(bb,&b);
1002:   VecRestoreArray(xx,&x);
1003:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1004:   return(0);
1005: }

1007: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
1008: {
1009:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1010:   IS                iscol=a->col,isrow=a->row;
1011:   PetscErrorCode    ierr;
1012:   const PetscInt    *r,*c,*rout,*cout;
1013:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1014:   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1015:   const MatScalar   *aa=a->a,*v;
1016:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1017:   const PetscScalar *b;

1020:   VecGetArrayRead(bb,&b);
1021:   VecGetArray(xx,&x);
1022:   t    = a->solve_work;

1024:   ISGetIndices(isrow,&rout); r = rout;
1025:   ISGetIndices(iscol,&cout); c = cout;

1027:   /* copy the b into temp work space according to permutation */
1028:   ii = 0;
1029:   for (i=0; i<n; i++) {
1030:     ic      = 7*c[i];
1031:     t[ii]   = b[ic];
1032:     t[ii+1] = b[ic+1];
1033:     t[ii+2] = b[ic+2];
1034:     t[ii+3] = b[ic+3];
1035:     t[ii+4] = b[ic+4];
1036:     t[ii+5] = b[ic+5];
1037:     t[ii+6] = b[ic+6];
1038:     ii     += 7;
1039:   }

1041:   /* forward solve the U^T */
1042:   idx = 0;
1043:   for (i=0; i<n; i++) {

1045:     v = aa + 49*diag[i];
1046:     /* multiply by the inverse of the block diagonal */
1047:     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1048:     x6 = t[5+idx]; x7 = t[6+idx];
1049:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1050:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1051:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1052:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1053:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1054:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1055:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1056:     v += 49;

1058:     vi = aj + diag[i] + 1;
1059:     nz = ai[i+1] - diag[i] - 1;
1060:     while (nz--) {
1061:       oidx       = 7*(*vi++);
1062:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1063:       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1064:       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1065:       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1066:       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1067:       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1068:       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1069:       v         += 49;
1070:     }
1071:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1072:     t[5+idx] = s6;t[6+idx] = s7;
1073:     idx     += 7;
1074:   }
1075:   /* backward solve the L^T */
1076:   for (i=n-1; i>=0; i--) {
1077:     v   = aa + 49*diag[i] - 49;
1078:     vi  = aj + diag[i] - 1;
1079:     nz  = diag[i] - ai[i];
1080:     idt = 7*i;
1081:     s1  = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1082:     s6  = t[5+idt];s7 = t[6+idt];
1083:     while (nz--) {
1084:       idx       = 7*(*vi--);
1085:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1086:       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1087:       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1088:       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1089:       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1090:       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1091:       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1092:       v        -= 49;
1093:     }
1094:   }

1096:   /* copy t into x according to permutation */
1097:   ii = 0;
1098:   for (i=0; i<n; i++) {
1099:     ir      = 7*r[i];
1100:     x[ir]   = t[ii];
1101:     x[ir+1] = t[ii+1];
1102:     x[ir+2] = t[ii+2];
1103:     x[ir+3] = t[ii+3];
1104:     x[ir+4] = t[ii+4];
1105:     x[ir+5] = t[ii+5];
1106:     x[ir+6] = t[ii+6];
1107:     ii     += 7;
1108:   }

1110:   ISRestoreIndices(isrow,&rout);
1111:   ISRestoreIndices(iscol,&cout);
1112:   VecRestoreArrayRead(bb,&b);
1113:   VecRestoreArray(xx,&x);
1114:   PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
1115:   return(0);
1116: }
1117: PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1118: {
1119:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
1120:   PetscErrorCode    ierr;
1121:   IS                iscol=a->col,isrow=a->row;
1122:   const PetscInt    n    =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1123:   const PetscInt    *r,*c,*rout,*cout;
1124:   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1125:   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
1126:   const MatScalar   *aa=a->a,*v;
1127:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1128:   const PetscScalar *b;

1131:   VecGetArrayRead(bb,&b);
1132:   VecGetArray(xx,&x);
1133:   t    = a->solve_work;

1135:   ISGetIndices(isrow,&rout); r = rout;
1136:   ISGetIndices(iscol,&cout); c = cout;

1138:   /* copy b into temp work space according to permutation */
1139:   for (i=0; i<n; i++) {
1140:     ii      = bs*i; ic = bs*c[i];
1141:     t[ii]   = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1142:     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];  t[ii+6] = b[ic+6];
1143:   }

1145:   /* forward solve the U^T */
1146:   idx = 0;
1147:   for (i=0; i<n; i++) {
1148:     v = aa + bs2*diag[i];
1149:     /* multiply by the inverse of the block diagonal */
1150:     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1151:     x6 = t[5+idx]; x7 = t[6+idx];
1152:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1153:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1154:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1155:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1156:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1157:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1158:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1159:     v -= bs2;

1161:     vi = aj + diag[i] - 1;
1162:     nz = diag[i] - diag[i+1] - 1;
1163:     for (j=0; j>-nz; j--) {
1164:       oidx       = bs*vi[j];
1165:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1166:       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1167:       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1168:       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1169:       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1170:       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1171:       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1172:       v         -= bs2;
1173:     }
1174:     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1175:     t[5+idx] = s6;  t[6+idx] = s7;
1176:     idx     += bs;
1177:   }
1178:   /* backward solve the L^T */
1179:   for (i=n-1; i>=0; i--) {
1180:     v   = aa + bs2*ai[i];
1181:     vi  = aj + ai[i];
1182:     nz  = ai[i+1] - ai[i];
1183:     idt = bs*i;
1184:     s1  = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1185:     s6  = t[5+idt];  s7 = t[6+idt];
1186:     for (j=0; j<nz; j++) {
1187:       idx       = bs*vi[j];
1188:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1189:       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1190:       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1191:       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1192:       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1193:       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1194:       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1195:       v        += bs2;
1196:     }
1197:   }

1199:   /* copy t into x according to permutation */
1200:   for (i=0; i<n; i++) {
1201:     ii      = bs*i;  ir = bs*r[i];
1202:     x[ir]   = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1203:     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];  x[ir+6] = t[ii+6];
1204:   }

1206:   ISRestoreIndices(isrow,&rout);
1207:   ISRestoreIndices(iscol,&cout);
1208:   VecRestoreArrayRead(bb,&b);
1209:   VecRestoreArray(xx,&x);
1210:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1211:   return(0);
1212: }

1214: /* ----------------------------------------------------------- */
1215: PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
1216: {
1217:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1218:   IS                iscol=a->col,isrow=a->row;
1219:   PetscErrorCode    ierr;
1220:   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
1221:   PetscInt          i,nz,j;
1222:   const PetscInt    n  =a->mbs,bs=A->rmap->bs,bs2=a->bs2;
1223:   const MatScalar   *aa=a->a,*v;
1224:   PetscScalar       *x,*t,*ls;
1225:   const PetscScalar *b;

1228:   VecGetArrayRead(bb,&b);
1229:   VecGetArray(xx,&x);
1230:   t    = a->solve_work;

1232:   ISGetIndices(isrow,&rout); r = rout;
1233:   ISGetIndices(iscol,&cout); c = cout;

1235:   /* copy the b into temp work space according to permutation */
1236:   for (i=0; i<n; i++) {
1237:     for (j=0; j<bs; j++) {
1238:       t[i*bs+j] = b[c[i]*bs+j];
1239:     }
1240:   }


1243:   /* forward solve the upper triangular transpose */
1244:   ls = a->solve_work + A->cmap->n;
1245:   for (i=0; i<n; i++) {
1246:     PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
1247:     PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1248:     v  = aa + bs2*(a->diag[i] + 1);
1249:     vi = aj + a->diag[i] + 1;
1250:     nz = ai[i+1] - a->diag[i] - 1;
1251:     while (nz--) {
1252:       PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
1253:       v += bs2;
1254:     }
1255:   }

1257:   /* backward solve the lower triangular transpose */
1258:   for (i=n-1; i>=0; i--) {
1259:     v  = aa + bs2*ai[i];
1260:     vi = aj + ai[i];
1261:     nz = a->diag[i] - ai[i];
1262:     while (nz--) {
1263:       PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
1264:       v += bs2;
1265:     }
1266:   }

1268:   /* copy t into x according to permutation */
1269:   for (i=0; i<n; i++) {
1270:     for (j=0; j<bs; j++) {
1271:       x[bs*r[i]+j]   = t[bs*i+j];
1272:     }
1273:   }

1275:   ISRestoreIndices(isrow,&rout);
1276:   ISRestoreIndices(iscol,&cout);
1277:   VecRestoreArrayRead(bb,&b);
1278:   VecRestoreArray(xx,&x);
1279:   PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
1280:   return(0);
1281: }

1283: PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1284: {
1285:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1286:   IS                iscol=a->col,isrow=a->row;
1287:   PetscErrorCode    ierr;
1288:   const PetscInt    *r,*c,*rout,*cout;
1289:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag;
1290:   PetscInt          i,j,nz;
1291:   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
1292:   const MatScalar   *aa=a->a,*v;
1293:   PetscScalar       *x,*t,*ls;
1294:   const PetscScalar *b;

1297:   VecGetArrayRead(bb,&b);
1298:   VecGetArray(xx,&x);
1299:   t    = a->solve_work;

1301:   ISGetIndices(isrow,&rout); r = rout;
1302:   ISGetIndices(iscol,&cout); c = cout;

1304:   /* copy the b into temp work space according to permutation */
1305:   for (i=0; i<n; i++) {
1306:     for (j=0; j<bs; j++) {
1307:       t[i*bs+j] = b[c[i]*bs+j];
1308:     }
1309:   }


1312:   /* forward solve the upper triangular transpose */
1313:   ls = a->solve_work + A->cmap->n;
1314:   for (i=0; i<n; i++) {
1315:     PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
1316:     PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs);
1317:     v  = aa + bs2*(diag[i] - 1);
1318:     vi = aj + diag[i] - 1;
1319:     nz = diag[i] - diag[i+1] - 1;
1320:     for (j=0; j>-nz; j--) {
1321:       PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
1322:       v -= bs2;
1323:     }
1324:   }

1326:   /* backward solve the lower triangular transpose */
1327:   for (i=n-1; i>=0; i--) {
1328:     v  = aa + bs2*ai[i];
1329:     vi = aj + ai[i];
1330:     nz = ai[i+1] - ai[i];
1331:     for (j=0; j<nz; j++) {
1332:       PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
1333:       v += bs2;
1334:     }
1335:   }

1337:   /* copy t into x according to permutation */
1338:   for (i=0; i<n; i++) {
1339:     for (j=0; j<bs; j++) {
1340:       x[bs*r[i]+j]   = t[bs*i+j];
1341:     }
1342:   }

1344:   ISRestoreIndices(isrow,&rout);
1345:   ISRestoreIndices(iscol,&cout);
1346:   VecRestoreArrayRead(bb,&b);
1347:   VecRestoreArray(xx,&x);
1348:   PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
1349:   return(0);
1350: }