Actual source code: baijfact2.c

petsc-3.4.0 2013-05-13
  2: /*
  3:     Factorization code for BAIJ format.
  4: */

  6: #include <../src/mat/impls/baij/seq/baij.h>
  7: #include <petsc-private/kernels/blockinvert.h>
  8: #include <petscbt.h>
  9: #include <../src/mat/utils/freespace.h>

 13: PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
 14: {
 15:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
 16:   PetscErrorCode    ierr;
 17:   const PetscInt    *adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
 18:   PetscInt          i,n = a->mbs,j;
 19:   PetscInt          nz;
 20:   PetscScalar       *x,*tmp,s1;
 21:   const MatScalar   *aa = a->a,*v;
 22:   const PetscScalar *b;

 25:   VecGetArrayRead(bb,&b);
 26:   VecGetArray(xx,&x);
 27:   tmp  = a->solve_work;


 30:   /* copy the b into temp work space according to permutation */
 31:   for (i=0; i<n; i++) tmp[i] = b[i];

 33:   /* forward solve the U^T */
 34:   for (i=0; i<n; i++) {
 35:     v   = aa + adiag[i+1] + 1;
 36:     vi  = aj + adiag[i+1] + 1;
 37:     nz  = adiag[i] - adiag[i+1] - 1;
 38:     s1  = tmp[i];
 39:     s1 *= v[nz];  /* multiply by inverse of diagonal entry */
 40:     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
 41:     tmp[i] = s1;
 42:   }

 44:   /* backward solve the L^T */
 45:   for (i=n-1; i>=0; i--) {
 46:     v  = aa + ai[i];
 47:     vi = aj + ai[i];
 48:     nz = ai[i+1] - ai[i];
 49:     s1 = tmp[i];
 50:     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
 51:   }

 53:   /* copy tmp into x according to permutation */
 54:   for (i=0; i<n; i++) x[i] = tmp[i];

 56:   VecRestoreArrayRead(bb,&b);
 57:   VecRestoreArray(xx,&x);

 59:   PetscLogFlops(2.0*a->nz-A->cmap->n);
 60:   return(0);
 61: }

 65: PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
 66: {
 67:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
 68:   PetscErrorCode  ierr;
 69:   PetscInt        i,nz;
 70:   const PetscInt  *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
 71:   const MatScalar *aa   =a->a,*v;
 72:   PetscScalar     s1,*x;

 75:   VecCopy(bb,xx);
 76:   VecGetArray(xx,&x);

 78:   /* forward solve the U^T */
 79:   for (i=0; i<n; i++) {

 81:     v = aa + diag[i];
 82:     /* multiply by the inverse of the block diagonal */
 83:     s1 = (*v++)*x[i];
 84:     vi = aj + diag[i] + 1;
 85:     nz = ai[i+1] - diag[i] - 1;
 86:     while (nz--) {
 87:       x[*vi++] -= (*v++)*s1;
 88:     }
 89:     x[i] = s1;
 90:   }
 91:   /* backward solve the L^T */
 92:   for (i=n-1; i>=0; i--) {
 93:     v  = aa + diag[i] - 1;
 94:     vi = aj + diag[i] - 1;
 95:     nz = diag[i] - ai[i];
 96:     s1 = x[i];
 97:     while (nz--) {
 98:       x[*vi--] -=  (*v--)*s1;
 99:     }
100:   }
101:   VecRestoreArray(xx,&x);
102:   PetscLogFlops(2.0*(a->nz) - A->cmap->n);
103:   return(0);
104: }

108: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
109: {
110:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
111:   PetscErrorCode  ierr;
112:   PetscInt        i,nz,idx,idt,oidx;
113:   const PetscInt  *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
114:   const MatScalar *aa   =a->a,*v;
115:   PetscScalar     s1,s2,x1,x2,*x;

118:   VecCopy(bb,xx);
119:   VecGetArray(xx,&x);

121:   /* forward solve the U^T */
122:   idx = 0;
123:   for (i=0; i<n; i++) {

125:     v = aa + 4*diag[i];
126:     /* multiply by the inverse of the block diagonal */
127:     x1 = x[idx];   x2 = x[1+idx];
128:     s1 = v[0]*x1  +  v[1]*x2;
129:     s2 = v[2]*x1  +  v[3]*x2;
130:     v += 4;

132:     vi = aj + diag[i] + 1;
133:     nz = ai[i+1] - diag[i] - 1;
134:     while (nz--) {
135:       oidx       = 2*(*vi++);
136:       x[oidx]   -= v[0]*s1  +  v[1]*s2;
137:       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
138:       v         += 4;
139:     }
140:     x[idx] = s1;x[1+idx] = s2;
141:     idx   += 2;
142:   }
143:   /* backward solve the L^T */
144:   for (i=n-1; i>=0; i--) {
145:     v   = aa + 4*diag[i] - 4;
146:     vi  = aj + diag[i] - 1;
147:     nz  = diag[i] - ai[i];
148:     idt = 2*i;
149:     s1  = x[idt];  s2 = x[1+idt];
150:     while (nz--) {
151:       idx       = 2*(*vi--);
152:       x[idx]   -=  v[0]*s1 +  v[1]*s2;
153:       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
154:       v        -= 4;
155:     }
156:   }
157:   VecRestoreArray(xx,&x);
158:   PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);
159:   return(0);
160: }

164: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
165: {
166:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
167:   PetscErrorCode  ierr;
168:   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
169:   PetscInt        nz,idx,idt,j,i,oidx;
170:   const PetscInt  bs =A->rmap->bs,bs2=a->bs2;
171:   const MatScalar *aa=a->a,*v;
172:   PetscScalar     s1,s2,x1,x2,*x;

175:   VecCopy(bb,xx);
176:   VecGetArray(xx,&x);

178:   /* forward solve the U^T */
179:   idx = 0;
180:   for (i=0; i<n; i++) {
181:     v = aa + bs2*diag[i];
182:     /* multiply by the inverse of the block diagonal */
183:     x1 = x[idx];   x2 = x[1+idx];
184:     s1 = v[0]*x1  +  v[1]*x2;
185:     s2 = v[2]*x1  +  v[3]*x2;
186:     v -= bs2;

188:     vi = aj + diag[i] - 1;
189:     nz = diag[i] - diag[i+1] - 1;
190:     for (j=0; j>-nz; j--) {
191:       oidx       = bs*vi[j];
192:       x[oidx]   -= v[0]*s1  +  v[1]*s2;
193:       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
194:       v         -= bs2;
195:     }
196:     x[idx] = s1;x[1+idx] = s2;
197:     idx   += bs;
198:   }
199:   /* backward solve the L^T */
200:   for (i=n-1; i>=0; i--) {
201:     v   = aa + bs2*ai[i];
202:     vi  = aj + ai[i];
203:     nz  = ai[i+1] - ai[i];
204:     idt = bs*i;
205:     s1  = x[idt];  s2 = x[1+idt];
206:     for (j=0; j<nz; j++) {
207:       idx       = bs*vi[j];
208:       x[idx]   -=  v[0]*s1 +  v[1]*s2;
209:       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
210:       v        += bs2;
211:     }
212:   }
213:   VecRestoreArray(xx,&x);
214:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
215:   return(0);
216: }

220: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
221: {
222:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
223:   PetscErrorCode  ierr;
224:   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
225:   PetscInt        i,nz,idx,idt,oidx;
226:   const MatScalar *aa=a->a,*v;
227:   PetscScalar     s1,s2,s3,x1,x2,x3,*x;

230:   VecCopy(bb,xx);
231:   VecGetArray(xx,&x);

233:   /* forward solve the U^T */
234:   idx = 0;
235:   for (i=0; i<n; i++) {

237:     v = aa + 9*diag[i];
238:     /* multiply by the inverse of the block diagonal */
239:     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
240:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
241:     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
242:     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
243:     v += 9;

245:     vi = aj + diag[i] + 1;
246:     nz = ai[i+1] - diag[i] - 1;
247:     while (nz--) {
248:       oidx       = 3*(*vi++);
249:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
250:       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
251:       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
252:       v         += 9;
253:     }
254:     x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;
255:     idx   += 3;
256:   }
257:   /* backward solve the L^T */
258:   for (i=n-1; i>=0; i--) {
259:     v   = aa + 9*diag[i] - 9;
260:     vi  = aj + diag[i] - 1;
261:     nz  = diag[i] - ai[i];
262:     idt = 3*i;
263:     s1  = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
264:     while (nz--) {
265:       idx       = 3*(*vi--);
266:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
267:       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
268:       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
269:       v        -= 9;
270:     }
271:   }
272:   VecRestoreArray(xx,&x);
273:   PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);
274:   return(0);
275: }

279: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
280: {
281:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
282:   PetscErrorCode  ierr;
283:   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
284:   PetscInt        nz,idx,idt,j,i,oidx;
285:   const PetscInt  bs =A->rmap->bs,bs2=a->bs2;
286:   const MatScalar *aa=a->a,*v;
287:   PetscScalar     s1,s2,s3,x1,x2,x3,*x;

290:   VecCopy(bb,xx);
291:   VecGetArray(xx,&x);

293:   /* forward solve the U^T */
294:   idx = 0;
295:   for (i=0; i<n; i++) {
296:     v = aa + bs2*diag[i];
297:     /* multiply by the inverse of the block diagonal */
298:     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];
299:     s1 = v[0]*x1  +  v[1]*x2  + v[2]*x3;
300:     s2 = v[3]*x1  +  v[4]*x2  + v[5]*x3;
301:     s3 = v[6]*x1  +  v[7]*x2  + v[8]*x3;
302:     v -= bs2;

304:     vi = aj + diag[i] - 1;
305:     nz = diag[i] - diag[i+1] - 1;
306:     for (j=0; j>-nz; j--) {
307:       oidx       = bs*vi[j];
308:       x[oidx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
309:       x[oidx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
310:       x[oidx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
311:       v         -= bs2;
312:     }
313:     x[idx] = s1;x[1+idx] = s2;  x[2+idx] = s3;
314:     idx   += bs;
315:   }
316:   /* backward solve the L^T */
317:   for (i=n-1; i>=0; i--) {
318:     v   = aa + bs2*ai[i];
319:     vi  = aj + ai[i];
320:     nz  = ai[i+1] - ai[i];
321:     idt = bs*i;
322:     s1  = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];
323:     for (j=0; j<nz; j++) {
324:       idx       = bs*vi[j];
325:       x[idx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
326:       x[idx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
327:       x[idx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
328:       v        += bs2;
329:     }
330:   }
331:   VecRestoreArray(xx,&x);
332:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
333:   return(0);
334: }

338: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
339: {
340:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
341:   PetscErrorCode  ierr;
342:   const PetscInt  *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
343:   PetscInt        i,nz,idx,idt,oidx;
344:   const MatScalar *aa=a->a,*v;
345:   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4,*x;

348:   VecCopy(bb,xx);
349:   VecGetArray(xx,&x);

351:   /* forward solve the U^T */
352:   idx = 0;
353:   for (i=0; i<n; i++) {

355:     v = aa + 16*diag[i];
356:     /* multiply by the inverse of the block diagonal */
357:     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
358:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
359:     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
360:     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
361:     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
362:     v += 16;

364:     vi = aj + diag[i] + 1;
365:     nz = ai[i+1] - diag[i] - 1;
366:     while (nz--) {
367:       oidx       = 4*(*vi++);
368:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
369:       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
370:       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
371:       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
372:       v         += 16;
373:     }
374:     x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
375:     idx   += 4;
376:   }
377:   /* backward solve the L^T */
378:   for (i=n-1; i>=0; i--) {
379:     v   = aa + 16*diag[i] - 16;
380:     vi  = aj + diag[i] - 1;
381:     nz  = diag[i] - ai[i];
382:     idt = 4*i;
383:     s1  = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
384:     while (nz--) {
385:       idx       = 4*(*vi--);
386:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
387:       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
388:       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
389:       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
390:       v        -= 16;
391:     }
392:   }
393:   VecRestoreArray(xx,&x);
394:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
395:   return(0);
396: }

400: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
401: {
402:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
403:   PetscErrorCode  ierr;
404:   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
405:   PetscInt        nz,idx,idt,j,i,oidx;
406:   const PetscInt  bs =A->rmap->bs,bs2=a->bs2;
407:   const MatScalar *aa=a->a,*v;
408:   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4,*x;

411:   VecCopy(bb,xx);
412:   VecGetArray(xx,&x);

414:   /* forward solve the U^T */
415:   idx = 0;
416:   for (i=0; i<n; i++) {
417:     v = aa + bs2*diag[i];
418:     /* multiply by the inverse of the block diagonal */
419:     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
420:     s1 =  v[0]*x1  +  v[1]*x2  + v[2]*x3  + v[3]*x4;
421:     s2 =  v[4]*x1  +  v[5]*x2  + v[6]*x3  + v[7]*x4;
422:     s3 =  v[8]*x1  +  v[9]*x2  + v[10]*x3 + v[11]*x4;
423:     s4 =  v[12]*x1 +  v[13]*x2 + v[14]*x3 + v[15]*x4;
424:     v -= bs2;

426:     vi = aj + diag[i] - 1;
427:     nz = diag[i] - diag[i+1] - 1;
428:     for (j=0; j>-nz; j--) {
429:       oidx       = bs*vi[j];
430:       x[oidx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
431:       x[oidx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
432:       x[oidx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
433:       x[oidx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
434:       v         -= bs2;
435:     }
436:     x[idx] = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4;
437:     idx   += bs;
438:   }
439:   /* backward solve the L^T */
440:   for (i=n-1; i>=0; i--) {
441:     v   = aa + bs2*ai[i];
442:     vi  = aj + ai[i];
443:     nz  = ai[i+1] - ai[i];
444:     idt = bs*i;
445:     s1  = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];
446:     for (j=0; j<nz; j++) {
447:       idx       = bs*vi[j];
448:       x[idx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
449:       x[idx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
450:       x[idx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
451:       x[idx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
452:       v        += bs2;
453:     }
454:   }
455:   VecRestoreArray(xx,&x);
456:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
457:   return(0);
458: }

462: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
463: {
464:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
465:   PetscErrorCode  ierr;
466:   const PetscInt  *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
467:   PetscInt        i,nz,idx,idt,oidx;
468:   const MatScalar *aa=a->a,*v;
469:   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;

472:   VecCopy(bb,xx);
473:   VecGetArray(xx,&x);

475:   /* forward solve the U^T */
476:   idx = 0;
477:   for (i=0; i<n; i++) {

479:     v = aa + 25*diag[i];
480:     /* multiply by the inverse of the block diagonal */
481:     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
482:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
483:     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
484:     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
485:     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
486:     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
487:     v += 25;

489:     vi = aj + diag[i] + 1;
490:     nz = ai[i+1] - diag[i] - 1;
491:     while (nz--) {
492:       oidx       = 5*(*vi++);
493:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
494:       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
495:       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
496:       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
497:       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
498:       v         += 25;
499:     }
500:     x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
501:     idx   += 5;
502:   }
503:   /* backward solve the L^T */
504:   for (i=n-1; i>=0; i--) {
505:     v   = aa + 25*diag[i] - 25;
506:     vi  = aj + diag[i] - 1;
507:     nz  = diag[i] - ai[i];
508:     idt = 5*i;
509:     s1  = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
510:     while (nz--) {
511:       idx       = 5*(*vi--);
512:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
513:       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
514:       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
515:       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
516:       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
517:       v        -= 25;
518:     }
519:   }
520:   VecRestoreArray(xx,&x);
521:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
522:   return(0);
523: }

527: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
528: {
529:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
530:   PetscErrorCode  ierr;
531:   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
532:   PetscInt        nz,idx,idt,j,i,oidx;
533:   const PetscInt  bs =A->rmap->bs,bs2=a->bs2;
534:   const MatScalar *aa=a->a,*v;
535:   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;

538:   VecCopy(bb,xx);
539:   VecGetArray(xx,&x);

541:   /* forward solve the U^T */
542:   idx = 0;
543:   for (i=0; i<n; i++) {
544:     v = aa + bs2*diag[i];
545:     /* multiply by the inverse of the block diagonal */
546:     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
547:     x5 = x[4+idx];
548:     s1 =  v[0]*x1   +  v[1]*x2   + v[2]*x3   + v[3]*x4   + v[4]*x5;
549:     s2 =  v[5]*x1   +  v[6]*x2   + v[7]*x3   + v[8]*x4   + v[9]*x5;
550:     s3 =  v[10]*x1  +  v[11]*x2  + v[12]*x3  + v[13]*x4  + v[14]*x5;
551:     s4 =  v[15]*x1  +  v[16]*x2  + v[17]*x3  + v[18]*x4  + v[19]*x5;
552:     s5 =  v[20]*x1  +  v[21]*x2  + v[22]*x3  + v[23]*x4   + v[24]*x5;
553:     v -= bs2;

555:     vi = aj + diag[i] - 1;
556:     nz = diag[i] - diag[i+1] - 1;
557:     for (j=0; j>-nz; j--) {
558:       oidx       = bs*vi[j];
559:       x[oidx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
560:       x[oidx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
561:       x[oidx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
562:       x[oidx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
563:       x[oidx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
564:       v         -= bs2;
565:     }
566:     x[idx] = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
567:     idx   += bs;
568:   }
569:   /* backward solve the L^T */
570:   for (i=n-1; i>=0; i--) {
571:     v   = aa + bs2*ai[i];
572:     vi  = aj + ai[i];
573:     nz  = ai[i+1] - ai[i];
574:     idt = bs*i;
575:     s1  = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
576:     for (j=0; j<nz; j++) {
577:       idx       = bs*vi[j];
578:       x[idx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
579:       x[idx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
580:       x[idx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
581:       x[idx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
582:       x[idx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
583:       v        += bs2;
584:     }
585:   }
586:   VecRestoreArray(xx,&x);
587:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
588:   return(0);
589: }

593: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
594: {
595:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
596:   PetscErrorCode  ierr;
597:   const PetscInt  *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
598:   PetscInt        i,nz,idx,idt,oidx;
599:   const MatScalar *aa=a->a,*v;
600:   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;

603:   VecCopy(bb,xx);
604:   VecGetArray(xx,&x);

606:   /* forward solve the U^T */
607:   idx = 0;
608:   for (i=0; i<n; i++) {

610:     v = aa + 36*diag[i];
611:     /* multiply by the inverse of the block diagonal */
612:     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
613:     x6 = x[5+idx];
614:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
615:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
616:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
617:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
618:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
619:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
620:     v += 36;

622:     vi = aj + diag[i] + 1;
623:     nz = ai[i+1] - diag[i] - 1;
624:     while (nz--) {
625:       oidx       = 6*(*vi++);
626:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
627:       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
628:       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
629:       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
630:       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
631:       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
632:       v         += 36;
633:     }
634:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
635:     x[5+idx] = s6;
636:     idx     += 6;
637:   }
638:   /* backward solve the L^T */
639:   for (i=n-1; i>=0; i--) {
640:     v   = aa + 36*diag[i] - 36;
641:     vi  = aj + diag[i] - 1;
642:     nz  = diag[i] - ai[i];
643:     idt = 6*i;
644:     s1  = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
645:     s6  = x[5+idt];
646:     while (nz--) {
647:       idx       = 6*(*vi--);
648:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
649:       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
650:       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
651:       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
652:       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
653:       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
654:       v        -= 36;
655:     }
656:   }
657:   VecRestoreArray(xx,&x);
658:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
659:   return(0);
660: }

664: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
665: {
666:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
667:   PetscErrorCode  ierr;
668:   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
669:   PetscInt        nz,idx,idt,j,i,oidx;
670:   const PetscInt  bs =A->rmap->bs,bs2=a->bs2;
671:   const MatScalar *aa=a->a,*v;
672:   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;

675:   VecCopy(bb,xx);
676:   VecGetArray(xx,&x);

678:   /* forward solve the U^T */
679:   idx = 0;
680:   for (i=0; i<n; i++) {
681:     v = aa + bs2*diag[i];
682:     /* multiply by the inverse of the block diagonal */
683:     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
684:     x5 = x[4+idx]; x6 = x[5+idx];
685:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
686:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
687:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
688:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
689:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
690:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
691:     v -= bs2;

693:     vi = aj + diag[i] - 1;
694:     nz = diag[i] - diag[i+1] - 1;
695:     for (j=0; j>-nz; j--) {
696:       oidx       = bs*vi[j];
697:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
698:       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
699:       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
700:       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
701:       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
702:       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
703:       v         -= bs2;
704:     }
705:     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
706:     x[5+idx] = s6;
707:     idx     += bs;
708:   }
709:   /* backward solve the L^T */
710:   for (i=n-1; i>=0; i--) {
711:     v   = aa + bs2*ai[i];
712:     vi  = aj + ai[i];
713:     nz  = ai[i+1] - ai[i];
714:     idt = bs*i;
715:     s1  = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
716:     s6  = x[5+idt];
717:     for (j=0; j<nz; j++) {
718:       idx       = bs*vi[j];
719:       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
720:       x[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
721:       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
722:       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
723:       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
724:       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
725:       v        += bs2;
726:     }
727:   }
728:   VecRestoreArray(xx,&x);
729:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
730:   return(0);
731: }

735: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
736: {
737:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
738:   PetscErrorCode  ierr;
739:   const PetscInt  *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
740:   PetscInt        i,nz,idx,idt,oidx;
741:   const MatScalar *aa=a->a,*v;
742:   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;

745:   VecCopy(bb,xx);
746:   VecGetArray(xx,&x);

748:   /* forward solve the U^T */
749:   idx = 0;
750:   for (i=0; i<n; i++) {

752:     v = aa + 49*diag[i];
753:     /* multiply by the inverse of the block diagonal */
754:     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
755:     x6 = x[5+idx]; x7 = x[6+idx];
756:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
757:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
758:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
759:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
760:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
761:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
762:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
763:     v += 49;

765:     vi = aj + diag[i] + 1;
766:     nz = ai[i+1] - diag[i] - 1;
767:     while (nz--) {
768:       oidx       = 7*(*vi++);
769:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
770:       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
771:       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
772:       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
773:       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
774:       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
775:       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
776:       v         += 49;
777:     }
778:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
779:     x[5+idx] = s6;x[6+idx] = s7;
780:     idx     += 7;
781:   }
782:   /* backward solve the L^T */
783:   for (i=n-1; i>=0; i--) {
784:     v   = aa + 49*diag[i] - 49;
785:     vi  = aj + diag[i] - 1;
786:     nz  = diag[i] - ai[i];
787:     idt = 7*i;
788:     s1  = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
789:     s6  = x[5+idt];s7 = x[6+idt];
790:     while (nz--) {
791:       idx       = 7*(*vi--);
792:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
793:       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
794:       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
795:       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
796:       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
797:       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
798:       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
799:       v        -= 49;
800:     }
801:   }
802:   VecRestoreArray(xx,&x);
803:   PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
804:   return(0);
805: }
808: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
809: {
810:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
811:   PetscErrorCode  ierr;
812:   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
813:   PetscInt        nz,idx,idt,j,i,oidx;
814:   const PetscInt  bs =A->rmap->bs,bs2=a->bs2;
815:   const MatScalar *aa=a->a,*v;
816:   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;

819:   VecCopy(bb,xx);
820:   VecGetArray(xx,&x);

822:   /* forward solve the U^T */
823:   idx = 0;
824:   for (i=0; i<n; i++) {
825:     v = aa + bs2*diag[i];
826:     /* multiply by the inverse of the block diagonal */
827:     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
828:     x5 = x[4+idx]; x6 = x[5+idx];  x7 = x[6+idx];
829:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
830:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
831:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
832:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
833:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
834:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
835:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
836:     v -= bs2;
837:     vi = aj + diag[i] - 1;
838:     nz = diag[i] - diag[i+1] - 1;
839:     for (j=0; j>-nz; j--) {
840:       oidx       = bs*vi[j];
841:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
842:       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
843:       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
844:       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
845:       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
846:       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
847:       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
848:       v         -= bs2;
849:     }
850:     x[idx]   = s1;  x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
851:     x[5+idx] = s6;  x[6+idx] = s7;
852:     idx     += bs;
853:   }
854:   /* backward solve the L^T */
855:   for (i=n-1; i>=0; i--) {
856:     v   = aa + bs2*ai[i];
857:     vi  = aj + ai[i];
858:     nz  = ai[i+1] - ai[i];
859:     idt = bs*i;
860:     s1  = x[idt];    s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
861:     s6  = x[5+idt];  s7 = x[6+idt];
862:     for (j=0; j<nz; j++) {
863:       idx       = bs*vi[j];
864:       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
865:       x[idx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
866:       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
867:       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
868:       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
869:       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
870:       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
871:       v        += bs2;
872:     }
873:   }
874:   VecRestoreArray(xx,&x);
875:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
876:   return(0);
877: }

879: /*---------------------------------------------------------------------------------------------*/
882: PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
883: {
884:   Mat_SeqBAIJ       *a    = (Mat_SeqBAIJ*)A->data;
885:   IS                iscol = a->col,isrow = a->row;
886:   PetscErrorCode    ierr;
887:   const PetscInt    *rout,*cout,*r,*c,*adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
888:   PetscInt          i,n = a->mbs,j;
889:   PetscInt          nz;
890:   PetscScalar       *x,*tmp,s1;
891:   const MatScalar   *aa = a->a,*v;
892:   const PetscScalar *b;

895:   VecGetArrayRead(bb,&b);
896:   VecGetArray(xx,&x);
897:   tmp  = a->solve_work;

899:   ISGetIndices(isrow,&rout); r = rout;
900:   ISGetIndices(iscol,&cout); c = cout;

902:   /* copy the b into temp work space according to permutation */
903:   for (i=0; i<n; i++) tmp[i] = b[c[i]];

905:   /* forward solve the U^T */
906:   for (i=0; i<n; i++) {
907:     v   = aa + adiag[i+1] + 1;
908:     vi  = aj + adiag[i+1] + 1;
909:     nz  = adiag[i] - adiag[i+1] - 1;
910:     s1  = tmp[i];
911:     s1 *= v[nz];  /* multiply by inverse of diagonal entry */
912:     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
913:     tmp[i] = s1;
914:   }

916:   /* backward solve the L^T */
917:   for (i=n-1; i>=0; i--) {
918:     v  = aa + ai[i];
919:     vi = aj + ai[i];
920:     nz = ai[i+1] - ai[i];
921:     s1 = tmp[i];
922:     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
923:   }

925:   /* copy tmp into x according to permutation */
926:   for (i=0; i<n; i++) x[r[i]] = tmp[i];

928:   ISRestoreIndices(isrow,&rout);
929:   ISRestoreIndices(iscol,&cout);
930:   VecRestoreArrayRead(bb,&b);
931:   VecRestoreArray(xx,&x);

933:   PetscLogFlops(2.0*a->nz-A->cmap->n);
934:   return(0);
935: }

939: PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
940: {
941:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
942:   IS                iscol=a->col,isrow=a->row;
943:   PetscErrorCode    ierr;
944:   const PetscInt    *r,*c,*rout,*cout;
945:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
946:   PetscInt          i,nz;
947:   const MatScalar   *aa=a->a,*v;
948:   PetscScalar       s1,*x,*t;
949:   const PetscScalar *b;

952:   VecGetArrayRead(bb,&b);
953:   VecGetArray(xx,&x);
954:   t    = a->solve_work;

956:   ISGetIndices(isrow,&rout); r = rout;
957:   ISGetIndices(iscol,&cout); c = cout;

959:   /* copy the b into temp work space according to permutation */
960:   for (i=0; i<n; i++) t[i] = b[c[i]];

962:   /* forward solve the U^T */
963:   for (i=0; i<n; i++) {

965:     v = aa + diag[i];
966:     /* multiply by the inverse of the block diagonal */
967:     s1 = (*v++)*t[i];
968:     vi = aj + diag[i] + 1;
969:     nz = ai[i+1] - diag[i] - 1;
970:     while (nz--) {
971:       t[*vi++] -= (*v++)*s1;
972:     }
973:     t[i] = s1;
974:   }
975:   /* backward solve the L^T */
976:   for (i=n-1; i>=0; i--) {
977:     v  = aa + diag[i] - 1;
978:     vi = aj + diag[i] - 1;
979:     nz = diag[i] - ai[i];
980:     s1 = t[i];
981:     while (nz--) {
982:       t[*vi--] -=  (*v--)*s1;
983:     }
984:   }

986:   /* copy t into x according to permutation */
987:   for (i=0; i<n; i++) x[r[i]] = t[i];

989:   ISRestoreIndices(isrow,&rout);
990:   ISRestoreIndices(iscol,&cout);
991:   VecRestoreArrayRead(bb,&b);
992:   VecRestoreArray(xx,&x);
993:   PetscLogFlops(2.0*(a->nz) - A->cmap->n);
994:   return(0);
995: }

999: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1000: {
1001:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1002:   IS                iscol=a->col,isrow=a->row;
1003:   PetscErrorCode    ierr;
1004:   const PetscInt    *r,*c,*rout,*cout;
1005:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1006:   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1007:   const MatScalar   *aa=a->a,*v;
1008:   PetscScalar       s1,s2,x1,x2,*x,*t;
1009:   const PetscScalar *b;

1012:   VecGetArrayRead(bb,&b);
1013:   VecGetArray(xx,&x);
1014:   t    = a->solve_work;

1016:   ISGetIndices(isrow,&rout); r = rout;
1017:   ISGetIndices(iscol,&cout); c = cout;

1019:   /* copy the b into temp work space according to permutation */
1020:   ii = 0;
1021:   for (i=0; i<n; i++) {
1022:     ic      = 2*c[i];
1023:     t[ii]   = b[ic];
1024:     t[ii+1] = b[ic+1];
1025:     ii     += 2;
1026:   }

1028:   /* forward solve the U^T */
1029:   idx = 0;
1030:   for (i=0; i<n; i++) {

1032:     v = aa + 4*diag[i];
1033:     /* multiply by the inverse of the block diagonal */
1034:     x1 = t[idx];   x2 = t[1+idx];
1035:     s1 = v[0]*x1  +  v[1]*x2;
1036:     s2 = v[2]*x1  +  v[3]*x2;
1037:     v += 4;

1039:     vi = aj + diag[i] + 1;
1040:     nz = ai[i+1] - diag[i] - 1;
1041:     while (nz--) {
1042:       oidx       = 2*(*vi++);
1043:       t[oidx]   -= v[0]*s1  +  v[1]*s2;
1044:       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
1045:       v         += 4;
1046:     }
1047:     t[idx] = s1;t[1+idx] = s2;
1048:     idx   += 2;
1049:   }
1050:   /* backward solve the L^T */
1051:   for (i=n-1; i>=0; i--) {
1052:     v   = aa + 4*diag[i] - 4;
1053:     vi  = aj + diag[i] - 1;
1054:     nz  = diag[i] - ai[i];
1055:     idt = 2*i;
1056:     s1  = t[idt];  s2 = t[1+idt];
1057:     while (nz--) {
1058:       idx       = 2*(*vi--);
1059:       t[idx]   -=  v[0]*s1 +  v[1]*s2;
1060:       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
1061:       v        -= 4;
1062:     }
1063:   }

1065:   /* copy t into x according to permutation */
1066:   ii = 0;
1067:   for (i=0; i<n; i++) {
1068:     ir      = 2*r[i];
1069:     x[ir]   = t[ii];
1070:     x[ir+1] = t[ii+1];
1071:     ii     += 2;
1072:   }

1074:   ISRestoreIndices(isrow,&rout);
1075:   ISRestoreIndices(iscol,&cout);
1076:   VecRestoreArrayRead(bb,&b);
1077:   VecRestoreArray(xx,&x);
1078:   PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
1079:   return(0);
1080: }

1084: PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
1085: {
1086:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
1087:   PetscErrorCode    ierr;
1088:   IS                iscol=a->col,isrow=a->row;
1089:   const PetscInt    n    =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1090:   const PetscInt    *r,*c,*rout,*cout;
1091:   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1092:   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
1093:   const MatScalar   *aa=a->a,*v;
1094:   PetscScalar       s1,s2,x1,x2,*x,*t;
1095:   const PetscScalar *b;

1098:   VecGetArrayRead(bb,&b);
1099:   VecGetArray(xx,&x);
1100:   t    = a->solve_work;

1102:   ISGetIndices(isrow,&rout); r = rout;
1103:   ISGetIndices(iscol,&cout); c = cout;

1105:   /* copy b into temp work space according to permutation */
1106:   for (i=0; i<n; i++) {
1107:     ii    = bs*i; ic = bs*c[i];
1108:     t[ii] = b[ic]; t[ii+1] = b[ic+1];
1109:   }

1111:   /* forward solve the U^T */
1112:   idx = 0;
1113:   for (i=0; i<n; i++) {
1114:     v = aa + bs2*diag[i];
1115:     /* multiply by the inverse of the block diagonal */
1116:     x1 = t[idx];   x2 = t[1+idx];
1117:     s1 = v[0]*x1  +  v[1]*x2;
1118:     s2 = v[2]*x1  +  v[3]*x2;
1119:     v -= bs2;

1121:     vi = aj + diag[i] - 1;
1122:     nz = diag[i] - diag[i+1] - 1;
1123:     for (j=0; j>-nz; j--) {
1124:       oidx       = bs*vi[j];
1125:       t[oidx]   -= v[0]*s1  +  v[1]*s2;
1126:       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
1127:       v         -= bs2;
1128:     }
1129:     t[idx] = s1;t[1+idx] = s2;
1130:     idx   += bs;
1131:   }
1132:   /* backward solve the L^T */
1133:   for (i=n-1; i>=0; i--) {
1134:     v   = aa + bs2*ai[i];
1135:     vi  = aj + ai[i];
1136:     nz  = ai[i+1] - ai[i];
1137:     idt = bs*i;
1138:     s1  = t[idt];  s2 = t[1+idt];
1139:     for (j=0; j<nz; j++) {
1140:       idx       = bs*vi[j];
1141:       t[idx]   -=  v[0]*s1 +  v[1]*s2;
1142:       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
1143:       v        += bs2;
1144:     }
1145:   }

1147:   /* copy t into x according to permutation */
1148:   for (i=0; i<n; i++) {
1149:     ii    = bs*i;  ir = bs*r[i];
1150:     x[ir] = t[ii];  x[ir+1] = t[ii+1];
1151:   }

1153:   ISRestoreIndices(isrow,&rout);
1154:   ISRestoreIndices(iscol,&cout);
1155:   VecRestoreArrayRead(bb,&b);
1156:   VecRestoreArray(xx,&x);
1157:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1158:   return(0);
1159: }

1163: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1164: {
1165:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1166:   IS                iscol=a->col,isrow=a->row;
1167:   PetscErrorCode    ierr;
1168:   const PetscInt    *r,*c,*rout,*cout;
1169:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1170:   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1171:   const MatScalar   *aa=a->a,*v;
1172:   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1173:   const PetscScalar *b;

1176:   VecGetArrayRead(bb,&b);
1177:   VecGetArray(xx,&x);
1178:   t    = a->solve_work;

1180:   ISGetIndices(isrow,&rout); r = rout;
1181:   ISGetIndices(iscol,&cout); c = cout;

1183:   /* copy the b into temp work space according to permutation */
1184:   ii = 0;
1185:   for (i=0; i<n; i++) {
1186:     ic      = 3*c[i];
1187:     t[ii]   = b[ic];
1188:     t[ii+1] = b[ic+1];
1189:     t[ii+2] = b[ic+2];
1190:     ii     += 3;
1191:   }

1193:   /* forward solve the U^T */
1194:   idx = 0;
1195:   for (i=0; i<n; i++) {

1197:     v = aa + 9*diag[i];
1198:     /* multiply by the inverse of the block diagonal */
1199:     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1200:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1201:     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1202:     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1203:     v += 9;

1205:     vi = aj + diag[i] + 1;
1206:     nz = ai[i+1] - diag[i] - 1;
1207:     while (nz--) {
1208:       oidx       = 3*(*vi++);
1209:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1210:       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1211:       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1212:       v         += 9;
1213:     }
1214:     t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;
1215:     idx   += 3;
1216:   }
1217:   /* backward solve the L^T */
1218:   for (i=n-1; i>=0; i--) {
1219:     v   = aa + 9*diag[i] - 9;
1220:     vi  = aj + diag[i] - 1;
1221:     nz  = diag[i] - ai[i];
1222:     idt = 3*i;
1223:     s1  = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
1224:     while (nz--) {
1225:       idx       = 3*(*vi--);
1226:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
1227:       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
1228:       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1229:       v        -= 9;
1230:     }
1231:   }

1233:   /* copy t into x according to permutation */
1234:   ii = 0;
1235:   for (i=0; i<n; i++) {
1236:     ir      = 3*r[i];
1237:     x[ir]   = t[ii];
1238:     x[ir+1] = t[ii+1];
1239:     x[ir+2] = t[ii+2];
1240:     ii     += 3;
1241:   }

1243:   ISRestoreIndices(isrow,&rout);
1244:   ISRestoreIndices(iscol,&cout);
1245:   VecRestoreArrayRead(bb,&b);
1246:   VecRestoreArray(xx,&x);
1247:   PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
1248:   return(0);
1249: }

1253: PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1254: {
1255:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
1256:   PetscErrorCode    ierr;
1257:   IS                iscol=a->col,isrow=a->row;
1258:   const PetscInt    n    =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1259:   const PetscInt    *r,*c,*rout,*cout;
1260:   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1261:   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
1262:   const MatScalar   *aa=a->a,*v;
1263:   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1264:   const PetscScalar *b;

1267:   VecGetArrayRead(bb,&b);
1268:   VecGetArray(xx,&x);
1269:   t    = a->solve_work;

1271:   ISGetIndices(isrow,&rout); r = rout;
1272:   ISGetIndices(iscol,&cout); c = cout;

1274:   /* copy b into temp work space according to permutation */
1275:   for (i=0; i<n; i++) {
1276:     ii    = bs*i; ic = bs*c[i];
1277:     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2];
1278:   }

1280:   /* forward solve the U^T */
1281:   idx = 0;
1282:   for (i=0; i<n; i++) {
1283:     v = aa + bs2*diag[i];
1284:     /* multiply by the inverse of the block diagonal */
1285:     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1286:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1287:     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1288:     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1289:     v -= bs2;

1291:     vi = aj + diag[i] - 1;
1292:     nz = diag[i] - diag[i+1] - 1;
1293:     for (j=0; j>-nz; j--) {
1294:       oidx       = bs*vi[j];
1295:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1296:       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1297:       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1298:       v         -= bs2;
1299:     }
1300:     t[idx] = s1;t[1+idx] = s2;  t[2+idx] = s3;
1301:     idx   += bs;
1302:   }
1303:   /* backward solve the L^T */
1304:   for (i=n-1; i>=0; i--) {
1305:     v   = aa + bs2*ai[i];
1306:     vi  = aj + ai[i];
1307:     nz  = ai[i+1] - ai[i];
1308:     idt = bs*i;
1309:     s1  = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];
1310:     for (j=0; j<nz; j++) {
1311:       idx       = bs*vi[j];
1312:       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1313:       t[idx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1314:       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1315:       v        += bs2;
1316:     }
1317:   }

1319:   /* copy t into x according to permutation */
1320:   for (i=0; i<n; i++) {
1321:     ii    = bs*i;  ir = bs*r[i];
1322:     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];
1323:   }

1325:   ISRestoreIndices(isrow,&rout);
1326:   ISRestoreIndices(iscol,&cout);
1327:   VecRestoreArrayRead(bb,&b);
1328:   VecRestoreArray(xx,&x);
1329:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1330:   return(0);
1331: }

1335: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
1336: {
1337:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1338:   IS                iscol=a->col,isrow=a->row;
1339:   PetscErrorCode    ierr;
1340:   const PetscInt    *r,*c,*rout,*cout;
1341:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1342:   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1343:   const MatScalar   *aa=a->a,*v;
1344:   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1345:   const PetscScalar *b;

1348:   VecGetArrayRead(bb,&b);
1349:   VecGetArray(xx,&x);
1350:   t    = a->solve_work;

1352:   ISGetIndices(isrow,&rout); r = rout;
1353:   ISGetIndices(iscol,&cout); c = cout;

1355:   /* copy the b into temp work space according to permutation */
1356:   ii = 0;
1357:   for (i=0; i<n; i++) {
1358:     ic      = 4*c[i];
1359:     t[ii]   = b[ic];
1360:     t[ii+1] = b[ic+1];
1361:     t[ii+2] = b[ic+2];
1362:     t[ii+3] = b[ic+3];
1363:     ii     += 4;
1364:   }

1366:   /* forward solve the U^T */
1367:   idx = 0;
1368:   for (i=0; i<n; i++) {

1370:     v = aa + 16*diag[i];
1371:     /* multiply by the inverse of the block diagonal */
1372:     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
1373:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1374:     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1375:     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1376:     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1377:     v += 16;

1379:     vi = aj + diag[i] + 1;
1380:     nz = ai[i+1] - diag[i] - 1;
1381:     while (nz--) {
1382:       oidx       = 4*(*vi++);
1383:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1384:       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1385:       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1386:       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1387:       v         += 16;
1388:     }
1389:     t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
1390:     idx   += 4;
1391:   }
1392:   /* backward solve the L^T */
1393:   for (i=n-1; i>=0; i--) {
1394:     v   = aa + 16*diag[i] - 16;
1395:     vi  = aj + diag[i] - 1;
1396:     nz  = diag[i] - ai[i];
1397:     idt = 4*i;
1398:     s1  = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
1399:     while (nz--) {
1400:       idx       = 4*(*vi--);
1401:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1402:       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1403:       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1404:       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1405:       v        -= 16;
1406:     }
1407:   }

1409:   /* copy t into x according to permutation */
1410:   ii = 0;
1411:   for (i=0; i<n; i++) {
1412:     ir      = 4*r[i];
1413:     x[ir]   = t[ii];
1414:     x[ir+1] = t[ii+1];
1415:     x[ir+2] = t[ii+2];
1416:     x[ir+3] = t[ii+3];
1417:     ii     += 4;
1418:   }

1420:   ISRestoreIndices(isrow,&rout);
1421:   ISRestoreIndices(iscol,&cout);
1422:   VecRestoreArrayRead(bb,&b);
1423:   VecRestoreArray(xx,&x);
1424:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
1425:   return(0);
1426: }

1430: PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1431: {
1432:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
1433:   PetscErrorCode    ierr;
1434:   IS                iscol=a->col,isrow=a->row;
1435:   const PetscInt    n    =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1436:   const PetscInt    *r,*c,*rout,*cout;
1437:   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1438:   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
1439:   const MatScalar   *aa=a->a,*v;
1440:   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1441:   const PetscScalar *b;

1444:   VecGetArrayRead(bb,&b);
1445:   VecGetArray(xx,&x);
1446:   t    = a->solve_work;

1448:   ISGetIndices(isrow,&rout); r = rout;
1449:   ISGetIndices(iscol,&cout); c = cout;

1451:   /* copy b into temp work space according to permutation */
1452:   for (i=0; i<n; i++) {
1453:     ii    = bs*i; ic = bs*c[i];
1454:     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1455:   }

1457:   /* forward solve the U^T */
1458:   idx = 0;
1459:   for (i=0; i<n; i++) {
1460:     v = aa + bs2*diag[i];
1461:     /* multiply by the inverse of the block diagonal */
1462:     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];  x4 = t[3+idx];
1463:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1464:     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1465:     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1466:     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1467:     v -= bs2;

1469:     vi = aj + diag[i] - 1;
1470:     nz = diag[i] - diag[i+1] - 1;
1471:     for (j=0; j>-nz; j--) {
1472:       oidx       = bs*vi[j];
1473:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1474:       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1475:       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1476:       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1477:       v         -= bs2;
1478:     }
1479:     t[idx] = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4;
1480:     idx   += bs;
1481:   }
1482:   /* backward solve the L^T */
1483:   for (i=n-1; i>=0; i--) {
1484:     v   = aa + bs2*ai[i];
1485:     vi  = aj + ai[i];
1486:     nz  = ai[i+1] - ai[i];
1487:     idt = bs*i;
1488:     s1  = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt];
1489:     for (j=0; j<nz; j++) {
1490:       idx       = bs*vi[j];
1491:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3  +  v[3]*s4;
1492:       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3  +  v[7]*s4;
1493:       t[idx+2] -=  v[8]*s1 +  v[9]*s2 +  v[10]*s3 + v[11]*s4;
1494:       t[idx+3] -= v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
1495:       v        += bs2;
1496:     }
1497:   }

1499:   /* copy t into x according to permutation */
1500:   for (i=0; i<n; i++) {
1501:     ii    = bs*i;  ir = bs*r[i];
1502:     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1503:   }

1505:   ISRestoreIndices(isrow,&rout);
1506:   ISRestoreIndices(iscol,&cout);
1507:   VecRestoreArrayRead(bb,&b);
1508:   VecRestoreArray(xx,&x);
1509:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1510:   return(0);
1511: }

1515: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
1516: {
1517:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1518:   IS                iscol=a->col,isrow=a->row;
1519:   PetscErrorCode    ierr;
1520:   const PetscInt    *r,*c,*rout,*cout;
1521:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1522:   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1523:   const MatScalar   *aa=a->a,*v;
1524:   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1525:   const PetscScalar *b;

1528:   VecGetArrayRead(bb,&b);
1529:   VecGetArray(xx,&x);
1530:   t    = a->solve_work;

1532:   ISGetIndices(isrow,&rout); r = rout;
1533:   ISGetIndices(iscol,&cout); c = cout;

1535:   /* copy the b into temp work space according to permutation */
1536:   ii = 0;
1537:   for (i=0; i<n; i++) {
1538:     ic      = 5*c[i];
1539:     t[ii]   = b[ic];
1540:     t[ii+1] = b[ic+1];
1541:     t[ii+2] = b[ic+2];
1542:     t[ii+3] = b[ic+3];
1543:     t[ii+4] = b[ic+4];
1544:     ii     += 5;
1545:   }

1547:   /* forward solve the U^T */
1548:   idx = 0;
1549:   for (i=0; i<n; i++) {

1551:     v = aa + 25*diag[i];
1552:     /* multiply by the inverse of the block diagonal */
1553:     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1554:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1555:     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1556:     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1557:     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1558:     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1559:     v += 25;

1561:     vi = aj + diag[i] + 1;
1562:     nz = ai[i+1] - diag[i] - 1;
1563:     while (nz--) {
1564:       oidx       = 5*(*vi++);
1565:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1566:       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1567:       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1568:       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1569:       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1570:       v         += 25;
1571:     }
1572:     t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1573:     idx   += 5;
1574:   }
1575:   /* backward solve the L^T */
1576:   for (i=n-1; i>=0; i--) {
1577:     v   = aa + 25*diag[i] - 25;
1578:     vi  = aj + diag[i] - 1;
1579:     nz  = diag[i] - ai[i];
1580:     idt = 5*i;
1581:     s1  = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1582:     while (nz--) {
1583:       idx       = 5*(*vi--);
1584:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1585:       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1586:       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1587:       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1588:       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1589:       v        -= 25;
1590:     }
1591:   }

1593:   /* copy t into x according to permutation */
1594:   ii = 0;
1595:   for (i=0; i<n; i++) {
1596:     ir      = 5*r[i];
1597:     x[ir]   = t[ii];
1598:     x[ir+1] = t[ii+1];
1599:     x[ir+2] = t[ii+2];
1600:     x[ir+3] = t[ii+3];
1601:     x[ir+4] = t[ii+4];
1602:     ii     += 5;
1603:   }

1605:   ISRestoreIndices(isrow,&rout);
1606:   ISRestoreIndices(iscol,&cout);
1607:   VecRestoreArrayRead(bb,&b);
1608:   VecRestoreArray(xx,&x);
1609:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
1610:   return(0);
1611: }

1615: PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1616: {
1617:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
1618:   PetscErrorCode    ierr;
1619:   IS                iscol=a->col,isrow=a->row;
1620:   const PetscInt    n    =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1621:   const PetscInt    *r,*c,*rout,*cout;
1622:   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1623:   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
1624:   const MatScalar   *aa=a->a,*v;
1625:   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1626:   const PetscScalar *b;

1629:   VecGetArrayRead(bb,&b);
1630:   VecGetArray(xx,&x);
1631:   t    = a->solve_work;

1633:   ISGetIndices(isrow,&rout); r = rout;
1634:   ISGetIndices(iscol,&cout); c = cout;

1636:   /* copy b into temp work space according to permutation */
1637:   for (i=0; i<n; i++) {
1638:     ii      = bs*i; ic = bs*c[i];
1639:     t[ii]   = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1640:     t[ii+4] = b[ic+4];
1641:   }

1643:   /* forward solve the U^T */
1644:   idx = 0;
1645:   for (i=0; i<n; i++) {
1646:     v = aa + bs2*diag[i];
1647:     /* multiply by the inverse of the block diagonal */
1648:     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1649:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1650:     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1651:     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1652:     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1653:     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1654:     v -= bs2;

1656:     vi = aj + diag[i] - 1;
1657:     nz = diag[i] - diag[i+1] - 1;
1658:     for (j=0; j>-nz; j--) {
1659:       oidx       = bs*vi[j];
1660:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1661:       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1662:       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1663:       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1664:       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1665:       v         -= bs2;
1666:     }
1667:     t[idx] = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1668:     idx   += bs;
1669:   }
1670:   /* backward solve the L^T */
1671:   for (i=n-1; i>=0; i--) {
1672:     v   = aa + bs2*ai[i];
1673:     vi  = aj + ai[i];
1674:     nz  = ai[i+1] - ai[i];
1675:     idt = bs*i;
1676:     s1  = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1677:     for (j=0; j<nz; j++) {
1678:       idx       = bs*vi[j];
1679:       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1680:       t[idx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1681:       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1682:       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1683:       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1684:       v        += bs2;
1685:     }
1686:   }

1688:   /* copy t into x according to permutation */
1689:   for (i=0; i<n; i++) {
1690:     ii      = bs*i;  ir = bs*r[i];
1691:     x[ir]   = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1692:     x[ir+4] = t[ii+4];
1693:   }

1695:   ISRestoreIndices(isrow,&rout);
1696:   ISRestoreIndices(iscol,&cout);
1697:   VecRestoreArrayRead(bb,&b);
1698:   VecRestoreArray(xx,&x);
1699:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1700:   return(0);
1701: }

1705: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
1706: {
1707:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1708:   IS                iscol=a->col,isrow=a->row;
1709:   PetscErrorCode    ierr;
1710:   const PetscInt    *r,*c,*rout,*cout;
1711:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1712:   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1713:   const MatScalar   *aa=a->a,*v;
1714:   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1715:   const PetscScalar *b;

1718:   VecGetArrayRead(bb,&b);
1719:   VecGetArray(xx,&x);
1720:   t    = a->solve_work;

1722:   ISGetIndices(isrow,&rout); r = rout;
1723:   ISGetIndices(iscol,&cout); c = cout;

1725:   /* copy the b into temp work space according to permutation */
1726:   ii = 0;
1727:   for (i=0; i<n; i++) {
1728:     ic      = 6*c[i];
1729:     t[ii]   = b[ic];
1730:     t[ii+1] = b[ic+1];
1731:     t[ii+2] = b[ic+2];
1732:     t[ii+3] = b[ic+3];
1733:     t[ii+4] = b[ic+4];
1734:     t[ii+5] = b[ic+5];
1735:     ii     += 6;
1736:   }

1738:   /* forward solve the U^T */
1739:   idx = 0;
1740:   for (i=0; i<n; i++) {

1742:     v = aa + 36*diag[i];
1743:     /* multiply by the inverse of the block diagonal */
1744:     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1745:     x6 = t[5+idx];
1746:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1747:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1748:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1749:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1750:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1751:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1752:     v += 36;

1754:     vi = aj + diag[i] + 1;
1755:     nz = ai[i+1] - diag[i] - 1;
1756:     while (nz--) {
1757:       oidx       = 6*(*vi++);
1758:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1759:       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1760:       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1761:       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1762:       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1763:       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1764:       v         += 36;
1765:     }
1766:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1767:     t[5+idx] = s6;
1768:     idx     += 6;
1769:   }
1770:   /* backward solve the L^T */
1771:   for (i=n-1; i>=0; i--) {
1772:     v   = aa + 36*diag[i] - 36;
1773:     vi  = aj + diag[i] - 1;
1774:     nz  = diag[i] - ai[i];
1775:     idt = 6*i;
1776:     s1  = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1777:     s6  = t[5+idt];
1778:     while (nz--) {
1779:       idx       = 6*(*vi--);
1780:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1781:       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1782:       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1783:       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1784:       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1785:       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1786:       v        -= 36;
1787:     }
1788:   }

1790:   /* copy t into x according to permutation */
1791:   ii = 0;
1792:   for (i=0; i<n; i++) {
1793:     ir      = 6*r[i];
1794:     x[ir]   = t[ii];
1795:     x[ir+1] = t[ii+1];
1796:     x[ir+2] = t[ii+2];
1797:     x[ir+3] = t[ii+3];
1798:     x[ir+4] = t[ii+4];
1799:     x[ir+5] = t[ii+5];
1800:     ii     += 6;
1801:   }

1803:   ISRestoreIndices(isrow,&rout);
1804:   ISRestoreIndices(iscol,&cout);
1805:   VecRestoreArrayRead(bb,&b);
1806:   VecRestoreArray(xx,&x);
1807:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
1808:   return(0);
1809: }

/*
   MatSolveTranspose_SeqBAIJ_6 - Transpose triangular solve using the factors
   stored in A, hand-unrolled for six values per block.

   Input:
     A  - matrix whose data pointer is a Mat_SeqBAIJ; its row/col index sets
          (a->row, a->col), the i/j/diag arrays, the values a->a and the
          work array a->solve_work are used
     bb - right-hand side vector (accessed read-only)
   Output:
     xx - solution vector (overwritten)

   Steps, all carried out in the work array t = a->solve_work:
     1. block i of t is loaded from block c[i] of b,
     2. a forward sweep over i = 0..n-1 (labelled U^T below),
     3. a backward sweep over i = n-1..0 (labelled L^T below),
     4. block i of t is stored into block r[i] of x.

   NOTE(review): strides come from bs (A->rmap->bs) and bs2 (a->bs2), but the
   unrolled arithmetic touches exactly 6 entries per vector block and v[0..35]
   per matrix block - presumably bs == 6 and bs2 == 36; confirm at the caller.
   NOTE(review): ierr is declared but never assigned in this text, so the
   return codes of the Vec and IS calls are not examined here - confirm
   against the upstream file whether error-checking wrappers are missing.
*/
1813: PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1814: {
1815:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
1816:   PetscErrorCode    ierr;
1817:   IS                iscol=a->col,isrow=a->row;
1818:   const PetscInt    n    =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1819:   const PetscInt    *r,*c,*rout,*cout;
1820:   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1821:   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
1822:   const MatScalar   *aa=a->a,*v;
1823:   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1824:   const PetscScalar *b;

1827:   VecGetArrayRead(bb,&b);
1828:   VecGetArray(xx,&x);
1829:   t    = a->solve_work;

1831:   ISGetIndices(isrow,&rout); r = rout;
1832:   ISGetIndices(iscol,&cout); c = cout;

1834:   /* copy b into temp work space according to permutation */
1835:   for (i=0; i<n; i++) {
1836:     ii      = bs*i; ic = bs*c[i];
1837:     t[ii]   = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1838:     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];
1839:   }

1841:   /* forward solve the U^T */
1842:   idx = 0;
1843:   for (i=0; i<n; i++) {
1844:     v = aa + bs2*diag[i];
1845:     /* multiply by the inverse of the block diagonal */
1846:     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1847:     x6 = t[5+idx];
1848:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1849:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1850:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1851:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1852:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1853:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
    /* step back to the block stored immediately before the one at diag[i] */
1854:     v -= bs2;

1856:     vi = aj + diag[i] - 1;
    /* diag[i+1] is read for every i < n, so diag[] is indexed up to n */
1857:     nz = diag[i] - diag[i+1] - 1;
    /* j takes the values 0,-1,...,-(nz-1): vi[] and v are both walked
       toward lower addresses */
1858:     for (j=0; j>-nz; j--) {
1859:       oidx       = bs*vi[j];
1860:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1861:       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1862:       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1863:       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1864:       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1865:       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1866:       v         -= bs2;
1867:     }
    /* block i of t now holds the scaled values s1..s6 */
1868:     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1869:     t[5+idx] = s6;
1870:     idx     += bs;
1871:   }
1872:   /* backward solve the L^T */
1873:   for (i=n-1; i>=0; i--) {
    /* this sweep visits entries ai[i] .. ai[i+1]-1 in increasing order */
1874:     v   = aa + bs2*ai[i];
1875:     vi  = aj + ai[i];
1876:     nz  = ai[i+1] - ai[i];
1877:     idt = bs*i;
1878:     s1  = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1879:     s6  = t[5+idt];
1880:     for (j=0; j<nz; j++) {
1881:       idx       = bs*vi[j];
1882:       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1883:       t[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1884:       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1885:       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1886:       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1887:       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1888:       v        += bs2;
1889:     }
1890:   }

1892:   /* copy t into x according to permutation */
1893:   for (i=0; i<n; i++) {
1894:     ii      = bs*i;  ir = bs*r[i];
1895:     x[ir]   = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1896:     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];
1897:   }

1899:   ISRestoreIndices(isrow,&rout);
1900:   ISRestoreIndices(iscol,&cout);
1901:   VecRestoreArrayRead(bb,&b);
1902:   VecRestoreArray(xx,&x);
1903:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1904:   return(0);
1905: }

/*
   MatSolveTranspose_SeqBAIJ_7_inplace - Transpose triangular solve using the
   factors stored in A, hand-unrolled with the literal block sizes 7 (vector)
   and 49 (matrix block).

   Input:
     A  - matrix whose data pointer is a Mat_SeqBAIJ; its row/col index sets,
          the i/j/diag arrays, the values a->a and a->solve_work are used
     bb - right-hand side vector (accessed read-only)
   Output:
     xx - solution vector (overwritten)

   Layout used by this variant, as read from the index arithmetic below:
   within row i the entries ai[i] .. diag[i]-1 are used by the L^T sweep, the
   entry at diag[i] is the block applied to t, and the entries
   diag[i]+1 .. ai[i+1]-1 are used by the U^T sweep.

   NOTE(review): ierr is declared but never assigned in this text, so the
   return codes of the Vec and IS calls are not examined here - confirm
   against the upstream file whether error-checking wrappers are missing.
*/
1909: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
1910: {
1911:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1912:   IS                iscol=a->col,isrow=a->row;
1913:   PetscErrorCode    ierr;
1914:   const PetscInt    *r,*c,*rout,*cout;
1915:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1916:   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1917:   const MatScalar   *aa=a->a,*v;
1918:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1919:   const PetscScalar *b;

1922:   VecGetArrayRead(bb,&b);
1923:   VecGetArray(xx,&x);
1924:   t    = a->solve_work;

1926:   ISGetIndices(isrow,&rout); r = rout;
1927:   ISGetIndices(iscol,&cout); c = cout;

1929:   /* copy the b into temp work space according to permutation */
1930:   ii = 0;
1931:   for (i=0; i<n; i++) {
1932:     ic      = 7*c[i];
1933:     t[ii]   = b[ic];
1934:     t[ii+1] = b[ic+1];
1935:     t[ii+2] = b[ic+2];
1936:     t[ii+3] = b[ic+3];
1937:     t[ii+4] = b[ic+4];
1938:     t[ii+5] = b[ic+5];
1939:     t[ii+6] = b[ic+6];
1940:     ii     += 7;
1941:   }

1943:   /* forward solve the U^T */
1944:   idx = 0;
1945:   for (i=0; i<n; i++) {

1947:     v = aa + 49*diag[i];
1948:     /* multiply by the inverse of the block diagonal */
1949:     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1950:     x6 = t[5+idx]; x7 = t[6+idx];
1951:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1952:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1953:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1954:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1955:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1956:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1957:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
    /* advance to the block stored immediately after the one at diag[i] */
1958:     v += 49;

1960:     vi = aj + diag[i] + 1;
1961:     nz = ai[i+1] - diag[i] - 1;
1962:     while (nz--) {
1963:       oidx       = 7*(*vi++);
1964:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1965:       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1966:       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1967:       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1968:       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1969:       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1970:       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1971:       v         += 49;
1972:     }
    /* block i of t now holds the scaled values s1..s7 */
1973:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1974:     t[5+idx] = s6;t[6+idx] = s7;
1975:     idx     += 7;
1976:   }
1977:   /* backward solve the L^T */
1978:   for (i=n-1; i>=0; i--) {
    /* start at the entry just before diag[i] and walk down to ai[i] */
1979:     v   = aa + 49*diag[i] - 49;
1980:     vi  = aj + diag[i] - 1;
1981:     nz  = diag[i] - ai[i];
1982:     idt = 7*i;
1983:     s1  = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1984:     s6  = t[5+idt];s7 = t[6+idt];
1985:     while (nz--) {
1986:       idx       = 7*(*vi--);
1987:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1988:       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1989:       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1990:       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1991:       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1992:       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1993:       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1994:       v        -= 49;
1995:     }
1996:   }

1998:   /* copy t into x according to permutation */
1999:   ii = 0;
2000:   for (i=0; i<n; i++) {
2001:     ir      = 7*r[i];
2002:     x[ir]   = t[ii];
2003:     x[ir+1] = t[ii+1];
2004:     x[ir+2] = t[ii+2];
2005:     x[ir+3] = t[ii+3];
2006:     x[ir+4] = t[ii+4];
2007:     x[ir+5] = t[ii+5];
2008:     x[ir+6] = t[ii+6];
2009:     ii     += 7;
2010:   }

2012:   ISRestoreIndices(isrow,&rout);
2013:   ISRestoreIndices(iscol,&cout);
2014:   VecRestoreArrayRead(bb,&b);
2015:   VecRestoreArray(xx,&x);
2016:   PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
2017:   return(0);
2018: }
/*
   MatSolveTranspose_SeqBAIJ_7 - Transpose triangular solve using the factors
   stored in A, hand-unrolled for seven values per block.

   Input:
     A  - matrix whose data pointer is a Mat_SeqBAIJ; its row/col index sets,
          the i/j/diag arrays, the values a->a and a->solve_work are used
     bb - right-hand side vector (accessed read-only)
   Output:
     xx - solution vector (overwritten)

   Steps, all carried out in the work array t = a->solve_work:
     1. block i of t is loaded from block c[i] of b,
     2. a forward sweep over i = 0..n-1 (labelled U^T below) that walks the
        entries diag[i]-1 down to diag[i+1]+1,
     3. a backward sweep over i = n-1..0 (labelled L^T below) that walks the
        entries ai[i] up to ai[i+1]-1,
     4. block i of t is stored into block r[i] of x.

   NOTE(review): strides come from bs (A->rmap->bs) and bs2 (a->bs2), but the
   unrolled arithmetic touches exactly 7 entries per vector block and v[0..48]
   per matrix block - presumably bs == 7 and bs2 == 49; confirm at the caller.
   NOTE(review): ierr is declared but never assigned in this text, so the
   return codes of the Vec and IS calls are not examined here - confirm
   against the upstream file whether error-checking wrappers are missing.
*/
2021: PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
2022: {
2023:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
2024:   PetscErrorCode    ierr;
2025:   IS                iscol=a->col,isrow=a->row;
2026:   const PetscInt    n    =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
2027:   const PetscInt    *r,*c,*rout,*cout;
2028:   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
2029:   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
2030:   const MatScalar   *aa=a->a,*v;
2031:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2032:   const PetscScalar *b;

2035:   VecGetArrayRead(bb,&b);
2036:   VecGetArray(xx,&x);
2037:   t    = a->solve_work;

2039:   ISGetIndices(isrow,&rout); r = rout;
2040:   ISGetIndices(iscol,&cout); c = cout;

2042:   /* copy b into temp work space according to permutation */
2043:   for (i=0; i<n; i++) {
2044:     ii      = bs*i; ic = bs*c[i];
2045:     t[ii]   = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
2046:     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];  t[ii+6] = b[ic+6];
2047:   }

2049:   /* forward solve the U^T */
2050:   idx = 0;
2051:   for (i=0; i<n; i++) {
2052:     v = aa + bs2*diag[i];
2053:     /* multiply by the inverse of the block diagonal */
2054:     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2055:     x6 = t[5+idx]; x7 = t[6+idx];
2056:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
2057:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
2058:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
2059:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
2060:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
2061:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
2062:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
    /* step back to the block stored immediately before the one at diag[i] */
2063:     v -= bs2;

2065:     vi = aj + diag[i] - 1;
    /* diag[i+1] is read for every i < n, so diag[] is indexed up to n */
2066:     nz = diag[i] - diag[i+1] - 1;
    /* j takes the values 0,-1,...,-(nz-1): vi[] and v are both walked
       toward lower addresses */
2067:     for (j=0; j>-nz; j--) {
2068:       oidx       = bs*vi[j];
2069:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2070:       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2071:       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2072:       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2073:       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2074:       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2075:       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2076:       v         -= bs2;
2077:     }
    /* block i of t now holds the scaled values s1..s7 */
2078:     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
2079:     t[5+idx] = s6;  t[6+idx] = s7;
2080:     idx     += bs;
2081:   }
2082:   /* backward solve the L^T */
2083:   for (i=n-1; i>=0; i--) {
    /* this sweep visits entries ai[i] .. ai[i+1]-1 in increasing order */
2084:     v   = aa + bs2*ai[i];
2085:     vi  = aj + ai[i];
2086:     nz  = ai[i+1] - ai[i];
2087:     idt = bs*i;
2088:     s1  = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
2089:     s6  = t[5+idt];  s7 = t[6+idt];
2090:     for (j=0; j<nz; j++) {
2091:       idx       = bs*vi[j];
2092:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2093:       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2094:       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2095:       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2096:       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2097:       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2098:       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2099:       v        += bs2;
2100:     }
2101:   }

2103:   /* copy t into x according to permutation */
2104:   for (i=0; i<n; i++) {
2105:     ii      = bs*i;  ir = bs*r[i];
2106:     x[ir]   = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
2107:     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];  x[ir+6] = t[ii+6];
2108:   }

2110:   ISRestoreIndices(isrow,&rout);
2111:   ISRestoreIndices(iscol,&cout);
2112:   VecRestoreArrayRead(bb,&b);
2113:   VecRestoreArray(xx,&x);
2114:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
2115:   return(0);
2116: }

2118: /* ----------------------------------------------------------- */
/*
   MatSolve_SeqBAIJ_N_inplace - Triangular solve using the factors stored in
   A for a block size taken at run time (bs = A->rmap->bs, bs2 = a->bs2).

   Input:
     A  - matrix whose data pointer is a Mat_SeqBAIJ; its row/col index sets,
          the i/j/diag arrays, the values a->a and a->solve_work are used
     bb - right-hand side vector (accessed read-only)
   Output:
     xx - solution vector (overwritten)

   The forward sweep fills the work array t = a->solve_work: block i starts
   as block r[i] of b and is updated with the entries ai[i] .. diag[i]-1 of
   row i. The backward sweep uses the entries diag[i]+1 .. ai[i+1]-1, applies
   the block at diag[i], and writes block i to x through the column index
   array read from its last element to its first.

   The PetscKernel_* helpers are defined outside this file section; going by
   their names they perform "v -= A*w" and "w = A*v" on bs-sized blocks -
   confirm against blockinvert.h.

   NOTE(review): r[0] and the first bs entries of the selected block of b are
   read before any test on n - presumably n >= 1 is assumed; confirm.
   NOTE(review): ls points A->cmap->n scalars into solve_work and is used as
   a bs-long scratch block - presumably solve_work is allocated with at least
   that much room; confirm at the allocation site.
   NOTE(review): ierr is declared but never assigned in this text, so the
   return codes of the Vec and IS calls are not examined here - confirm
   against the upstream file whether error-checking wrappers are missing.
*/
2121: PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2122: {
2123:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
2124:   IS                iscol=a->col,isrow=a->row;
2125:   PetscErrorCode    ierr;
2126:   const PetscInt    *r,*c,*rout,*cout;
2127:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi;
2128:   PetscInt          i,nz;
2129:   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
2130:   const MatScalar   *aa=a->a,*v;
2131:   PetscScalar       *x,*s,*t,*ls;
2132:   const PetscScalar *b;

2135:   VecGetArrayRead(bb,&b);
2136:   VecGetArray(xx,&x);
2137:   t    = a->solve_work;

2139:   ISGetIndices(isrow,&rout); r = rout;
  /* c starts at the last column index and is decremented in the backward sweep */
2140:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2142:   /* forward solve the lower triangular */
  /* block 0 of t is a plain copy of block r[0] of b; no entries are applied to it */
2143:   PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));
2144:   for (i=1; i<n; i++) {
2145:     v    = aa + bs2*ai[i];
2146:     vi   = aj + ai[i];
2147:     nz   = a->diag[i] - ai[i];
2148:     s    = t + bs*i;
2149:     PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));
2150:     while (nz--) {
2151:       PetscKernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
2152:       v += bs2;
2153:     }
2154:   }
2155:   /* backward solve the upper triangular */
2156:   ls = a->solve_work + A->cmap->n;
2157:   for (i=n-1; i>=0; i--) {
2158:     v    = aa + bs2*(a->diag[i] + 1);
2159:     vi   = aj + a->diag[i] + 1;
2160:     nz   = ai[i+1] - a->diag[i] - 1;
    /* accumulate in the scratch block ls so that t block i can receive the product */
2161:     PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
2162:     while (nz--) {
2163:       PetscKernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
2164:       v += bs2;
2165:     }
2166:     PetscKernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2167:     PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));
2168:   }

2170:   ISRestoreIndices(isrow,&rout);
2171:   ISRestoreIndices(iscol,&cout);
2172:   VecRestoreArrayRead(bb,&b);
2173:   VecRestoreArray(xx,&x);
2174:   PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
2175:   return(0);
2176: }

2178: /* ----------------------------------------------------------- */
/*
   MatSolveTranspose_SeqBAIJ_N_inplace - Transpose triangular solve using the
   factors stored in A for a block size taken at run time
   (bs = A->rmap->bs, bs2 = a->bs2).

   Input:
     A  - matrix whose data pointer is a Mat_SeqBAIJ; its row/col index sets,
          the i/j/diag arrays, the values a->a and a->solve_work are used
     bb - right-hand side vector (accessed read-only)
   Output:
     xx - solution vector (overwritten)

   Steps, all carried out in the work array t = a->solve_work:
     1. block i of t is loaded from block c[i] of b,
     2. forward sweep: the block at diag[i] is applied to t block i, then the
        entries diag[i]+1 .. ai[i+1]-1 update the blocks they index,
     3. backward sweep: the entries ai[i] .. diag[i]-1 update the blocks they
        index,
     4. block i of t is stored into block r[i] of x.

   The PetscKernel_* helpers are defined outside this file section; going by
   their names they perform "w = A^T*v" and "v -= A^T*w" on bs-sized blocks -
   confirm against blockinvert.h.

   NOTE(review): ls points A->cmap->n scalars into solve_work and is used as
   a bs-long scratch block - presumably solve_work is allocated with at least
   that much room; confirm at the allocation site.
   NOTE(review): ierr is declared but never assigned in this text, so the
   return codes of the Vec and IS calls are not examined here - confirm
   against the upstream file whether error-checking wrappers are missing.
*/
2181: PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2182: {
2183:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
2184:   IS                iscol=a->col,isrow=a->row;
2185:   PetscErrorCode    ierr;
2186:   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
2187:   PetscInt          i,nz,j;
2188:   const PetscInt    n  =a->mbs,bs=A->rmap->bs,bs2=a->bs2;
2189:   const MatScalar   *aa=a->a,*v;
2190:   PetscScalar       *x,*t,*ls;
2191:   const PetscScalar *b;

2194:   VecGetArrayRead(bb,&b);
2195:   VecGetArray(xx,&x);
2196:   t    = a->solve_work;

2198:   ISGetIndices(isrow,&rout); r = rout;
2199:   ISGetIndices(iscol,&cout); c = cout;

2201:   /* copy the b into temp work space according to permutation */
2202:   for (i=0; i<n; i++) {
2203:     for (j=0; j<bs; j++) {
2204:       t[i*bs+j] = b[c[i]*bs+j];
2205:     }
2206:   }


2209:   /* forward solve the upper triangular transpose */
2210:   ls = a->solve_work + A->cmap->n;
2211:   for (i=0; i<n; i++) {
    /* keep a copy of t block i in ls because the next call writes t block i */
2212:     PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
2213:     PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2214:     v  = aa + bs2*(a->diag[i] + 1);
2215:     vi = aj + a->diag[i] + 1;
2216:     nz = ai[i+1] - a->diag[i] - 1;
2217:     while (nz--) {
2218:       PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2219:       v += bs2;
2220:     }
2221:   }

2223:   /* backward solve the lower triangular transpose */
2224:   for (i=n-1; i>=0; i--) {
2225:     v  = aa + bs2*ai[i];
2226:     vi = aj + ai[i];
2227:     nz = a->diag[i] - ai[i];
2228:     while (nz--) {
2229:       PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2230:       v += bs2;
2231:     }
2232:   }

2234:   /* copy t into x according to permutation */
2235:   for (i=0; i<n; i++) {
2236:     for (j=0; j<bs; j++) {
2237:       x[bs*r[i]+j]   = t[bs*i+j];
2238:     }
2239:   }

2241:   ISRestoreIndices(isrow,&rout);
2242:   ISRestoreIndices(iscol,&cout);
2243:   VecRestoreArrayRead(bb,&b);
2244:   VecRestoreArray(xx,&x);
2245:   PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
2246:   return(0);
2247: }

/*
   MatSolveTranspose_SeqBAIJ_N - Transpose triangular solve using the factors
   stored in A for a block size taken at run time
   (bs = A->rmap->bs, bs2 = a->bs2).

   Input:
     A  - matrix whose data pointer is a Mat_SeqBAIJ; its row/col index sets,
          the i/j/diag arrays, the values a->a and a->solve_work are used
     bb - right-hand side vector (accessed read-only)
   Output:
     xx - solution vector (overwritten)

   Steps, all carried out in the work array t = a->solve_work:
     1. block i of t is loaded from block c[i] of b,
     2. forward sweep: the block at diag[i] is applied to t block i, then the
        entries diag[i]-1 down to diag[i+1]+1 update the blocks they index,
     3. backward sweep: the entries ai[i] .. ai[i+1]-1 update the blocks they
        index,
     4. block i of t is stored into block r[i] of x.

   The PetscKernel_* helpers are defined outside this file section; going by
   their names they perform "w = A^T*v" and "v -= A^T*w" on bs-sized blocks -
   confirm against blockinvert.h.

   NOTE(review): ls points A->cmap->n scalars into solve_work and is used as
   a bs-long scratch block - presumably solve_work is allocated with at least
   that much room; confirm at the allocation site.
   NOTE(review): ierr is declared but never assigned in this text, so the
   return codes of the Vec and IS calls are not examined here - confirm
   against the upstream file whether error-checking wrappers are missing.
*/
2251: PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
2252: {
2253:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
2254:   IS                iscol=a->col,isrow=a->row;
2255:   PetscErrorCode    ierr;
2256:   const PetscInt    *r,*c,*rout,*cout;
2257:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag;
2258:   PetscInt          i,j,nz;
2259:   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
2260:   const MatScalar   *aa=a->a,*v;
2261:   PetscScalar       *x,*t,*ls;
2262:   const PetscScalar *b;

2265:   VecGetArrayRead(bb,&b);
2266:   VecGetArray(xx,&x);
2267:   t    = a->solve_work;

2269:   ISGetIndices(isrow,&rout); r = rout;
2270:   ISGetIndices(iscol,&cout); c = cout;

2272:   /* copy the b into temp work space according to permutation */
2273:   for (i=0; i<n; i++) {
2274:     for (j=0; j<bs; j++) {
2275:       t[i*bs+j] = b[c[i]*bs+j];
2276:     }
2277:   }


2280:   /* forward solve the upper triangular transpose */
2281:   ls = a->solve_work + A->cmap->n;
2282:   for (i=0; i<n; i++) {
    /* keep a copy of t block i in ls because the next call writes t block i */
2283:     PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
2284:     PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs);
2285:     v  = aa + bs2*(diag[i] - 1);
2286:     vi = aj + diag[i] - 1;
    /* diag[i+1] is read for every i < n, so diag[] is indexed up to n */
2287:     nz = diag[i] - diag[i+1] - 1;
    /* j takes the values 0,-1,...,-(nz-1): vi[] and v are both walked
       toward lower addresses */
2288:     for (j=0; j>-nz; j--) {
2289:       PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2290:       v -= bs2;
2291:     }
2292:   }

2294:   /* backward solve the lower triangular transpose */
2295:   for (i=n-1; i>=0; i--) {
2296:     v  = aa + bs2*ai[i];
2297:     vi = aj + ai[i];
2298:     nz = ai[i+1] - ai[i];
2299:     for (j=0; j<nz; j++) {
2300:       PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2301:       v += bs2;
2302:     }
2303:   }

2305:   /* copy t into x according to permutation */
2306:   for (i=0; i<n; i++) {
2307:     for (j=0; j<bs; j++) {
2308:       x[bs*r[i]+j]   = t[bs*i+j];
2309:     }
2310:   }

2312:   ISRestoreIndices(isrow,&rout);
2313:   ISRestoreIndices(iscol,&cout);
2314:   VecRestoreArrayRead(bb,&b);
2315:   VecRestoreArray(xx,&x);
2316:   PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
2317:   return(0);
2318: }

2320: /* bs = 15 for PFLOTRAN. Block operations are done by accessing all the columns of the block at once */

/*
   MatSolve_SeqBAIJ_15_NaturalOrdering_ver2 - Triangular solve using the
   factors stored in A, fully unrolled for fifteen values per block and with
   no row/column permutation (b and x are indexed directly by block row).

   Input:
     A  - matrix whose data pointer is a Mat_SeqBAIJ; its i/j/diag arrays and
          values a->a are used
     bb - right-hand side vector (accessed read-only)
   Output:
     xx - solution vector (overwritten; also used as the working storage)

   Block indexing: in every unrolled product the coefficient that multiplies
   the j-th input value in the k-th output value is v[15*(j-1) + (k-1)]
   (j,k = 1..15).

   Forward sweep: block 0 of x is a copy of block 0 of b; for i >= 1 the
   entries ai[i] .. ai[i+1]-1 of row i are subtracted from block i of b and
   the result is stored in block i of x.
   Backward sweep: for i = n-1..0 the entries adiag[i+1]+1 .. adiag[i]-1 are
   subtracted from block i of x, then the block at adiag[i] is applied.

   NOTE(review): strides come from bs (A->rmap->bs) and bs2 (a->bs2), but the
   unrolled arithmetic touches exactly 15 entries per vector block and
   v[0..224] per matrix block - presumably bs == 15 and bs2 == 225; confirm.
   NOTE(review): b[0..14] is read and x[0..14] written before any test on n -
   presumably n >= 1 is assumed; confirm.
   NOTE(review): ierr is declared but never assigned in this text, so the
   return codes of the Vec calls are not examined here - confirm against the
   upstream file whether error-checking wrappers are missing.
*/
2324: PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx)
2325: {
2326:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
2327:   PetscErrorCode    ierr;
2328:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
2329:   PetscInt          i,nz,idx,idt,m;
2330:   const MatScalar   *aa=a->a,*v;
2331:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15;
2332:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
2333:   PetscScalar       *x;
2334:   const PetscScalar *b;

2337:   VecGetArrayRead(bb,&b);
2338:   VecGetArray(xx,&x);

2340:   /* forward solve the lower triangular */
2341:   idx   = 0;
2342:   x[0]  = b[idx];    x[1]  = b[1+idx];  x[2]  = b[2+idx];  x[3]  = b[3+idx];  x[4]  = b[4+idx];
2343:   x[5]  = b[5+idx];  x[6]  = b[6+idx];  x[7]  = b[7+idx];  x[8]  = b[8+idx];  x[9]  = b[9+idx];
2344:   x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx];

2346:   for (i=1; i<n; i++) {
2347:     v   = aa + bs2*ai[i];
2348:     vi  = aj + ai[i];
2349:     nz  = ai[i+1] - ai[i];
2350:     idt = bs*i;
2351:     s1  = b[idt];    s2  = b[1+idt];  s3  = b[2+idt];  s4  = b[3+idt];  s5  = b[4+idt];
2352:     s6  = b[5+idt];  s7  = b[6+idt];  s8  = b[7+idt];  s9  = b[8+idt];  s10 = b[9+idt];
2353:     s11 = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt];
2354:     for (m=0; m<nz; m++) {
2355:       idx = bs*vi[m];
2356:       x1  = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
2357:       x6  = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
2358:       x11 = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];


2361:       s1  -= v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2362:       s2  -= v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2363:       s3  -= v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2364:       s4  -= v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2365:       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2366:       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2367:       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2368:       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2369:       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2370:       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2371:       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2372:       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2373:       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2374:       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2375:       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;

2377:       v += bs2;
2378:     }
2379:     x[idt]    = s1;  x[1+idt]  = s2;  x[2+idt]  = s3;  x[3+idt]  = s4;  x[4+idt]  = s5;
2380:     x[5+idt]  = s6;  x[6+idt]  = s7;  x[7+idt]  = s8;  x[8+idt]  = s9;  x[9+idt]  = s10;
2381:     x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15;

2383:   }
2384:   /* backward solve the upper triangular */
2385:   for (i=n-1; i>=0; i--) {
    /* adiag[i+1] is read for every i < n, so adiag[] is indexed up to n */
2386:     v   = aa + bs2*(adiag[i+1]+1);
2387:     vi  = aj + adiag[i+1]+1;
2388:     nz  = adiag[i] - adiag[i+1] - 1;
2389:     idt = bs*i;
2390:     s1  = x[idt];     s2  = x[1+idt];  s3  = x[2+idt];  s4  = x[3+idt];  s5  = x[4+idt];
2391:     s6  = x[5+idt];   s7  = x[6+idt];  s8  = x[7+idt];  s9  = x[8+idt];  s10 = x[9+idt];
2392:     s11 = x[10+idt]; s12  = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt];

2394:     for (m=0; m<nz; m++) {
2395:       idx = bs*vi[m];
2396:       x1  = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
2397:       x6  = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
2398:       x11 = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];

2400:       s1  -= v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2401:       s2  -= v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2402:       s3  -= v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2403:       s4  -= v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2404:       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2405:       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2406:       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2407:       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2408:       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2409:       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2410:       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2411:       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2412:       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2413:       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2414:       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;

2416:       v += bs2;
2417:     }

    /* after nz steps v equals aa + bs2*adiag[i]; that block is applied to s here */
2419:     x[idt]    = v[0]*s1  + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7  + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15;
2420:     x[1+idt]  = v[1]*s1  + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7  + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15;
2421:     x[2+idt]  = v[2]*s1  + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7  + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15;
2422:     x[3+idt]  = v[3]*s1  + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7  + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15;
2423:     x[4+idt]  = v[4]*s1  + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7  + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15;
2424:     x[5+idt]  = v[5]*s1  + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7  + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15;
2425:     x[6+idt]  = v[6]*s1  + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7  + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15;
2426:     x[7+idt]  = v[7]*s1  + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7  + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15;
2427:     x[8+idt]  = v[8]*s1  + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7  + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15;
2428:     x[9+idt]  = v[9]*s1  + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7  + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15;
2429:     x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15;
2430:     x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15;
2431:     x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15;
2432:     x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15;
2433:     x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15;

2435:   }

2437:   VecRestoreArrayRead(bb,&b);
2438:   VecRestoreArray(xx,&x);
2439:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
2440:   return(0);
2441: }

2443: /* bs = 15 for PFLOTRAN. Block operations are done by accessing one column at at time */
2444: /* Default MatSolve for block size 15 */

2448: PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx)
2449: {
2450:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
2451:   PetscErrorCode    ierr;
2452:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
2453:   PetscInt          i,k,nz,idx,idt,m;
2454:   const MatScalar   *aa=a->a,*v;
2455:   PetscScalar       s[15];
2456:   PetscScalar       *x,xv;
2457:   const PetscScalar *b;

2460:   VecGetArrayRead(bb,&b);
2461:   VecGetArray(xx,&x);

2463:   /* forward solve the lower triangular */
2464:   for (i=0; i<n; i++) {
2465:     v         = aa + bs2*ai[i];
2466:     vi        = aj + ai[i];
2467:     nz        = ai[i+1] - ai[i];
2468:     idt       = bs*i;
2469:     x[idt]    = b[idt];    x[1+idt]  = b[1+idt];  x[2+idt]  = b[2+idt];  x[3+idt]  = b[3+idt];  x[4+idt]  = b[4+idt];
2470:     x[5+idt]  = b[5+idt];  x[6+idt]  = b[6+idt];  x[7+idt]  = b[7+idt];  x[8+idt]  = b[8+idt];  x[9+idt] = b[9+idt];
2471:     x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; x[14+idt] = b[14+idt];
2472:     for (m=0; m<nz; m++) {
2473:       idx = bs*vi[m];
2474:       for (k=0; k<15; k++) {
2475:         xv         = x[k + idx];
2476:         x[idt]    -= v[0]*xv;
2477:         x[1+idt]  -= v[1]*xv;
2478:         x[2+idt]  -= v[2]*xv;
2479:         x[3+idt]  -= v[3]*xv;
2480:         x[4+idt]  -= v[4]*xv;
2481:         x[5+idt]  -= v[5]*xv;
2482:         x[6+idt]  -= v[6]*xv;
2483:         x[7+idt]  -= v[7]*xv;
2484:         x[8+idt]  -= v[8]*xv;
2485:         x[9+idt]  -= v[9]*xv;
2486:         x[10+idt] -= v[10]*xv;
2487:         x[11+idt] -= v[11]*xv;
2488:         x[12+idt] -= v[12]*xv;
2489:         x[13+idt] -= v[13]*xv;
2490:         x[14+idt] -= v[14]*xv;
2491:         v         += 15;
2492:       }
2493:     }
2494:   }
2495:   /* backward solve the upper triangular */
2496:   for (i=n-1; i>=0; i--) {
2497:     v     = aa + bs2*(adiag[i+1]+1);
2498:     vi    = aj + adiag[i+1]+1;
2499:     nz    = adiag[i] - adiag[i+1] - 1;
2500:     idt   = bs*i;
2501:     s[0]  = x[idt];    s[1]  = x[1+idt];  s[2]  = x[2+idt];  s[3]  = x[3+idt];  s[4]  = x[4+idt];
2502:     s[5]  = x[5+idt];  s[6]  = x[6+idt];  s[7]  = x[7+idt];  s[8]  = x[8+idt];  s[9]  = x[9+idt];
2503:     s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt];

2505:     for (m=0; m<nz; m++) {
2506:       idx = bs*vi[m];
2507:       for (k=0; k<15; k++) {
2508:         xv     = x[k + idx];
2509:         s[0]  -= v[0]*xv;
2510:         s[1]  -= v[1]*xv;
2511:         s[2]  -= v[2]*xv;
2512:         s[3]  -= v[3]*xv;
2513:         s[4]  -= v[4]*xv;
2514:         s[5]  -= v[5]*xv;
2515:         s[6]  -= v[6]*xv;
2516:         s[7]  -= v[7]*xv;
2517:         s[8]  -= v[8]*xv;
2518:         s[9]  -= v[9]*xv;
2519:         s[10] -= v[10]*xv;
2520:         s[11] -= v[11]*xv;
2521:         s[12] -= v[12]*xv;
2522:         s[13] -= v[13]*xv;
2523:         s[14] -= v[14]*xv;
2524:         v     += 15;
2525:       }
2526:     }
2527:     PetscMemzero(x+idt,bs*sizeof(MatScalar));
2528:     for (k=0; k<15; k++) {
2529:       x[idt]    += v[0]*s[k];
2530:       x[1+idt]  += v[1]*s[k];
2531:       x[2+idt]  += v[2]*s[k];
2532:       x[3+idt]  += v[3]*s[k];
2533:       x[4+idt]  += v[4]*s[k];
2534:       x[5+idt]  += v[5]*s[k];
2535:       x[6+idt]  += v[6]*s[k];
2536:       x[7+idt]  += v[7]*s[k];
2537:       x[8+idt]  += v[8]*s[k];
2538:       x[9+idt]  += v[9]*s[k];
2539:       x[10+idt] += v[10]*s[k];
2540:       x[11+idt] += v[11]*s[k];
2541:       x[12+idt] += v[12]*s[k];
2542:       x[13+idt] += v[13]*s[k];
2543:       x[14+idt] += v[14]*s[k];
2544:       v         += 15;
2545:     }
2546:   }
2547:   VecRestoreArrayRead(bb,&b);
2548:   VecRestoreArray(xx,&x);
2549:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
2550:   return(0);
2551: }


2556: PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
2557: {
2558:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
2559:   IS                iscol=a->col,isrow=a->row;
2560:   PetscErrorCode    ierr;
2561:   const PetscInt    *r,*c,*ai=a->i,*aj=a->j;
2562:   const PetscInt    *rout,*cout,*diag = a->diag,*vi,n=a->mbs;
2563:   PetscInt          i,nz,idx,idt,idc;
2564:   const MatScalar   *aa=a->a,*v;
2565:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2566:   const PetscScalar *b;

2569:   VecGetArrayRead(bb,&b);
2570:   VecGetArray(xx,&x);
2571:   t    = a->solve_work;

2573:   ISGetIndices(isrow,&rout); r = rout;
2574:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2576:   /* forward solve the lower triangular */
2577:   idx  = 7*(*r++);
2578:   t[0] = b[idx];   t[1] = b[1+idx];
2579:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2580:   t[5] = b[5+idx]; t[6] = b[6+idx];

2582:   for (i=1; i<n; i++) {
2583:     v   = aa + 49*ai[i];
2584:     vi  = aj + ai[i];
2585:     nz  = diag[i] - ai[i];
2586:     idx = 7*(*r++);
2587:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2588:     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2589:     while (nz--) {
2590:       idx = 7*(*vi++);
2591:       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2592:       x4  = t[3+idx];x5 = t[4+idx];
2593:       x6  = t[5+idx];x7 = t[6+idx];
2594:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2595:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2596:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2597:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2598:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2599:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2600:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2601:       v  += 49;
2602:     }
2603:     idx      = 7*i;
2604:     t[idx]   = s1;t[1+idx] = s2;
2605:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2606:     t[5+idx] = s6;t[6+idx] = s7;
2607:   }
2608:   /* backward solve the upper triangular */
2609:   for (i=n-1; i>=0; i--) {
2610:     v   = aa + 49*diag[i] + 49;
2611:     vi  = aj + diag[i] + 1;
2612:     nz  = ai[i+1] - diag[i] - 1;
2613:     idt = 7*i;
2614:     s1  = t[idt];  s2 = t[1+idt];
2615:     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2616:     s6  = t[5+idt];s7 = t[6+idt];
2617:     while (nz--) {
2618:       idx = 7*(*vi++);
2619:       x1  = t[idx];   x2 = t[1+idx];
2620:       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2621:       x6  = t[5+idx]; x7 = t[6+idx];
2622:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2623:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2624:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2625:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2626:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2627:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2628:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2629:       v  += 49;
2630:     }
2631:     idc    = 7*(*c--);
2632:     v      = aa + 49*diag[i];
2633:     x[idc] = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2634:                         v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2635:     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2636:                           v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2637:     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2638:                           v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2639:     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2640:                           v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2641:     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2642:                           v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2643:     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2644:                           v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2645:     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2646:                           v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2647:   }

2649:   ISRestoreIndices(isrow,&rout);
2650:   ISRestoreIndices(iscol,&cout);
2651:   VecRestoreArrayRead(bb,&b);
2652:   VecRestoreArray(xx,&x);
2653:   PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
2654:   return(0);
2655: }

2659: PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
2660: {
2661:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
2662:   IS                iscol=a->col,isrow=a->row;
2663:   PetscErrorCode    ierr;
2664:   const PetscInt    *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag;
2665:   const PetscInt    n=a->mbs,*rout,*cout,*vi;
2666:   PetscInt          i,nz,idx,idt,idc,m;
2667:   const MatScalar   *aa=a->a,*v;
2668:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2669:   const PetscScalar *b;

2672:   VecGetArrayRead(bb,&b);
2673:   VecGetArray(xx,&x);
2674:   t    = a->solve_work;

2676:   ISGetIndices(isrow,&rout); r = rout;
2677:   ISGetIndices(iscol,&cout); c = cout;

2679:   /* forward solve the lower triangular */
2680:   idx  = 7*r[0];
2681:   t[0] = b[idx];   t[1] = b[1+idx];
2682:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2683:   t[5] = b[5+idx]; t[6] = b[6+idx];

2685:   for (i=1; i<n; i++) {
2686:     v   = aa + 49*ai[i];
2687:     vi  = aj + ai[i];
2688:     nz  = ai[i+1] - ai[i];
2689:     idx = 7*r[i];
2690:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2691:     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2692:     for (m=0; m<nz; m++) {
2693:       idx = 7*vi[m];
2694:       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2695:       x4  = t[3+idx];x5 = t[4+idx];
2696:       x6  = t[5+idx];x7 = t[6+idx];
2697:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2698:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2699:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2700:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2701:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2702:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2703:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2704:       v  += 49;
2705:     }
2706:     idx      = 7*i;
2707:     t[idx]   = s1;t[1+idx] = s2;
2708:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2709:     t[5+idx] = s6;t[6+idx] = s7;
2710:   }
2711:   /* backward solve the upper triangular */
2712:   for (i=n-1; i>=0; i--) {
2713:     v   = aa + 49*(adiag[i+1]+1);
2714:     vi  = aj + adiag[i+1]+1;
2715:     nz  = adiag[i] - adiag[i+1] - 1;
2716:     idt = 7*i;
2717:     s1  = t[idt];  s2 = t[1+idt];
2718:     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2719:     s6  = t[5+idt];s7 = t[6+idt];
2720:     for (m=0; m<nz; m++) {
2721:       idx = 7*vi[m];
2722:       x1  = t[idx];   x2 = t[1+idx];
2723:       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2724:       x6  = t[5+idx]; x7 = t[6+idx];
2725:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2726:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2727:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2728:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2729:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2730:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2731:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2732:       v  += 49;
2733:     }
2734:     idc    = 7*c[i];
2735:     x[idc] = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2736:                         v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2737:     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2738:                           v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2739:     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2740:                           v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2741:     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2742:                           v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2743:     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2744:                           v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2745:     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2746:                           v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2747:     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2748:                           v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2749:   }

2751:   ISRestoreIndices(isrow,&rout);
2752:   ISRestoreIndices(iscol,&cout);
2753:   VecRestoreArrayRead(bb,&b);
2754:   VecRestoreArray(xx,&x);
2755:   PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
2756:   return(0);
2757: }

2761: PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2762: {
2763:   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
2764:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2765:   PetscErrorCode    ierr;
2766:   PetscInt          i,nz,idx,idt,jdx;
2767:   const MatScalar   *aa=a->a,*v;
2768:   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2769:   const PetscScalar *b;

2772:   VecGetArrayRead(bb,&b);
2773:   VecGetArray(xx,&x);
2774:   /* forward solve the lower triangular */
2775:   idx  = 0;
2776:   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
2777:   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
2778:   x[6] = b[6+idx];
2779:   for (i=1; i<n; i++) {
2780:     v   =  aa + 49*ai[i];
2781:     vi  =  aj + ai[i];
2782:     nz  =  diag[i] - ai[i];
2783:     idx =  7*i;
2784:     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2785:     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2786:     s7  =  b[6+idx];
2787:     while (nz--) {
2788:       jdx = 7*(*vi++);
2789:       x1  = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
2790:       x4  = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
2791:       x7  = x[6+jdx];
2792:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2793:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2794:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2795:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2796:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2797:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2798:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2799:       v  += 49;
2800:     }
2801:     x[idx]   = s1;
2802:     x[1+idx] = s2;
2803:     x[2+idx] = s3;
2804:     x[3+idx] = s4;
2805:     x[4+idx] = s5;
2806:     x[5+idx] = s6;
2807:     x[6+idx] = s7;
2808:   }
2809:   /* backward solve the upper triangular */
2810:   for (i=n-1; i>=0; i--) {
2811:     v   = aa + 49*diag[i] + 49;
2812:     vi  = aj + diag[i] + 1;
2813:     nz  = ai[i+1] - diag[i] - 1;
2814:     idt = 7*i;
2815:     s1  = x[idt];   s2 = x[1+idt];
2816:     s3  = x[2+idt]; s4 = x[3+idt];
2817:     s5  = x[4+idt]; s6 = x[5+idt];
2818:     s7  = x[6+idt];
2819:     while (nz--) {
2820:       idx = 7*(*vi++);
2821:       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
2822:       x4  = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
2823:       x7  = x[6+idx];
2824:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2825:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2826:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2827:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2828:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2829:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2830:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2831:       v  += 49;
2832:     }
2833:     v      = aa + 49*diag[i];
2834:     x[idt] = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
2835:              + v[28]*s5 + v[35]*s6 + v[42]*s7;
2836:     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
2837:                + v[29]*s5 + v[36]*s6 + v[43]*s7;
2838:     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
2839:                + v[30]*s5 + v[37]*s6 + v[44]*s7;
2840:     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
2841:                + v[31]*s5 + v[38]*s6 + v[45]*s7;
2842:     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
2843:                + v[32]*s5 + v[39]*s6 + v[46]*s7;
2844:     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
2845:                + v[33]*s5 + v[40]*s6 + v[47]*s7;
2846:     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
2847:                + v[34]*s5 + v[41]*s6 + v[48]*s7;
2848:   }

2850:   VecRestoreArrayRead(bb,&b);
2851:   VecRestoreArray(xx,&x);
2852:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
2853:   return(0);
2854: }

2858: PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
2859: {
2860:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2861:   const PetscInt    n  =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
2862:   PetscErrorCode    ierr;
2863:   PetscInt          i,k,nz,idx,jdx,idt;
2864:   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
2865:   const MatScalar   *aa=a->a,*v;
2866:   PetscScalar       *x;
2867:   const PetscScalar *b;
2868:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;

2871:   VecGetArrayRead(bb,&b);
2872:   VecGetArray(xx,&x);
2873:   /* forward solve the lower triangular */
2874:   idx  = 0;
2875:   x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2876:   x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
2877:   for (i=1; i<n; i++) {
2878:     v   = aa + bs2*ai[i];
2879:     vi  = aj + ai[i];
2880:     nz  = ai[i+1] - ai[i];
2881:     idx = bs*i;
2882:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2883:     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2884:     for (k=0; k<nz; k++) {
2885:       jdx = bs*vi[k];
2886:       x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2887:       x5  = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
2888:       s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2889:       s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2890:       s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2891:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2892:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2893:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2894:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2895:       v  +=  bs2;
2896:     }

2898:     x[idx]   = s1;
2899:     x[1+idx] = s2;
2900:     x[2+idx] = s3;
2901:     x[3+idx] = s4;
2902:     x[4+idx] = s5;
2903:     x[5+idx] = s6;
2904:     x[6+idx] = s7;
2905:   }

2907:   /* backward solve the upper triangular */
2908:   for (i=n-1; i>=0; i--) {
2909:     v   = aa + bs2*(adiag[i+1]+1);
2910:     vi  = aj + adiag[i+1]+1;
2911:     nz  = adiag[i] - adiag[i+1]-1;
2912:     idt = bs*i;
2913:     s1  = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2914:     s5  = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
2915:     for (k=0; k<nz; k++) {
2916:       idx = bs*vi[k];
2917:       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2918:       x5  = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
2919:       s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2920:       s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2921:       s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2922:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2923:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2924:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2925:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2926:       v  +=  bs2;
2927:     }
2928:     /* x = inv_diagonal*x */
2929:     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
2930:     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
2931:     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
2932:     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
2933:     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
2934:     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
2935:     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
2936:   }

2938:   VecRestoreArrayRead(bb,&b);
2939:   VecRestoreArray(xx,&x);
2940:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
2941:   return(0);
2942: }

2946: PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
2947: {
2948:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
2949:   IS                iscol=a->col,isrow=a->row;
2950:   PetscErrorCode    ierr;
2951:   const PetscInt    *r,*c,*rout,*cout;
2952:   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2953:   PetscInt          i,nz,idx,idt,idc;
2954:   const MatScalar   *aa=a->a,*v;
2955:   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2956:   const PetscScalar *b;

2959:   VecGetArrayRead(bb,&b);
2960:   VecGetArray(xx,&x);
2961:   t    = a->solve_work;

2963:   ISGetIndices(isrow,&rout); r = rout;
2964:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2966:   /* forward solve the lower triangular */
2967:   idx  = 6*(*r++);
2968:   t[0] = b[idx];   t[1] = b[1+idx];
2969:   t[2] = b[2+idx]; t[3] = b[3+idx];
2970:   t[4] = b[4+idx]; t[5] = b[5+idx];
2971:   for (i=1; i<n; i++) {
2972:     v   = aa + 36*ai[i];
2973:     vi  = aj + ai[i];
2974:     nz  = diag[i] - ai[i];
2975:     idx = 6*(*r++);
2976:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2977:     s5  = b[4+idx]; s6 = b[5+idx];
2978:     while (nz--) {
2979:       idx = 6*(*vi++);
2980:       x1  = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
2981:       x4  = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
2982:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2983:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2984:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2985:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2986:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2987:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2988:       v  += 36;
2989:     }
2990:     idx      = 6*i;
2991:     t[idx]   = s1;t[1+idx] = s2;
2992:     t[2+idx] = s3;t[3+idx] = s4;
2993:     t[4+idx] = s5;t[5+idx] = s6;
2994:   }
2995:   /* backward solve the upper triangular */
2996:   for (i=n-1; i>=0; i--) {
2997:     v   = aa + 36*diag[i] + 36;
2998:     vi  = aj + diag[i] + 1;
2999:     nz  = ai[i+1] - diag[i] - 1;
3000:     idt = 6*i;
3001:     s1  = t[idt];  s2 = t[1+idt];
3002:     s3  = t[2+idt];s4 = t[3+idt];
3003:     s5  = t[4+idt];s6 = t[5+idt];
3004:     while (nz--) {
3005:       idx = 6*(*vi++);
3006:       x1  = t[idx];   x2 = t[1+idx];
3007:       x3  = t[2+idx]; x4 = t[3+idx];
3008:       x5  = t[4+idx]; x6 = t[5+idx];
3009:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3010:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3011:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3012:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3013:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3014:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3015:       v  += 36;
3016:     }
3017:     idc    = 6*(*c--);
3018:     v      = aa + 36*diag[i];
3019:     x[idc] = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
3020:                         v[18]*s4+v[24]*s5+v[30]*s6;
3021:     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
3022:                           v[19]*s4+v[25]*s5+v[31]*s6;
3023:     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
3024:                           v[20]*s4+v[26]*s5+v[32]*s6;
3025:     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
3026:                           v[21]*s4+v[27]*s5+v[33]*s6;
3027:     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
3028:                           v[22]*s4+v[28]*s5+v[34]*s6;
3029:     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
3030:                           v[23]*s4+v[29]*s5+v[35]*s6;
3031:   }

3033:   ISRestoreIndices(isrow,&rout);
3034:   ISRestoreIndices(iscol,&cout);
3035:   VecRestoreArrayRead(bb,&b);
3036:   VecRestoreArray(xx,&x);
3037:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
3038:   return(0);
3039: }

3043: PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
3044: {
3045:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
3046:   IS                iscol=a->col,isrow=a->row;
3047:   PetscErrorCode    ierr;
3048:   const PetscInt    *r,*c,*rout,*cout;
3049:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3050:   PetscInt          i,nz,idx,idt,idc,m;
3051:   const MatScalar   *aa=a->a,*v;
3052:   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
3053:   const PetscScalar *b;

3056:   VecGetArrayRead(bb,&b);
3057:   VecGetArray(xx,&x);
3058:   t    = a->solve_work;

3060:   ISGetIndices(isrow,&rout); r = rout;
3061:   ISGetIndices(iscol,&cout); c = cout;

3063:   /* forward solve the lower triangular */
3064:   idx  = 6*r[0];
3065:   t[0] = b[idx];   t[1] = b[1+idx];
3066:   t[2] = b[2+idx]; t[3] = b[3+idx];
3067:   t[4] = b[4+idx]; t[5] = b[5+idx];
3068:   for (i=1; i<n; i++) {
3069:     v   = aa + 36*ai[i];
3070:     vi  = aj + ai[i];
3071:     nz  = ai[i+1] - ai[i];
3072:     idx = 6*r[i];
3073:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3074:     s5  = b[4+idx]; s6 = b[5+idx];
3075:     for (m=0; m<nz; m++) {
3076:       idx = 6*vi[m];
3077:       x1  = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
3078:       x4  = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
3079:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3080:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3081:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3082:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3083:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3084:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3085:       v  += 36;
3086:     }
3087:     idx      = 6*i;
3088:     t[idx]   = s1;t[1+idx] = s2;
3089:     t[2+idx] = s3;t[3+idx] = s4;
3090:     t[4+idx] = s5;t[5+idx] = s6;
3091:   }
3092:   /* backward solve the upper triangular */
3093:   for (i=n-1; i>=0; i--) {
3094:     v   = aa + 36*(adiag[i+1]+1);
3095:     vi  = aj + adiag[i+1]+1;
3096:     nz  = adiag[i] - adiag[i+1] - 1;
3097:     idt = 6*i;
3098:     s1  = t[idt];  s2 = t[1+idt];
3099:     s3  = t[2+idt];s4 = t[3+idt];
3100:     s5  = t[4+idt];s6 = t[5+idt];
3101:     for (m=0; m<nz; m++) {
3102:       idx = 6*vi[m];
3103:       x1  = t[idx];   x2 = t[1+idx];
3104:       x3  = t[2+idx]; x4 = t[3+idx];
3105:       x5  = t[4+idx]; x6 = t[5+idx];
3106:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3107:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3108:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3109:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3110:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3111:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3112:       v  += 36;
3113:     }
3114:     idc    = 6*c[i];
3115:     x[idc] = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
3116:                         v[18]*s4+v[24]*s5+v[30]*s6;
3117:     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
3118:                           v[19]*s4+v[25]*s5+v[31]*s6;
3119:     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
3120:                           v[20]*s4+v[26]*s5+v[32]*s6;
3121:     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
3122:                           v[21]*s4+v[27]*s5+v[33]*s6;
3123:     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
3124:                           v[22]*s4+v[28]*s5+v[34]*s6;
3125:     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
3126:                           v[23]*s4+v[29]*s5+v[35]*s6;
3127:   }

3129:   ISRestoreIndices(isrow,&rout);
3130:   ISRestoreIndices(iscol,&cout);
3131:   VecRestoreArrayRead(bb,&b);
3132:   VecRestoreArray(xx,&x);
3133:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
3134:   return(0);
3135: }

/*
   MatSolve_SeqBAIJ_6_NaturalOrdering_inplace - Two-sweep (forward, then
   backward) block triangular solve, fully unrolled for 6x6 blocks (36 scalars
   per block).  No index sets are consulted: block row i of bb corresponds
   directly to block row i of xx.

   Input:
     A  - matrix; A->data is read as a Mat_SeqBAIJ (fields i, j, a, diag, mbs, nz)
     bb - right-hand side, accessed read-only
   Output:
     xx - overwritten with the result

   Row i is split by diag[i]: blocks ai[i]..diag[i]-1 feed the forward sweep,
   blocks diag[i]+1..ai[i+1]-1 feed the backward sweep, and the block at
   diag[i] is applied as a final 6x6 multiply.

   Returns 0 on every path visible here.
*/
3139: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3140: {
3141:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3142:   PetscInt          i,nz,idx,idt,jdx;
        /* NOTE(review): ierr is declared but never assigned in this listing, so the
           return codes of the Vec calls below are not checked here. */
3143:   PetscErrorCode    ierr;
3144:   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
3145:   const MatScalar   *aa   =a->a,*v;
3146:   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3147:   const PetscScalar *b;

3150:   VecGetArrayRead(bb,&b);
3151:   VecGetArray(xx,&x);
3152:   /* forward solve the lower triangular */
        /* Block row 0 is copied unchanged.  NOTE(review): x[0..5] and b[0..5] are
           touched unconditionally, so this assumes n >= 1 - confirm for empty matrices. */
3153:   idx  = 0;
3154:   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
3155:   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
3156:   for (i=1; i<n; i++) {
          /* v/vi walk the values and block-column indices of row i; nz counts the
             blocks that precede diag[i] */
3157:     v   =  aa + 36*ai[i];
3158:     vi  =  aj + ai[i];
3159:     nz  =  diag[i] - ai[i];
3160:     idx =  6*i;
3161:     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
3162:     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
3163:     while (nz--) {
3164:       jdx = 6*(*vi++);
3165:       x1  = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
3166:       x4  = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
            /* s -= block * x; entry (r,c) of the block is read from v[r+6*c] */
3167:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3168:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3169:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3170:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3171:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3172:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3173:       v  += 36;
3174:     }
          /* no diagonal scaling is applied in this sweep */
3175:     x[idx]   = s1;
3176:     x[1+idx] = s2;
3177:     x[2+idx] = s3;
3178:     x[3+idx] = s4;
3179:     x[4+idx] = s5;
3180:     x[5+idx] = s6;
3181:   }
3182:   /* backward solve the upper triangular */
3183:   for (i=n-1; i>=0; i--) {
          /* start one block past diag[i]; nz counts the blocks after it in row i */
3184:     v   = aa + 36*diag[i] + 36;
3185:     vi  = aj + diag[i] + 1;
3186:     nz  = ai[i+1] - diag[i] - 1;
3187:     idt = 6*i;
3188:     s1  = x[idt];   s2 = x[1+idt];
3189:     s3  = x[2+idt]; s4 = x[3+idt];
3190:     s5  = x[4+idt]; s6 = x[5+idt];
3191:     while (nz--) {
3192:       idx = 6*(*vi++);
3193:       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
3194:       x4  = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
3195:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3196:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3197:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3198:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3199:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3200:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3201:       v  += 36;
3202:     }
          /* multiply by the block stored at diag[i]; presumably this block already
             holds the inverted diagonal block - verify against the factorization */
3203:     v        = aa + 36*diag[i];
3204:     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3205:     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3206:     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3207:     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3208:     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3209:     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3210:   }

3212:   VecRestoreArrayRead(bb,&b);
3213:   VecRestoreArray(xx,&x);
3214:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
3215:   return(0);
3216: }

/*
   MatSolve_SeqBAIJ_6_NaturalOrdering - Two-sweep block triangular solve for
   6x6 blocks without index sets, for the storage layout in which the forward
   sweep consumes the whole of row i (ai[i]..ai[i+1]-1) and the backward sweep
   is addressed through adiag.

   Input:
     A  - matrix; A->data is read as a Mat_SeqBAIJ (fields i, j, a, diag, mbs, bs2, nz)
     bb - right-hand side, accessed read-only
   Output:
     xx - overwritten with the result

   NOTE(review): bs and bs2 are read from the matrix, but the unrolled
   arithmetic hard-codes offsets 0..35 and six components, so bs == 6 and
   bs2 == 36 are assumed.

   Returns 0 on every path visible here.
*/
3220: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
3221: {
3222:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3223:   const PetscInt    n  =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
        /* NOTE(review): ierr is declared but never assigned in this listing. */
3224:   PetscErrorCode    ierr;
3225:   PetscInt          i,k,nz,idx,jdx,idt;
3226:   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
3227:   const MatScalar   *aa=a->a,*v;
3228:   PetscScalar       *x;
3229:   const PetscScalar *b;
3230:   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;

3233:   VecGetArrayRead(bb,&b);
3234:   VecGetArray(xx,&x);
3235:   /* forward solve the lower triangular */
        /* Block row 0 is copied unchanged.  NOTE(review): executed even when n == 0,
           so n >= 1 is assumed. */
3236:   idx  = 0;
3237:   x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3238:   x[4] = b[4+idx];x[5] = b[5+idx];
3239:   for (i=1; i<n; i++) {
          /* every block between ai[i] and ai[i+1] takes part in the forward sweep */
3240:     v   = aa + bs2*ai[i];
3241:     vi  = aj + ai[i];
3242:     nz  = ai[i+1] - ai[i];
3243:     idx = bs*i;
3244:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3245:     s5  = b[4+idx];s6 = b[5+idx];
3246:     for (k=0; k<nz; k++) {
3247:       jdx = bs*vi[k];
3248:       x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3249:       x5  = x[4+jdx]; x6 = x[5+jdx];
            /* s -= block * x; entry (r,c) of the block is read from v[r+6*c] */
3250:       s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
3251:       s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
3252:       s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
3253:       s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
3254:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
3255:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
3256:       v  +=  bs2;
3257:     }

3259:     x[idx]   = s1;
3260:     x[1+idx] = s2;
3261:     x[2+idx] = s3;
3262:     x[3+idx] = s4;
3263:     x[4+idx] = s5;
3264:     x[5+idx] = s6;
3265:   }

3267:   /* backward solve the upper triangular */
3268:   for (i=n-1; i>=0; i--) {
          /* row i occupies block indices adiag[i+1]+1 .. adiag[i]; nz is computed as
             adiag[i]-adiag[i+1]-1, so adiag is expected to decrease as i grows */
3269:     v   = aa + bs2*(adiag[i+1]+1);
3270:     vi  = aj + adiag[i+1]+1;
3271:     nz  = adiag[i] - adiag[i+1]-1;
3272:     idt = bs*i;
3273:     s1  = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3274:     s5  = x[4+idt];s6 = x[5+idt];
3275:     for (k=0; k<nz; k++) {
3276:       idx = bs*vi[k];
3277:       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3278:       x5  = x[4+idx];x6 = x[5+idx];
3279:       s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
3280:       s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
3281:       s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
3282:       s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
3283:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
3284:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
3285:       v  +=  bs2;
3286:     }
          /* after nz steps v sits on block index adiag[i], which is applied next */
3287:     /* x = inv_diagonal*x */
3288:     x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3289:     x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3290:     x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3291:     x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3292:     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3293:     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3294:   }

3296:   VecRestoreArrayRead(bb,&b);
3297:   VecRestoreArray(xx,&x);
3298:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
3299:   return(0);
3300: }

/*
   MatSolve_SeqBAIJ_5_inplace - Two-sweep block triangular solve, unrolled for
   5x5 blocks (25 scalars per block), using the index sets a->row and a->col.

   Input:
     A  - matrix; A->data is read as a Mat_SeqBAIJ (fields i, j, a, diag, mbs,
          row, col, solve_work, nz)
     bb - right-hand side, accessed read-only; block row r[i] of bb feeds step i
   Output:
     xx - overwritten; the result of step i is stored in block row c[i] of xx

   Intermediate values live in a->solve_work (t), indexed by the step number i.
   Row i is split by diag[i] in the same way for both sweeps: blocks before
   diag[i] are used going forward, blocks after it going backward, and the
   block at diag[i] is applied as the final 5x5 multiply.

   Returns 0 on every path visible here.
*/
3304: PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
3305: {
3306:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
3307:   IS                iscol=a->col,isrow=a->row;
        /* NOTE(review): ierr is declared but never assigned in this listing. */
3308:   PetscErrorCode    ierr;
3309:   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
3310:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3311:   PetscInt          i,nz,idx,idt,idc;
3312:   const MatScalar   *aa=a->a,*v;
3313:   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3314:   const PetscScalar *b;

3317:   VecGetArrayRead(bb,&b);
3318:   VecGetArray(xx,&x);
3319:   t    = a->solve_work;

        /* r is consumed front to back during the forward sweep; c starts at its
           last entry because the backward sweep walks it with *c-- */
3321:   ISGetIndices(isrow,&rout); r = rout;
3322:   ISGetIndices(iscol,&cout); c = cout + (n-1);

3324:   /* forward solve the lower triangular */
        /* NOTE(review): *r and t[0..4] are accessed before the loop, so n >= 1 is assumed. */
3325:   idx  = 5*(*r++);
3326:   t[0] = b[idx];   t[1] = b[1+idx];
3327:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3328:   for (i=1; i<n; i++) {
3329:     v   = aa + 25*ai[i];
3330:     vi  = aj + ai[i];
3331:     nz  = diag[i] - ai[i];
3332:     idx = 5*(*r++);
3333:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3334:     s5  = b[4+idx];
3335:     while (nz--) {
            /* idx is reused: here it addresses the work vector by block column */
3336:       idx = 5*(*vi++);
3337:       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3338:       x4  = t[3+idx];x5 = t[4+idx];
            /* s -= block * x; entry (r,c) of the block is read from v[r+5*c] */
3339:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3340:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3341:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3342:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3343:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3344:       v  += 25;
3345:     }
3346:     idx      = 5*i;
3347:     t[idx]   = s1;t[1+idx] = s2;
3348:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3349:   }
3350:   /* backward solve the upper triangular */
3351:   for (i=n-1; i>=0; i--) {
3352:     v   = aa + 25*diag[i] + 25;
3353:     vi  = aj + diag[i] + 1;
3354:     nz  = ai[i+1] - diag[i] - 1;
3355:     idt = 5*i;
3356:     s1  = t[idt];  s2 = t[1+idt];
3357:     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3358:     while (nz--) {
3359:       idx = 5*(*vi++);
3360:       x1  = t[idx];   x2 = t[1+idx];
3361:       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3362:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3363:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3364:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3365:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3366:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3367:       v  += 25;
3368:     }
          /* multiply by the block at diag[i] (presumably the inverted diagonal block -
             verify) and store the result both in t and in block row *c of x */
3369:     idc    = 5*(*c--);
3370:     v      = aa + 25*diag[i];
3371:     x[idc] = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3372:                         v[15]*s4+v[20]*s5;
3373:     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3374:                           v[16]*s4+v[21]*s5;
3375:     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3376:                           v[17]*s4+v[22]*s5;
3377:     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3378:                           v[18]*s4+v[23]*s5;
3379:     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3380:                           v[19]*s4+v[24]*s5;
3381:   }

3383:   ISRestoreIndices(isrow,&rout);
3384:   ISRestoreIndices(iscol,&cout);
3385:   VecRestoreArrayRead(bb,&b);
3386:   VecRestoreArray(xx,&x);
3387:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
3388:   return(0);
3389: }

/*
   MatSolve_SeqBAIJ_5 - Two-sweep block triangular solve, unrolled for 5x5
   blocks, using the index sets a->row and a->col, for the storage layout in
   which the forward sweep consumes the whole of row i (ai[i]..ai[i+1]-1) and
   the backward sweep is addressed through adiag.

   Input:
     A  - matrix; A->data is read as a Mat_SeqBAIJ (fields i, j, a, diag, mbs,
          row, col, solve_work, nz)
     bb - right-hand side, accessed read-only; block row r[i] of bb feeds step i
   Output:
     xx - overwritten; the result of step i is stored in block row c[i] of xx

   Intermediate values live in a->solve_work (t), indexed by the step number i.

   Returns 0 on every path visible here.
*/
3393: PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
3394: {
3395:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
3396:   IS                iscol=a->col,isrow=a->row;
        /* NOTE(review): ierr is declared but never assigned in this listing. */
3397:   PetscErrorCode    ierr;
3398:   const PetscInt    *r,*c,*rout,*cout;
3399:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3400:   PetscInt          i,nz,idx,idt,idc,m;
3401:   const MatScalar   *aa=a->a,*v;
3402:   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3403:   const PetscScalar *b;

3406:   VecGetArrayRead(bb,&b);
3407:   VecGetArray(xx,&x);
3408:   t    = a->solve_work;

3410:   ISGetIndices(isrow,&rout); r = rout;
3411:   ISGetIndices(iscol,&cout); c = cout;

3413:   /* forward solve the lower triangular */
        /* NOTE(review): r[0] and t[0..4] are accessed before the loop, so n >= 1 is assumed. */
3414:   idx  = 5*r[0];
3415:   t[0] = b[idx];   t[1] = b[1+idx];
3416:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3417:   for (i=1; i<n; i++) {
          /* every block between ai[i] and ai[i+1] takes part in the forward sweep */
3418:     v   = aa + 25*ai[i];
3419:     vi  = aj + ai[i];
3420:     nz  = ai[i+1] - ai[i];
3421:     idx = 5*r[i];
3422:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3423:     s5  = b[4+idx];
3424:     for (m=0; m<nz; m++) {
            /* idx is reused: here it addresses the work vector by block column */
3425:       idx = 5*vi[m];
3426:       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3427:       x4  = t[3+idx];x5 = t[4+idx];
            /* s -= block * x; entry (r,c) of the block is read from v[r+5*c] */
3428:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3429:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3430:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3431:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3432:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3433:       v  += 25;
3434:     }
3435:     idx      = 5*i;
3436:     t[idx]   = s1;t[1+idx] = s2;
3437:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3438:   }
3439:   /* backward solve the upper triangular */
3440:   for (i=n-1; i>=0; i--) {
          /* row i occupies block indices adiag[i+1]+1 .. adiag[i]; nz is computed as
             adiag[i]-adiag[i+1]-1, so adiag is expected to decrease as i grows */
3441:     v   = aa + 25*(adiag[i+1]+1);
3442:     vi  = aj + adiag[i+1]+1;
3443:     nz  = adiag[i] - adiag[i+1] - 1;
3444:     idt = 5*i;
3445:     s1  = t[idt];  s2 = t[1+idt];
3446:     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3447:     for (m=0; m<nz; m++) {
3448:       idx = 5*vi[m];
3449:       x1  = t[idx];   x2 = t[1+idx];
3450:       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3451:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3452:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3453:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3454:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3455:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3456:       v  += 25;
3457:     }
          /* after nz steps v sits on block index adiag[i]; it is applied as a 5x5
             multiply (presumably the inverted diagonal block - verify) and the
             result goes to both t and block row c[i] of x */
3458:     idc    = 5*c[i];
3459:     x[idc] = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3460:                         v[15]*s4+v[20]*s5;
3461:     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3462:                           v[16]*s4+v[21]*s5;
3463:     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3464:                           v[17]*s4+v[22]*s5;
3465:     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3466:                           v[18]*s4+v[23]*s5;
3467:     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3468:                           v[19]*s4+v[24]*s5;
3469:   }

3471:   ISRestoreIndices(isrow,&rout);
3472:   ISRestoreIndices(iscol,&cout);
3473:   VecRestoreArrayRead(bb,&b);
3474:   VecRestoreArray(xx,&x);
3475:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
3476:   return(0);
3477: }

/*
   MatSolve_SeqBAIJ_5_NaturalOrdering_inplace - Two-sweep block triangular
   solve, unrolled for 5x5 blocks (25 scalars per block).  No index sets are
   consulted: block row i of bb corresponds directly to block row i of xx,
   and xx itself serves as the working vector.

   Input:
     A  - matrix; A->data is read as a Mat_SeqBAIJ (fields i, j, a, diag, mbs, nz)
     bb - right-hand side, accessed read-only
   Output:
     xx - overwritten with the result

   Row i is split by diag[i]: blocks ai[i]..diag[i]-1 feed the forward sweep,
   blocks diag[i]+1..ai[i+1]-1 feed the backward sweep, and the block at
   diag[i] is applied as a final 5x5 multiply.

   Returns 0 on every path visible here.
*/
3481: PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3482: {
3483:   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
3484:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3485:   PetscInt          i,nz,idx,idt,jdx;
        /* NOTE(review): ierr is declared but never assigned in this listing. */
3486:   PetscErrorCode    ierr;
3487:   const MatScalar   *aa=a->a,*v;
3488:   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3489:   const PetscScalar *b;

3492:   VecGetArrayRead(bb,&b);
3493:   VecGetArray(xx,&x);
3494:   /* forward solve the lower triangular */
        /* Block row 0 is copied unchanged.  NOTE(review): executed even when n == 0,
           so n >= 1 is assumed. */
3495:   idx  = 0;
3496:   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3497:   for (i=1; i<n; i++) {
          /* nz counts the blocks of row i that precede diag[i] */
3498:     v   =  aa + 25*ai[i];
3499:     vi  =  aj + ai[i];
3500:     nz  =  diag[i] - ai[i];
3501:     idx =  5*i;
3502:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3503:     while (nz--) {
3504:       jdx = 5*(*vi++);
3505:       x1  = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
            /* s -= block * x; entry (r,c) of the block is read from v[r+5*c] */
3506:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3507:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3508:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3509:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3510:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3511:       v  += 25;
3512:     }
3513:     x[idx]   = s1;
3514:     x[1+idx] = s2;
3515:     x[2+idx] = s3;
3516:     x[3+idx] = s4;
3517:     x[4+idx] = s5;
3518:   }
3519:   /* backward solve the upper triangular */
3520:   for (i=n-1; i>=0; i--) {
          /* start one block past diag[i]; nz counts the blocks after it in row i */
3521:     v   = aa + 25*diag[i] + 25;
3522:     vi  = aj + diag[i] + 1;
3523:     nz  = ai[i+1] - diag[i] - 1;
3524:     idt = 5*i;
3525:     s1  = x[idt];  s2 = x[1+idt];
3526:     s3  = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3527:     while (nz--) {
3528:       idx = 5*(*vi++);
3529:       x1  = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3530:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3531:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3532:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3533:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3534:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3535:       v  += 25;
3536:     }
          /* multiply by the block stored at diag[i]; presumably this block already
             holds the inverted diagonal block - verify against the factorization */
3537:     v        = aa + 25*diag[i];
3538:     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3539:     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3540:     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3541:     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3542:     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
3543:   }

3545:   VecRestoreArrayRead(bb,&b);
3546:   VecRestoreArray(xx,&x);
3547:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
3548:   return(0);
3549: }

/*
   MatSolve_SeqBAIJ_5_NaturalOrdering - Two-sweep block triangular solve,
   unrolled for 5x5 blocks, without index sets, for the storage layout in
   which the forward sweep consumes the whole of row i (ai[i]..ai[i+1]-1) and
   the backward sweep is addressed through adiag.

   Input:
     A  - matrix; A->data is read as a Mat_SeqBAIJ (fields i, j, a, diag, mbs, nz)
     bb - right-hand side, accessed read-only
   Output:
     xx - overwritten with the result; it also serves as the working vector

   Returns 0 on every path visible here.
*/
3553: PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
3554: {
3555:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3556:   const PetscInt    n  = a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3557:   PetscInt          i,k,nz,idx,idt,jdx;
        /* NOTE(review): ierr is declared but never assigned in this listing. */
3558:   PetscErrorCode    ierr;
3559:   const MatScalar   *aa=a->a,*v;
3560:   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3561:   const PetscScalar *b;

3564:   VecGetArrayRead(bb,&b);
3565:   VecGetArray(xx,&x);
3566:   /* forward solve the lower triangular */
        /* Block row 0 is copied unchanged.  NOTE(review): executed even when n == 0,
           so n >= 1 is assumed. */
3567:   idx  = 0;
3568:   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3569:   for (i=1; i<n; i++) {
          /* every block between ai[i] and ai[i+1] takes part in the forward sweep */
3570:     v   = aa + 25*ai[i];
3571:     vi  = aj + ai[i];
3572:     nz  = ai[i+1] - ai[i];
3573:     idx = 5*i;
3574:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3575:     for (k=0; k<nz; k++) {
3576:       jdx = 5*vi[k];
3577:       x1  = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
            /* s -= block * x; entry (r,c) of the block is read from v[r+5*c] */
3578:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3579:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3580:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3581:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3582:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3583:       v  += 25;
3584:     }
3585:     x[idx]   = s1;
3586:     x[1+idx] = s2;
3587:     x[2+idx] = s3;
3588:     x[3+idx] = s4;
3589:     x[4+idx] = s5;
3590:   }

3592:   /* backward solve the upper triangular */
3593:   for (i=n-1; i>=0; i--) {
          /* row i occupies block indices adiag[i+1]+1 .. adiag[i]; nz is computed as
             adiag[i]-adiag[i+1]-1, so adiag is expected to decrease as i grows */
3594:     v   = aa + 25*(adiag[i+1]+1);
3595:     vi  = aj + adiag[i+1]+1;
3596:     nz  = adiag[i] - adiag[i+1]-1;
3597:     idt = 5*i;
3598:     s1  = x[idt];  s2 = x[1+idt];
3599:     s3  = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3600:     for (k=0; k<nz; k++) {
3601:       idx = 5*vi[k];
3602:       x1  = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3603:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3604:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3605:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3606:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3607:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3608:       v  += 25;
3609:     }
          /* after nz steps v sits on block index adiag[i], which is applied next */
3610:     /* x = inv_diagonal*x */
3611:     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3612:     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3613:     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3614:     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3615:     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
3616:   }

3618:   VecRestoreArrayRead(bb,&b);
3619:   VecRestoreArray(xx,&x);
3620:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
3621:   return(0);
3622: }

/*
   MatSolve_SeqBAIJ_4_inplace - Two-sweep block triangular solve, unrolled for
   4x4 blocks (16 scalars per block), using the index sets a->row and a->col.

   Input:
     A  - matrix; A->data is read as a Mat_SeqBAIJ (fields i, j, a, diag, mbs,
          row, col, solve_work, nz)
     bb - right-hand side, accessed read-only; block row r[i] of bb feeds step i
   Output:
     xx - overwritten; the result of step i is stored in block row c[i] of xx

   Intermediate values live in a->solve_work (t), indexed by the step number i.
   Row i is split by diag[i]: blocks before it are used in the forward sweep,
   blocks after it in the backward sweep, and the block at diag[i] is applied
   as the final 4x4 multiply.

   Returns 0 on every path visible here.
*/
3626: PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
3627: {
3628:   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
3629:   IS                iscol=a->col,isrow=a->row;
        /* NOTE(review): ierr is declared but never assigned in this listing. */
3630:   PetscErrorCode    ierr;
3631:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3632:   PetscInt          i,nz,idx,idt,idc;
3633:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3634:   const MatScalar   *aa=a->a,*v;
3635:   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3636:   const PetscScalar *b;

3639:   VecGetArrayRead(bb,&b);
3640:   VecGetArray(xx,&x);
3641:   t    = a->solve_work;

        /* r is consumed front to back during the forward sweep; c starts at its
           last entry because the backward sweep walks it with *c-- */
3643:   ISGetIndices(isrow,&rout); r = rout;
3644:   ISGetIndices(iscol,&cout); c = cout + (n-1);

3646:   /* forward solve the lower triangular */
        /* NOTE(review): *r and t[0..3] are accessed before the loop, so n >= 1 is assumed. */
3647:   idx  = 4*(*r++);
3648:   t[0] = b[idx];   t[1] = b[1+idx];
3649:   t[2] = b[2+idx]; t[3] = b[3+idx];
3650:   for (i=1; i<n; i++) {
3651:     v   = aa + 16*ai[i];
3652:     vi  = aj + ai[i];
3653:     nz  = diag[i] - ai[i];
3654:     idx = 4*(*r++);
3655:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3656:     while (nz--) {
            /* idx is reused: here it addresses the work vector by block column */
3657:       idx = 4*(*vi++);
3658:       x1  = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
            /* s -= block * x; entry (r,c) of the block is read from v[r+4*c] */
3659:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3660:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3661:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3662:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3663:       v  += 16;
3664:     }
3665:     idx      = 4*i;
3666:     t[idx]   = s1;t[1+idx] = s2;
3667:     t[2+idx] = s3;t[3+idx] = s4;
3668:   }
3669:   /* backward solve the upper triangular */
3670:   for (i=n-1; i>=0; i--) {
3671:     v   = aa + 16*diag[i] + 16;
3672:     vi  = aj + diag[i] + 1;
3673:     nz  = ai[i+1] - diag[i] - 1;
3674:     idt = 4*i;
3675:     s1  = t[idt];  s2 = t[1+idt];
3676:     s3  = t[2+idt];s4 = t[3+idt];
3677:     while (nz--) {
3678:       idx = 4*(*vi++);
3679:       x1  = t[idx];   x2 = t[1+idx];
3680:       x3  = t[2+idx]; x4 = t[3+idx];
3681:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3682:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3683:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3684:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3685:       v  += 16;
3686:     }
          /* multiply by the block at diag[i] (presumably the inverted diagonal block -
             verify) and store the result both in t and in block row *c of x */
3687:     idc      = 4*(*c--);
3688:     v        = aa + 16*diag[i];
3689:     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3690:     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3691:     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3692:     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3693:   }

3695:   ISRestoreIndices(isrow,&rout);
3696:   ISRestoreIndices(iscol,&cout);
3697:   VecRestoreArrayRead(bb,&b);
3698:   VecRestoreArray(xx,&x);
3699:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
3700:   return(0);
3701: }

/*
   MatSolve_SeqBAIJ_4 - Two-sweep block triangular solve, unrolled for 4x4
   blocks, using the index sets a->row and a->col, for the storage layout in
   which the forward sweep consumes the whole of row i (ai[i]..ai[i+1]-1) and
   the backward sweep is addressed through adiag.

   Input:
     A  - matrix; A->data is read as a Mat_SeqBAIJ (fields i, j, a, diag, mbs,
          row, col, solve_work, nz)
     bb - right-hand side, accessed read-only; block row r[i] of bb feeds step i
   Output:
     xx - overwritten; the result of step i is stored in block row c[i] of xx

   Intermediate values live in a->solve_work (t), indexed by the step number i.

   Returns 0 on every path visible here.
*/
3705: PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
3706: {
3707:   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
3708:   IS                iscol=a->col,isrow=a->row;
        /* NOTE(review): ierr is declared but never assigned in this listing. */
3709:   PetscErrorCode    ierr;
3710:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3711:   PetscInt          i,nz,idx,idt,idc,m;
3712:   const PetscInt    *r,*c,*rout,*cout;
3713:   const MatScalar   *aa=a->a,*v;
3714:   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3715:   const PetscScalar *b;

3718:   VecGetArrayRead(bb,&b);
3719:   VecGetArray(xx,&x);
3720:   t    = a->solve_work;

3722:   ISGetIndices(isrow,&rout); r = rout;
3723:   ISGetIndices(iscol,&cout); c = cout;

3725:   /* forward solve the lower triangular */
        /* NOTE(review): r[0] and t[0..3] are accessed before the loop, so n >= 1 is assumed. */
3726:   idx  = 4*r[0];
3727:   t[0] = b[idx];   t[1] = b[1+idx];
3728:   t[2] = b[2+idx]; t[3] = b[3+idx];
3729:   for (i=1; i<n; i++) {
          /* every block between ai[i] and ai[i+1] takes part in the forward sweep */
3730:     v   = aa + 16*ai[i];
3731:     vi  = aj + ai[i];
3732:     nz  = ai[i+1] - ai[i];
3733:     idx = 4*r[i];
3734:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3735:     for (m=0; m<nz; m++) {
            /* idx is reused: here it addresses the work vector by block column */
3736:       idx = 4*vi[m];
3737:       x1  = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
            /* s -= block * x; entry (r,c) of the block is read from v[r+4*c] */
3738:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3739:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3740:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3741:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3742:       v  += 16;
3743:     }
3744:     idx      = 4*i;
3745:     t[idx]   = s1;t[1+idx] = s2;
3746:     t[2+idx] = s3;t[3+idx] = s4;
3747:   }
3748:   /* backward solve the upper triangular */
3749:   for (i=n-1; i>=0; i--) {
          /* row i occupies block indices adiag[i+1]+1 .. adiag[i]; nz is computed as
             adiag[i]-adiag[i+1]-1, so adiag is expected to decrease as i grows */
3750:     v   = aa + 16*(adiag[i+1]+1);
3751:     vi  = aj + adiag[i+1]+1;
3752:     nz  = adiag[i] - adiag[i+1] - 1;
3753:     idt = 4*i;
3754:     s1  = t[idt];  s2 = t[1+idt];
3755:     s3  = t[2+idt];s4 = t[3+idt];
3756:     for (m=0; m<nz; m++) {
3757:       idx = 4*vi[m];
3758:       x1  = t[idx];   x2 = t[1+idx];
3759:       x3  = t[2+idx]; x4 = t[3+idx];
3760:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3761:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3762:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3763:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3764:       v  += 16;
3765:     }
          /* after nz steps v sits on block index adiag[i]; it is applied as a 4x4
             multiply (presumably the inverted diagonal block - verify) and the
             result goes to both t and block row c[i] of x */
3766:     idc      = 4*c[i];
3767:     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3768:     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3769:     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3770:     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3771:   }

3773:   ISRestoreIndices(isrow,&rout);
3774:   ISRestoreIndices(iscol,&cout);
3775:   VecRestoreArrayRead(bb,&b);
3776:   VecRestoreArray(xx,&x);
3777:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
3778:   return(0);
3779: }

/*
   MatSolve_SeqBAIJ_4_Demotion - Variant of the 4x4-block, index-set-based
   two-sweep solve in which all accumulators and the work vector are of type
   MatScalar rather than PetscScalar.  Entries of bb are cast to MatScalar on
   load and results are cast back to PetscScalar when written to xx.

   Input:
     A  - matrix; A->data is read as a Mat_SeqBAIJ (fields i, j, a, diag, mbs,
          row, col, solve_work, nz)
     bb - right-hand side, accessed read-only; block row r[i] of bb feeds step i
   Output:
     xx - overwritten; the result of step i is stored in block row c[i] of xx

   NOTE(review): presumably intended for builds where MatScalar is narrower
   than PetscScalar (hence "Demotion") - confirm against the type definitions.

   Returns 0 on every path visible here.
*/
3783: PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
3784: {
3785:   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
3786:   IS                iscol=a->col,isrow=a->row;
        /* NOTE(review): ierr is declared but never assigned in this listing. */
3787:   PetscErrorCode    ierr;
3788:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3789:   PetscInt          i,nz,idx,idt,idc;
3790:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3791:   const MatScalar   *aa=a->a,*v;
3792:   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
3793:   PetscScalar       *x;
3794:   const PetscScalar *b;

3797:   VecGetArrayRead(bb,&b);
3798:   VecGetArray(xx,&x);
        /* the work buffer is reinterpreted as an array of MatScalar */
3799:   t    = (MatScalar*)a->solve_work;

        /* r is consumed front to back during the forward sweep; c starts at its
           last entry because the backward sweep walks it with *c-- */
3801:   ISGetIndices(isrow,&rout); r = rout;
3802:   ISGetIndices(iscol,&cout); c = cout + (n-1);

3804:   /* forward solve the lower triangular */
        /* NOTE(review): *r and t[0..3] are accessed before the loop, so n >= 1 is assumed. */
3805:   idx  = 4*(*r++);
3806:   t[0] = (MatScalar)b[idx];
3807:   t[1] = (MatScalar)b[1+idx];
3808:   t[2] = (MatScalar)b[2+idx];
3809:   t[3] = (MatScalar)b[3+idx];
3810:   for (i=1; i<n; i++) {
3811:     v   = aa + 16*ai[i];
3812:     vi  = aj + ai[i];
3813:     nz  = diag[i] - ai[i];
3814:     idx = 4*(*r++);
3815:     s1  = (MatScalar)b[idx];
3816:     s2  = (MatScalar)b[1+idx];
3817:     s3  = (MatScalar)b[2+idx];
3818:     s4  = (MatScalar)b[3+idx];
3819:     while (nz--) {
            /* idx is reused: here it addresses the work vector by block column */
3820:       idx = 4*(*vi++);
3821:       x1  = t[idx];
3822:       x2  = t[1+idx];
3823:       x3  = t[2+idx];
3824:       x4  = t[3+idx];
            /* s -= block * x; entry (r,c) of the block is read from v[r+4*c] */
3825:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3826:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3827:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3828:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3829:       v  += 16;
3830:     }
3831:     idx      = 4*i;
3832:     t[idx]   = s1;
3833:     t[1+idx] = s2;
3834:     t[2+idx] = s3;
3835:     t[3+idx] = s4;
3836:   }
3837:   /* backward solve the upper triangular */
3838:   for (i=n-1; i>=0; i--) {
3839:     v   = aa + 16*diag[i] + 16;
3840:     vi  = aj + diag[i] + 1;
3841:     nz  = ai[i+1] - diag[i] - 1;
3842:     idt = 4*i;
3843:     s1  = t[idt];
3844:     s2  = t[1+idt];
3845:     s3  = t[2+idt];
3846:     s4  = t[3+idt];
3847:     while (nz--) {
3848:       idx = 4*(*vi++);
3849:       x1  = t[idx];
3850:       x2  = t[1+idx];
3851:       x3  = t[2+idx];
3852:       x4  = t[3+idx];
3853:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3854:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3855:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3856:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3857:       v  += 16;
3858:     }
          /* multiply by the block at diag[i] (presumably the inverted diagonal block -
             verify), keep the MatScalar result in t, then copy it to block row *c of x */
3859:     idc      = 4*(*c--);
3860:     v        = aa + 16*diag[i];
3861:     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3862:     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3863:     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3864:     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3865:     x[idc]   = (PetscScalar)t[idt];
3866:     x[1+idc] = (PetscScalar)t[1+idt];
3867:     x[2+idc] = (PetscScalar)t[2+idt];
3868:     x[3+idc] = (PetscScalar)t[3+idt];
3869:   }

3871:   ISRestoreIndices(isrow,&rout);
3872:   ISRestoreIndices(iscol,&cout);
3873:   VecRestoreArrayRead(bb,&b);
3874:   VecRestoreArray(xx,&x);
3875:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
3876:   return(0);
3877: }

3879: #if defined(PETSC_HAVE_SSE)

3881: #include PETSC_HAVE_SSE

3885: PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
3886: {
3887:   /*
3888:      Note: This code uses demotion of double
3889:      to float when performing the mixed-mode computation.
3890:      This may not be numerically reasonable for all applications.
3891:   */
3892:   Mat_SeqBAIJ    *a   = (Mat_SeqBAIJ*)A->data;
3893:   IS             iscol=a->col,isrow=a->row;
3895:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
3896:   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3897:   MatScalar      *aa=a->a,*v;
3898:   PetscScalar    *x,*b,*t;

3900:   /* Make space in temp stack for 16 Byte Aligned arrays */
3901:   float         ssealignedspace[11],*tmps,*tmpx;
3902:   unsigned long offset;

3905:   SSE_SCOPE_BEGIN;

3907:   offset = (unsigned long)ssealignedspace % 16;
3908:   if (offset) offset = (16 - offset)/4;
3909:   tmps = &ssealignedspace[offset];
3910:   tmpx = &ssealignedspace[offset+4];
3911:   PREFETCH_NTA(aa+16*ai[1]);

3913:   VecGetArray(bb,&b);
3914:   VecGetArray(xx,&x);
3915:   t    = a->solve_work;

3917:   ISGetIndices(isrow,&rout); r = rout;
3918:   ISGetIndices(iscol,&cout); c = cout + (n-1);

3920:   /* forward solve the lower triangular */
3921:   idx  = 4*(*r++);
3922:   t[0] = b[idx];   t[1] = b[1+idx];
3923:   t[2] = b[2+idx]; t[3] = b[3+idx];
3924:   v    =  aa + 16*ai[1];

3926:   for (i=1; i<n; ) {
3927:     PREFETCH_NTA(&v[8]);
3928:     vi  =  aj      + ai[i];
3929:     nz  =  diag[i] - ai[i];
3930:     idx =  4*(*r++);

3932:     /* Demote sum from double to float */
3933:     CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
3934:     LOAD_PS(tmps,XMM7);

3936:     while (nz--) {
3937:       PREFETCH_NTA(&v[16]);
3938:       idx = 4*(*vi++);

3940:       /* Demote solution (so far) from double to float */
3941:       CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);

3943:       /* 4x4 Matrix-Vector product with negative accumulation: */
3944:       SSE_INLINE_BEGIN_2(tmpx,v)
3945:       SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

3947:       /* First Column */
3948:       SSE_COPY_PS(XMM0,XMM6)
3949:       SSE_SHUFFLE(XMM0,XMM0,0x00)
3950:       SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3951:       SSE_SUB_PS(XMM7,XMM0)

3953:       /* Second Column */
3954:       SSE_COPY_PS(XMM1,XMM6)
3955:       SSE_SHUFFLE(XMM1,XMM1,0x55)
3956:       SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3957:       SSE_SUB_PS(XMM7,XMM1)

3959:       SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

3961:       /* Third Column */
3962:       SSE_COPY_PS(XMM2,XMM6)
3963:       SSE_SHUFFLE(XMM2,XMM2,0xAA)
3964:       SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3965:       SSE_SUB_PS(XMM7,XMM2)

3967:       /* Fourth Column */
3968:       SSE_COPY_PS(XMM3,XMM6)
3969:       SSE_SHUFFLE(XMM3,XMM3,0xFF)
3970:       SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3971:       SSE_SUB_PS(XMM7,XMM3)
3972:       SSE_INLINE_END_2

3974:       v += 16;
3975:     }
3976:     idx = 4*i;
3977:     v   = aa + 16*ai[++i];
3978:     PREFETCH_NTA(v);
3979:     STORE_PS(tmps,XMM7);

3981:     /* Promote result from float to double */
3982:     CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
3983:   }
3984:   /* backward solve the upper triangular */
3985:   idt  = 4*(n-1);
3986:   ai16 = 16*diag[n-1];
3987:   v    = aa + ai16 + 16;
3988:   for (i=n-1; i>=0; ) {
3989:     PREFETCH_NTA(&v[8]);
3990:     vi = aj + diag[i] + 1;
3991:     nz = ai[i+1] - diag[i] - 1;

3993:     /* Demote accumulator from double to float */
3994:     CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
3995:     LOAD_PS(tmps,XMM7);

3997:     while (nz--) {
3998:       PREFETCH_NTA(&v[16]);
3999:       idx = 4*(*vi++);

4001:       /* Demote solution (so far) from double to float */
4002:       CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);

4004:       /* 4x4 Matrix-Vector Product with negative accumulation: */
4005:       SSE_INLINE_BEGIN_2(tmpx,v)
4006:       SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

4008:       /* First Column */
4009:       SSE_COPY_PS(XMM0,XMM6)
4010:       SSE_SHUFFLE(XMM0,XMM0,0x00)
4011:       SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4012:       SSE_SUB_PS(XMM7,XMM0)

4014:       /* Second Column */
4015:       SSE_COPY_PS(XMM1,XMM6)
4016:       SSE_SHUFFLE(XMM1,XMM1,0x55)
4017:       SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4018:       SSE_SUB_PS(XMM7,XMM1)

4020:       SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

4022:       /* Third Column */
4023:       SSE_COPY_PS(XMM2,XMM6)
4024:       SSE_SHUFFLE(XMM2,XMM2,0xAA)
4025:       SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4026:       SSE_SUB_PS(XMM7,XMM2)

4028:       /* Fourth Column */
4029:       SSE_COPY_PS(XMM3,XMM6)
4030:       SSE_SHUFFLE(XMM3,XMM3,0xFF)
4031:       SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4032:       SSE_SUB_PS(XMM7,XMM3)
4033:       SSE_INLINE_END_2
4034:       v += 16;
4035:     }
4036:     v    = aa + ai16;
4037:     ai16 = 16*diag[--i];
4038:     PREFETCH_NTA(aa+ai16+16);
4039:     /*
4040:        Scale the result by the diagonal 4x4 block,
4041:        which was inverted as part of the factorization
4042:     */
4043:     SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
4044:     /* First Column */
4045:     SSE_COPY_PS(XMM0,XMM7)
4046:     SSE_SHUFFLE(XMM0,XMM0,0x00)
4047:     SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

4049:     /* Second Column */
4050:     SSE_COPY_PS(XMM1,XMM7)
4051:     SSE_SHUFFLE(XMM1,XMM1,0x55)
4052:     SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4053:     SSE_ADD_PS(XMM0,XMM1)

4055:     SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

4057:     /* Third Column */
4058:     SSE_COPY_PS(XMM2,XMM7)
4059:     SSE_SHUFFLE(XMM2,XMM2,0xAA)
4060:     SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4061:     SSE_ADD_PS(XMM0,XMM2)

4063:     /* Fourth Column */
4064:     SSE_COPY_PS(XMM3,XMM7)
4065:     SSE_SHUFFLE(XMM3,XMM3,0xFF)
4066:     SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4067:     SSE_ADD_PS(XMM0,XMM3)

4069:     SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4070:     SSE_INLINE_END_3

4072:     /* Promote solution from float to double */
4073:     CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);

4075:     /* Apply reordering to t and stream into x.    */
4076:     /* This way, x doesn't pollute the cache.      */
4077:     /* Be careful with size: 2 doubles = 4 floats! */
4078:     idc = 4*(*c--);
4079:     SSE_INLINE_BEGIN_2((float*)&t[idt],(float*)&x[idc])
4080:     /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
4081:     SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
4082:     SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
4083:     /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
4084:     SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
4085:     SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
4086:     SSE_INLINE_END_2
4087:     v    = aa + ai16 + 16;
4088:     idt -= 4;
4089:   }

4091:   ISRestoreIndices(isrow,&rout);
4092:   ISRestoreIndices(iscol,&cout);
4093:   VecRestoreArray(bb,&b);
4094:   VecRestoreArray(xx,&x);
4095:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
4096:   SSE_SCOPE_END;
4097:   return(0);
4098: }

4100: #endif


4103: /*
4104:       Special case where the matrix was ILU(0) factored in the natural
4105:    ordering. This eliminates the need for the column and row permutation.
4106: */
4109: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4110: {
4111:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
4112:   PetscInt          n  =a->mbs;
4113:   const PetscInt    *ai=a->i,*aj=a->j;
4114:   PetscErrorCode    ierr;
4115:   const PetscInt    *diag = a->diag;
4116:   const MatScalar   *aa   =a->a;
4117:   PetscScalar       *x;
4118:   const PetscScalar *b;

4121:   VecGetArrayRead(bb,&b);
4122:   VecGetArray(xx,&x);

4124: #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
4125:   {
4126:     static PetscScalar w[2000]; /* very BAD need to fix */
4127:     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
4128:   }
4129: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
4130:   {
4131:     static PetscScalar w[2000]; /* very BAD need to fix */
4132:     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
4133:   }
4134: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
4135:   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
4136: #else
4137:   {
4138:     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
4139:     const MatScalar *v;
4140:     PetscInt        jdx,idt,idx,nz,i,ai16;
4141:     const PetscInt  *vi;

4143:     /* forward solve the lower triangular */
4144:     idx  = 0;
4145:     x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
4146:     for (i=1; i<n; i++) {
4147:       v    =  aa      + 16*ai[i];
4148:       vi   =  aj      + ai[i];
4149:       nz   =  diag[i] - ai[i];
4150:       idx +=  4;
4151:       s1   =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4152:       while (nz--) {
4153:         jdx = 4*(*vi++);
4154:         x1  = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
4155:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4156:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4157:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4158:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4159:         v  += 16;
4160:       }
4161:       x[idx]   = s1;
4162:       x[1+idx] = s2;
4163:       x[2+idx] = s3;
4164:       x[3+idx] = s4;
4165:     }
4166:     /* backward solve the upper triangular */
4167:     idt = 4*(n-1);
4168:     for (i=n-1; i>=0; i--) {
4169:       ai16 = 16*diag[i];
4170:       v    = aa + ai16 + 16;
4171:       vi   = aj + diag[i] + 1;
4172:       nz   = ai[i+1] - diag[i] - 1;
4173:       s1   = x[idt];  s2 = x[1+idt];
4174:       s3   = x[2+idt];s4 = x[3+idt];
4175:       while (nz--) {
4176:         idx = 4*(*vi++);
4177:         x1  = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
4178:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
4179:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
4180:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
4181:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
4182:         v  += 16;
4183:       }
4184:       v        = aa + ai16;
4185:       x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
4186:       x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
4187:       x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4188:       x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4189:       idt     -= 4;
4190:     }
4191:   }
4192: #endif

4194:   VecRestoreArrayRead(bb,&b);
4195:   VecRestoreArray(xx,&x);
4196:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
4197:   return(0);
4198: }

4202: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
4203: {
4204:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
4205:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4206:   PetscInt          i,k,nz,idx,jdx,idt;
4207:   PetscErrorCode    ierr;
4208:   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
4209:   const MatScalar   *aa=a->a,*v;
4210:   PetscScalar       *x;
4211:   const PetscScalar *b;
4212:   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4;

4215:   VecGetArrayRead(bb,&b);
4216:   VecGetArray(xx,&x);
4217:   /* forward solve the lower triangular */
4218:   idx  = 0;
4219:   x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
4220:   for (i=1; i<n; i++) {
4221:     v   = aa + bs2*ai[i];
4222:     vi  = aj + ai[i];
4223:     nz  = ai[i+1] - ai[i];
4224:     idx = bs*i;
4225:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4226:     for (k=0; k<nz; k++) {
4227:       jdx = bs*vi[k];
4228:       x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
4229:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4230:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4231:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4232:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;

4234:       v +=  bs2;
4235:     }

4237:     x[idx]   = s1;
4238:     x[1+idx] = s2;
4239:     x[2+idx] = s3;
4240:     x[3+idx] = s4;
4241:   }

4243:   /* backward solve the upper triangular */
4244:   for (i=n-1; i>=0; i--) {
4245:     v   = aa + bs2*(adiag[i+1]+1);
4246:     vi  = aj + adiag[i+1]+1;
4247:     nz  = adiag[i] - adiag[i+1]-1;
4248:     idt = bs*i;
4249:     s1  = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];

4251:     for (k=0; k<nz; k++) {
4252:       idx = bs*vi[k];
4253:       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
4254:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4255:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4256:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4257:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;

4259:       v +=  bs2;
4260:     }
4261:     /* x = inv_diagonal*x */
4262:     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
4263:     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
4264:     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4265:     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;

4267:   }

4269:   VecRestoreArrayRead(bb,&b);
4270:   VecRestoreArray(xx,&x);
4271:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
4272:   return(0);
4273: }

4277: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
4278: {
4279:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
4280:   const PetscInt    n  =a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;
4281:   PetscErrorCode    ierr;
4282:   const MatScalar   *aa=a->a;
4283:   const PetscScalar *b;
4284:   PetscScalar       *x;

4287:   VecGetArrayRead(bb,&b);
4288:   VecGetArray(xx,&x);

4290:   {
4291:     MatScalar       s1,s2,s3,s4,x1,x2,x3,x4;
4292:     const MatScalar *v;
4293:     MatScalar       *t=(MatScalar*)x;
4294:     PetscInt        jdx,idt,idx,nz,i,ai16;
4295:     const PetscInt  *vi;

4297:     /* forward solve the lower triangular */
4298:     idx  = 0;
4299:     t[0] = (MatScalar)b[0];
4300:     t[1] = (MatScalar)b[1];
4301:     t[2] = (MatScalar)b[2];
4302:     t[3] = (MatScalar)b[3];
4303:     for (i=1; i<n; i++) {
4304:       v    =  aa      + 16*ai[i];
4305:       vi   =  aj      + ai[i];
4306:       nz   =  diag[i] - ai[i];
4307:       idx +=  4;
4308:       s1   = (MatScalar)b[idx];
4309:       s2   = (MatScalar)b[1+idx];
4310:       s3   = (MatScalar)b[2+idx];
4311:       s4   = (MatScalar)b[3+idx];
4312:       while (nz--) {
4313:         jdx = 4*(*vi++);
4314:         x1  = t[jdx];
4315:         x2  = t[1+jdx];
4316:         x3  = t[2+jdx];
4317:         x4  = t[3+jdx];
4318:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4319:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4320:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4321:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4322:         v  += 16;
4323:       }
4324:       t[idx]   = s1;
4325:       t[1+idx] = s2;
4326:       t[2+idx] = s3;
4327:       t[3+idx] = s4;
4328:     }
4329:     /* backward solve the upper triangular */
4330:     idt = 4*(n-1);
4331:     for (i=n-1; i>=0; i--) {
4332:       ai16 = 16*diag[i];
4333:       v    = aa + ai16 + 16;
4334:       vi   = aj + diag[i] + 1;
4335:       nz   = ai[i+1] - diag[i] - 1;
4336:       s1   = t[idt];
4337:       s2   = t[1+idt];
4338:       s3   = t[2+idt];
4339:       s4   = t[3+idt];
4340:       while (nz--) {
4341:         idx = 4*(*vi++);
4342:         x1  = (MatScalar)x[idx];
4343:         x2  = (MatScalar)x[1+idx];
4344:         x3  = (MatScalar)x[2+idx];
4345:         x4  = (MatScalar)x[3+idx];
4346:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4347:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4348:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4349:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4350:         v  += 16;
4351:       }
4352:       v        = aa + ai16;
4353:       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
4354:       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
4355:       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
4356:       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
4357:       idt     -= 4;
4358:     }
4359:   }

4361:   VecRestoreArrayRead(bb,&b);
4362:   VecRestoreArray(xx,&x);
4363:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
4364:   return(0);
4365: }

4367: #if defined(PETSC_HAVE_SSE)

4369: #include PETSC_HAVE_SSE
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj - Solves A x = b with
   the in-place factors stored in A (4x4 blocks, no permutation) using the
   SSE macro kernels, with the block column indices read through an
   unsigned short pointer.

   Input Parameters:
     A  - factored matrix; A->data is read as a Mat_SeqBAIJ
     bb - right-hand side

   Output Parameter:
     xx - solution

   The array of xx is also viewed through a MatScalar pointer t.  Both sweeps
   store their results through t; a final loop then promotes t to x block by
   block, from the last block to the first.

   NOTE(review): a->j is reinterpreted as unsigned short and a->i/a->diag are
   used as int -- presumably the factorization stored the indices in that
   form and PetscInt is int in SSE builds; confirm against the caller.
   NOTE(review): the return codes of VecGetArray/VecRestoreArray/
   PetscLogFlops are not checked here.
*/
4372: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
4373: {
4374:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
4375:   unsigned short *aj=(unsigned short*)a->j;
4377:   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4378:   MatScalar      *aa=a->a;
4379:   PetscScalar    *x,*b;

4382:   SSE_SCOPE_BEGIN;
4383:   /*
4384:      Note: This code currently uses demotion of double
4385:      to float when performing the mixed-mode computation.
4386:      This may not be numerically reasonable for all applications.
4387:   */
4388:   PREFETCH_NTA(aa+16*ai[1]);

4390:   VecGetArray(bb,&b);
4391:   VecGetArray(xx,&x);
4392:   {
4393:     /* x will first be computed in single precision then promoted inplace to double */
4394:     MatScalar      *v,*t=(MatScalar*)x;
4395:     int            nz,i,idt,ai16;
4396:     unsigned int   jdx,idx;
4397:     unsigned short *vi;
4398:     /* Forward solve the lower triangular factor. */

4400:     /* First block is the identity. */
4401:     idx = 0;
4402:     CONVERT_DOUBLE4_FLOAT4(t,b);
4403:     v =  aa + 16*((unsigned int)ai[1]);

      /* The loop header has no increment: i is advanced by the ++i at the
         bottom of the body, which also fetches the start of the next row. */
4405:     for (i=1; i<n; ) {
4406:       PREFETCH_NTA(&v[8]);
4407:       vi   =  aj      + ai[i];
4408:       nz   =  diag[i] - ai[i];
4409:       idx +=  4;

4411:       /* Demote RHS from double to float. */
4412:       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4413:       LOAD_PS(&t[idx],XMM7);

        /* XMM7 is the accumulator for block row i across the whole inner loop */
4415:       while (nz--) {
4416:         PREFETCH_NTA(&v[16]);
4417:         jdx = 4*((unsigned int)(*vi++));

4419:         /* 4x4 Matrix-Vector product with negative accumulation: */
4420:         SSE_INLINE_BEGIN_2(&t[jdx],v)
4421:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

4423:         /* First Column */
4424:         SSE_COPY_PS(XMM0,XMM6)
4425:         SSE_SHUFFLE(XMM0,XMM0,0x00)
4426:         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4427:         SSE_SUB_PS(XMM7,XMM0)

4429:         /* Second Column */
4430:         SSE_COPY_PS(XMM1,XMM6)
4431:         SSE_SHUFFLE(XMM1,XMM1,0x55)
4432:         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4433:         SSE_SUB_PS(XMM7,XMM1)

4435:         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

4437:         /* Third Column */
4438:         SSE_COPY_PS(XMM2,XMM6)
4439:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4440:         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4441:         SSE_SUB_PS(XMM7,XMM2)

4443:         /* Fourth Column */
4444:         SSE_COPY_PS(XMM3,XMM6)
4445:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4446:         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4447:         SSE_SUB_PS(XMM7,XMM3)
4448:         SSE_INLINE_END_2

4450:         v += 16;
4451:       }
4452:       v =  aa + 16*ai[++i];
4453:       PREFETCH_NTA(v);
4454:       STORE_PS(&t[idx],XMM7);
4455:     }

4457:     /* Backward solve the upper triangular factor.*/

4459:     idt  = 4*(n-1);
4460:     ai16 = 16*diag[n-1];
4461:     v    = aa + ai16 + 16;
      /* As above, i is stepped inside the body (by the --i below). */
4462:     for (i=n-1; i>=0; ) {
4463:       PREFETCH_NTA(&v[8]);
4464:       vi = aj + diag[i] + 1;
4465:       nz = ai[i+1] - diag[i] - 1;

4467:       LOAD_PS(&t[idt],XMM7);

4469:       while (nz--) {
4470:         PREFETCH_NTA(&v[16]);
4471:         idx = 4*((unsigned int)(*vi++));

4473:         /* 4x4 Matrix-Vector Product with negative accumulation: */
4474:         SSE_INLINE_BEGIN_2(&t[idx],v)
4475:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

4477:         /* First Column */
4478:         SSE_COPY_PS(XMM0,XMM6)
4479:         SSE_SHUFFLE(XMM0,XMM0,0x00)
4480:         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4481:         SSE_SUB_PS(XMM7,XMM0)

4483:         /* Second Column */
4484:         SSE_COPY_PS(XMM1,XMM6)
4485:         SSE_SHUFFLE(XMM1,XMM1,0x55)
4486:         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4487:         SSE_SUB_PS(XMM7,XMM1)

4489:         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

4491:         /* Third Column */
4492:         SSE_COPY_PS(XMM2,XMM6)
4493:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4494:         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4495:         SSE_SUB_PS(XMM7,XMM2)

4497:         /* Fourth Column */
4498:         SSE_COPY_PS(XMM3,XMM6)
4499:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4500:         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4501:         SSE_SUB_PS(XMM7,XMM3)
4502:         SSE_INLINE_END_2
4503:         v += 16;
4504:       }
        /* v is reset to the diagonal block of the current row before ai16
           is overwritten with the offset of the next (lower-numbered) row. */
4505:       v    = aa + ai16;
        /* NOTE(review): when i is 0 the --i below yields -1 and diag[-1] is
           read.  The value only feeds prefetch addresses and the final,
           unused v, but the read itself looks out of bounds -- confirm. */
4506:       ai16 = 16*diag[--i];
4507:       PREFETCH_NTA(aa+ai16+16);
4508:       /*
4509:          Scale the result by the diagonal 4x4 block,
4510:          which was inverted as part of the factorization
4511:       */
4512:       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4513:       /* First Column */
4514:       SSE_COPY_PS(XMM0,XMM7)
4515:       SSE_SHUFFLE(XMM0,XMM0,0x00)
4516:       SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

4518:       /* Second Column */
4519:       SSE_COPY_PS(XMM1,XMM7)
4520:       SSE_SHUFFLE(XMM1,XMM1,0x55)
4521:       SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4522:       SSE_ADD_PS(XMM0,XMM1)

4524:       SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

4526:       /* Third Column */
4527:       SSE_COPY_PS(XMM2,XMM7)
4528:       SSE_SHUFFLE(XMM2,XMM2,0xAA)
4529:       SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4530:       SSE_ADD_PS(XMM0,XMM2)

4532:       /* Fourth Column */
4533:       SSE_COPY_PS(XMM3,XMM7)
4534:       SSE_SHUFFLE(XMM3,XMM3,0xFF)
4535:       SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4536:       SSE_ADD_PS(XMM0,XMM3)

4538:       SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4539:       SSE_INLINE_END_3

4541:       v    = aa + ai16 + 16;
4542:       idt -= 4;
4543:     }

4545:     /* Convert t from single precision back to double precision (inplace)*/
4546:     idt = 4*(n-1);
4547:     for (i=n-1; i>=0; i--) {
4548:       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4549:       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
        /* Entries are copied from index 3 down to 0; for block 0 x and t
           start at the same address, so ascending order would overwrite
           t values that have not been read yet. */
4550:       PetscScalar *xtemp=&x[idt];
4551:       MatScalar   *ttemp=&t[idt];
4552:       xtemp[3] = (PetscScalar)ttemp[3];
4553:       xtemp[2] = (PetscScalar)ttemp[2];
4554:       xtemp[1] = (PetscScalar)ttemp[1];
4555:       xtemp[0] = (PetscScalar)ttemp[0];
4556:       idt     -= 4;
4557:     }

4559:   } /* End of artificial scope. */
4560:   VecRestoreArray(bb,&b);
4561:   VecRestoreArray(xx,&x);
4562:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
4563:   SSE_SCOPE_END;
4564:   return(0);
4565: }

/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion - Solves A x = b with the
   in-place factors stored in A (4x4 blocks, no permutation) using the SSE
   macro kernels, with the block column indices read as int.

   Input Parameters:
     A  - factored matrix; A->data is read as a Mat_SeqBAIJ
     bb - right-hand side

   Output Parameter:
     xx - solution

   The array of xx is also viewed through a MatScalar pointer t.  Both sweeps
   store their results through t; a final loop then promotes t to x block by
   block, from the last block to the first.

   NOTE(review): a->i, a->j and a->diag are used as int -- presumably
   PetscInt is int in SSE builds; confirm.
   NOTE(review): the return codes of VecGetArray/VecRestoreArray/
   PetscLogFlops are not checked here.
*/
4569: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
4570: {
4571:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
4572:   int            *aj=a->j;
4574:   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4575:   MatScalar      *aa=a->a;
4576:   PetscScalar    *x,*b;

4579:   SSE_SCOPE_BEGIN;
4580:   /*
4581:      Note: This code currently uses demotion of double
4582:      to float when performing the mixed-mode computation.
4583:      This may not be numerically reasonable for all applications.
4584:   */
4585:   PREFETCH_NTA(aa+16*ai[1]);

4587:   VecGetArray(bb,&b);
4588:   VecGetArray(xx,&x);
4589:   {
4590:     /* x will first be computed in single precision then promoted inplace to double */
4591:     MatScalar *v,*t=(MatScalar*)x;
4592:     int       nz,i,idt,ai16;
4593:     int       jdx,idx;
4594:     int       *vi;
4595:     /* Forward solve the lower triangular factor. */

4597:     /* First block is the identity. */
4598:     idx = 0;
4599:     CONVERT_DOUBLE4_FLOAT4(t,b);
4600:     v =  aa + 16*ai[1];

      /* The loop header has no increment: i is advanced by the ++i at the
         bottom of the body, which also fetches the start of the next row. */
4602:     for (i=1; i<n; ) {
4603:       PREFETCH_NTA(&v[8]);
4604:       vi   =  aj      + ai[i];
4605:       nz   =  diag[i] - ai[i];
4606:       idx +=  4;

4608:       /* Demote RHS from double to float. */
4609:       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4610:       LOAD_PS(&t[idx],XMM7);

        /* XMM7 is the accumulator for block row i across the whole inner loop */
4612:       while (nz--) {
4613:         PREFETCH_NTA(&v[16]);
4614:         jdx = 4*(*vi++);
4615: /*          jdx = *vi++; */

4617:         /* 4x4 Matrix-Vector product with negative accumulation: */
4618:         SSE_INLINE_BEGIN_2(&t[jdx],v)
4619:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

4621:         /* First Column */
4622:         SSE_COPY_PS(XMM0,XMM6)
4623:         SSE_SHUFFLE(XMM0,XMM0,0x00)
4624:         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4625:         SSE_SUB_PS(XMM7,XMM0)

4627:         /* Second Column */
4628:         SSE_COPY_PS(XMM1,XMM6)
4629:         SSE_SHUFFLE(XMM1,XMM1,0x55)
4630:         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4631:         SSE_SUB_PS(XMM7,XMM1)

4633:         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

4635:         /* Third Column */
4636:         SSE_COPY_PS(XMM2,XMM6)
4637:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4638:         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4639:         SSE_SUB_PS(XMM7,XMM2)

4641:         /* Fourth Column */
4642:         SSE_COPY_PS(XMM3,XMM6)
4643:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4644:         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4645:         SSE_SUB_PS(XMM7,XMM3)
4646:         SSE_INLINE_END_2

4648:         v += 16;
4649:       }
4650:       v =  aa + 16*ai[++i];
4651:       PREFETCH_NTA(v);
4652:       STORE_PS(&t[idx],XMM7);
4653:     }

4655:     /* Backward solve the upper triangular factor.*/

4657:     idt  = 4*(n-1);
4658:     ai16 = 16*diag[n-1];
4659:     v    = aa + ai16 + 16;
      /* As above, i is stepped inside the body (by the --i below). */
4660:     for (i=n-1; i>=0; ) {
4661:       PREFETCH_NTA(&v[8]);
4662:       vi = aj + diag[i] + 1;
4663:       nz = ai[i+1] - diag[i] - 1;

4665:       LOAD_PS(&t[idt],XMM7);

4667:       while (nz--) {
4668:         PREFETCH_NTA(&v[16]);
4669:         idx = 4*(*vi++);
4670: /*          idx = *vi++; */

4672:         /* 4x4 Matrix-Vector Product with negative accumulation: */
4673:         SSE_INLINE_BEGIN_2(&t[idx],v)
4674:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

4676:         /* First Column */
4677:         SSE_COPY_PS(XMM0,XMM6)
4678:         SSE_SHUFFLE(XMM0,XMM0,0x00)
4679:         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4680:         SSE_SUB_PS(XMM7,XMM0)

4682:         /* Second Column */
4683:         SSE_COPY_PS(XMM1,XMM6)
4684:         SSE_SHUFFLE(XMM1,XMM1,0x55)
4685:         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4686:         SSE_SUB_PS(XMM7,XMM1)

4688:         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

4690:         /* Third Column */
4691:         SSE_COPY_PS(XMM2,XMM6)
4692:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4693:         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4694:         SSE_SUB_PS(XMM7,XMM2)

4696:         /* Fourth Column */
4697:         SSE_COPY_PS(XMM3,XMM6)
4698:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4699:         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4700:         SSE_SUB_PS(XMM7,XMM3)
4701:         SSE_INLINE_END_2
4702:         v += 16;
4703:       }
        /* v is reset to the diagonal block of the current row before ai16
           is overwritten with the offset of the next (lower-numbered) row. */
4704:       v    = aa + ai16;
        /* NOTE(review): when i is 0 the --i below yields -1 and diag[-1] is
           read.  The value only feeds prefetch addresses and the final,
           unused v, but the read itself looks out of bounds -- confirm. */
4705:       ai16 = 16*diag[--i];
4706:       PREFETCH_NTA(aa+ai16+16);
4707:       /*
4708:          Scale the result by the diagonal 4x4 block,
4709:          which was inverted as part of the factorization
4710:       */
4711:       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4712:       /* First Column */
4713:       SSE_COPY_PS(XMM0,XMM7)
4714:       SSE_SHUFFLE(XMM0,XMM0,0x00)
4715:       SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

4717:       /* Second Column */
4718:       SSE_COPY_PS(XMM1,XMM7)
4719:       SSE_SHUFFLE(XMM1,XMM1,0x55)
4720:       SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4721:       SSE_ADD_PS(XMM0,XMM1)

4723:       SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

4725:       /* Third Column */
4726:       SSE_COPY_PS(XMM2,XMM7)
4727:       SSE_SHUFFLE(XMM2,XMM2,0xAA)
4728:       SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4729:       SSE_ADD_PS(XMM0,XMM2)

4731:       /* Fourth Column */
4732:       SSE_COPY_PS(XMM3,XMM7)
4733:       SSE_SHUFFLE(XMM3,XMM3,0xFF)
4734:       SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4735:       SSE_ADD_PS(XMM0,XMM3)

4737:       SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4738:       SSE_INLINE_END_3

4740:       v    = aa + ai16 + 16;
4741:       idt -= 4;
4742:     }

4744:     /* Convert t from single precision back to double precision (inplace)*/
4745:     idt = 4*(n-1);
4746:     for (i=n-1; i>=0; i--) {
4747:       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4748:       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
        /* Entries are copied from index 3 down to 0; for block 0 x and t
           start at the same address, so ascending order would overwrite
           t values that have not been read yet. */
4749:       PetscScalar *xtemp=&x[idt];
4750:       MatScalar   *ttemp=&t[idt];
4751:       xtemp[3] = (PetscScalar)ttemp[3];
4752:       xtemp[2] = (PetscScalar)ttemp[2];
4753:       xtemp[1] = (PetscScalar)ttemp[1];
4754:       xtemp[0] = (PetscScalar)ttemp[0];
4755:       idt     -= 4;
4756:     }

4758:   } /* End of artificial scope. */
4759:   VecRestoreArray(bb,&b);
4760:   VecRestoreArray(xx,&x);
4761:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
4762:   SSE_SCOPE_END;
4763:   return(0);
4764: }

4766: #endif

4770: PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
4771: {
4772:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
4773:   IS                iscol=a->col,isrow=a->row;
4774:   PetscErrorCode    ierr;
4775:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
4776:   PetscInt          i,nz,idx,idt,idc;
4777:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4778:   const MatScalar   *aa=a->a,*v;
4779:   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4780:   const PetscScalar *b;

4783:   VecGetArrayRead(bb,&b);
4784:   VecGetArray(xx,&x);
4785:   t    = a->solve_work;

4787:   ISGetIndices(isrow,&rout); r = rout;
4788:   ISGetIndices(iscol,&cout); c = cout + (n-1);

4790:   /* forward solve the lower triangular */
4791:   idx  = 3*(*r++);
4792:   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4793:   for (i=1; i<n; i++) {
4794:     v   = aa + 9*ai[i];
4795:     vi  = aj + ai[i];
4796:     nz  = diag[i] - ai[i];
4797:     idx = 3*(*r++);
4798:     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4799:     while (nz--) {
4800:       idx = 3*(*vi++);
4801:       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4802:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4803:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4804:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4805:       v  += 9;
4806:     }
4807:     idx    = 3*i;
4808:     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4809:   }
4810:   /* backward solve the upper triangular */
4811:   for (i=n-1; i>=0; i--) {
4812:     v   = aa + 9*diag[i] + 9;
4813:     vi  = aj + diag[i] + 1;
4814:     nz  = ai[i+1] - diag[i] - 1;
4815:     idt = 3*i;
4816:     s1  = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4817:     while (nz--) {
4818:       idx = 3*(*vi++);
4819:       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4820:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4821:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4822:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4823:       v  += 9;
4824:     }
4825:     idc      = 3*(*c--);
4826:     v        = aa + 9*diag[i];
4827:     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4828:     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4829:     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4830:   }
4831:   ISRestoreIndices(isrow,&rout);
4832:   ISRestoreIndices(iscol,&cout);
4833:   VecRestoreArrayRead(bb,&b);
4834:   VecRestoreArray(xx,&x);
4835:   PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
4836:   return(0);
4837: }

4841: PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
4842: {
4843:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
4844:   IS                iscol=a->col,isrow=a->row;
4845:   PetscErrorCode    ierr;
4846:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4847:   PetscInt          i,nz,idx,idt,idc,m;
4848:   const PetscInt    *r,*c,*rout,*cout;
4849:   const MatScalar   *aa=a->a,*v;
4850:   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4851:   const PetscScalar *b;

4854:   VecGetArrayRead(bb,&b);
4855:   VecGetArray(xx,&x);
4856:   t    = a->solve_work;

4858:   ISGetIndices(isrow,&rout); r = rout;
4859:   ISGetIndices(iscol,&cout); c = cout;

4861:   /* forward solve the lower triangular */
4862:   idx  = 3*r[0];
4863:   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4864:   for (i=1; i<n; i++) {
4865:     v   = aa + 9*ai[i];
4866:     vi  = aj + ai[i];
4867:     nz  = ai[i+1] - ai[i];
4868:     idx = 3*r[i];
4869:     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4870:     for (m=0; m<nz; m++) {
4871:       idx = 3*vi[m];
4872:       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4873:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4874:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4875:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4876:       v  += 9;
4877:     }
4878:     idx    = 3*i;
4879:     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4880:   }
4881:   /* backward solve the upper triangular */
4882:   for (i=n-1; i>=0; i--) {
4883:     v   = aa + 9*(adiag[i+1]+1);
4884:     vi  = aj + adiag[i+1]+1;
4885:     nz  = adiag[i] - adiag[i+1] - 1;
4886:     idt = 3*i;
4887:     s1  = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4888:     for (m=0; m<nz; m++) {
4889:       idx = 3*vi[m];
4890:       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4891:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4892:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4893:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4894:       v  += 9;
4895:     }
4896:     idc      = 3*c[i];
4897:     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4898:     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4899:     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4900:   }
4901:   ISRestoreIndices(isrow,&rout);
4902:   ISRestoreIndices(iscol,&cout);
4903:   VecRestoreArrayRead(bb,&b);
4904:   VecRestoreArray(xx,&x);
4905:   PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
4906:   return(0);
4907: }

4909: /*
4910:       Special case where the matrix was ILU(0) factored in the natural
4911:    ordering. This eliminates the need for the column and row permutation.
4912: */
4915: PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4916: {
4917:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
4918:   const PetscInt    n  =a->mbs,*ai=a->i,*aj=a->j;
4919:   PetscErrorCode    ierr;
4920:   const PetscInt    *diag = a->diag,*vi;
4921:   const MatScalar   *aa   =a->a,*v;
4922:   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4923:   const PetscScalar *b;
4924:   PetscInt          jdx,idt,idx,nz,i;

4927:   VecGetArrayRead(bb,&b);
4928:   VecGetArray(xx,&x);

4930:   /* forward solve the lower triangular */
4931:   idx  = 0;
4932:   x[0] = b[0]; x[1] = b[1]; x[2] = b[2];
4933:   for (i=1; i<n; i++) {
4934:     v    =  aa      + 9*ai[i];
4935:     vi   =  aj      + ai[i];
4936:     nz   =  diag[i] - ai[i];
4937:     idx +=  3;
4938:     s1   =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
4939:     while (nz--) {
4940:       jdx = 3*(*vi++);
4941:       x1  = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4942:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4943:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4944:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4945:       v  += 9;
4946:     }
4947:     x[idx]   = s1;
4948:     x[1+idx] = s2;
4949:     x[2+idx] = s3;
4950:   }
4951:   /* backward solve the upper triangular */
4952:   for (i=n-1; i>=0; i--) {
4953:     v   = aa + 9*diag[i] + 9;
4954:     vi  = aj + diag[i] + 1;
4955:     nz  = ai[i+1] - diag[i] - 1;
4956:     idt = 3*i;
4957:     s1  = x[idt];  s2 = x[1+idt];
4958:     s3  = x[2+idt];
4959:     while (nz--) {
4960:       idx = 3*(*vi++);
4961:       x1  = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
4962:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4963:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4964:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4965:       v  += 9;
4966:     }
4967:     v        = aa +  9*diag[i];
4968:     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4969:     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4970:     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4971:   }

4973:   VecRestoreArrayRead(bb,&b);
4974:   VecRestoreArray(xx,&x);
4975:   PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
4976:   return(0);
4977: }

4981: PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
4982: {
4983:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
4984:   const PetscInt    n  =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4985:   PetscErrorCode    ierr;
4986:   PetscInt          i,k,nz,idx,jdx,idt;
4987:   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
4988:   const MatScalar   *aa=a->a,*v;
4989:   PetscScalar       *x;
4990:   const PetscScalar *b;
4991:   PetscScalar       s1,s2,s3,x1,x2,x3;

4994:   VecGetArrayRead(bb,&b);
4995:   VecGetArray(xx,&x);
4996:   /* forward solve the lower triangular */
4997:   idx  = 0;
4998:   x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4999:   for (i=1; i<n; i++) {
5000:     v   = aa + bs2*ai[i];
5001:     vi  = aj + ai[i];
5002:     nz  = ai[i+1] - ai[i];
5003:     idx = bs*i;
5004:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];
5005:     for (k=0; k<nz; k++) {
5006:       jdx = bs*vi[k];
5007:       x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
5008:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
5009:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
5010:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;

5012:       v +=  bs2;
5013:     }

5015:     x[idx]   = s1;
5016:     x[1+idx] = s2;
5017:     x[2+idx] = s3;
5018:   }

5020:   /* backward solve the upper triangular */
5021:   for (i=n-1; i>=0; i--) {
5022:     v   = aa + bs2*(adiag[i+1]+1);
5023:     vi  = aj + adiag[i+1]+1;
5024:     nz  = adiag[i] - adiag[i+1]-1;
5025:     idt = bs*i;
5026:     s1  = x[idt];  s2 = x[1+idt];s3 = x[2+idt];

5028:     for (k=0; k<nz; k++) {
5029:       idx = bs*vi[k];
5030:       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
5031:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
5032:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
5033:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;

5035:       v +=  bs2;
5036:     }
5037:     /* x = inv_diagonal*x */
5038:     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
5039:     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
5040:     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;

5042:   }

5044:   VecRestoreArrayRead(bb,&b);
5045:   VecRestoreArray(xx,&x);
5046:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
5047:   return(0);
5048: }

5052: PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
5053: {
5054:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
5055:   IS                iscol=a->col,isrow=a->row;
5056:   PetscErrorCode    ierr;
5057:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5058:   PetscInt          i,nz,idx,idt,idc;
5059:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
5060:   const MatScalar   *aa=a->a,*v;
5061:   PetscScalar       *x,s1,s2,x1,x2,*t;
5062:   const PetscScalar *b;

5065:   VecGetArrayRead(bb,&b);
5066:   VecGetArray(xx,&x);
5067:   t    = a->solve_work;

5069:   ISGetIndices(isrow,&rout); r = rout;
5070:   ISGetIndices(iscol,&cout); c = cout + (n-1);

5072:   /* forward solve the lower triangular */
5073:   idx  = 2*(*r++);
5074:   t[0] = b[idx]; t[1] = b[1+idx];
5075:   for (i=1; i<n; i++) {
5076:     v   = aa + 4*ai[i];
5077:     vi  = aj + ai[i];
5078:     nz  = diag[i] - ai[i];
5079:     idx = 2*(*r++);
5080:     s1  = b[idx]; s2 = b[1+idx];
5081:     while (nz--) {
5082:       idx = 2*(*vi++);
5083:       x1  = t[idx]; x2 = t[1+idx];
5084:       s1 -= v[0]*x1 + v[2]*x2;
5085:       s2 -= v[1]*x1 + v[3]*x2;
5086:       v  += 4;
5087:     }
5088:     idx    = 2*i;
5089:     t[idx] = s1; t[1+idx] = s2;
5090:   }
5091:   /* backward solve the upper triangular */
5092:   for (i=n-1; i>=0; i--) {
5093:     v   = aa + 4*diag[i] + 4;
5094:     vi  = aj + diag[i] + 1;
5095:     nz  = ai[i+1] - diag[i] - 1;
5096:     idt = 2*i;
5097:     s1  = t[idt]; s2 = t[1+idt];
5098:     while (nz--) {
5099:       idx = 2*(*vi++);
5100:       x1  = t[idx]; x2 = t[1+idx];
5101:       s1 -= v[0]*x1 + v[2]*x2;
5102:       s2 -= v[1]*x1 + v[3]*x2;
5103:       v  += 4;
5104:     }
5105:     idc      = 2*(*c--);
5106:     v        = aa + 4*diag[i];
5107:     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
5108:     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
5109:   }
5110:   ISRestoreIndices(isrow,&rout);
5111:   ISRestoreIndices(iscol,&cout);
5112:   VecRestoreArrayRead(bb,&b);
5113:   VecRestoreArray(xx,&x);
5114:   PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
5115:   return(0);
5116: }

5120: PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
5121: {
5122:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
5123:   IS                iscol=a->col,isrow=a->row;
5124:   PetscErrorCode    ierr;
5125:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5126:   PetscInt          i,nz,idx,jdx,idt,idc,m;
5127:   const PetscInt    *r,*c,*rout,*cout;
5128:   const MatScalar   *aa=a->a,*v;
5129:   PetscScalar       *x,s1,s2,x1,x2,*t;
5130:   const PetscScalar *b;

5133:   VecGetArrayRead(bb,&b);
5134:   VecGetArray(xx,&x);
5135:   t    = a->solve_work;

5137:   ISGetIndices(isrow,&rout); r = rout;
5138:   ISGetIndices(iscol,&cout); c = cout;

5140:   /* forward solve the lower triangular */
5141:   idx  = 2*r[0];
5142:   t[0] = b[idx]; t[1] = b[1+idx];
5143:   for (i=1; i<n; i++) {
5144:     v   = aa + 4*ai[i];
5145:     vi  = aj + ai[i];
5146:     nz  = ai[i+1] - ai[i];
5147:     idx = 2*r[i];
5148:     s1  = b[idx]; s2 = b[1+idx];
5149:     for (m=0; m<nz; m++) {
5150:       jdx = 2*vi[m];
5151:       x1  = t[jdx]; x2 = t[1+jdx];
5152:       s1 -= v[0]*x1 + v[2]*x2;
5153:       s2 -= v[1]*x1 + v[3]*x2;
5154:       v  += 4;
5155:     }
5156:     idx    = 2*i;
5157:     t[idx] = s1; t[1+idx] = s2;
5158:   }
5159:   /* backward solve the upper triangular */
5160:   for (i=n-1; i>=0; i--) {
5161:     v   = aa + 4*(adiag[i+1]+1);
5162:     vi  = aj + adiag[i+1]+1;
5163:     nz  = adiag[i] - adiag[i+1] - 1;
5164:     idt = 2*i;
5165:     s1  = t[idt]; s2 = t[1+idt];
5166:     for (m=0; m<nz; m++) {
5167:       idx = 2*vi[m];
5168:       x1  = t[idx]; x2 = t[1+idx];
5169:       s1 -= v[0]*x1 + v[2]*x2;
5170:       s2 -= v[1]*x1 + v[3]*x2;
5171:       v  += 4;
5172:     }
5173:     idc      = 2*c[i];
5174:     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
5175:     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
5176:   }
5177:   ISRestoreIndices(isrow,&rout);
5178:   ISRestoreIndices(iscol,&cout);
5179:   VecRestoreArrayRead(bb,&b);
5180:   VecRestoreArray(xx,&x);
5181:   PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
5182:   return(0);
5183: }

5185: /*
5186:       Special case where the matrix was ILU(0) factored in the natural
5187:    ordering. This eliminates the need for the column and row permutation.
5188: */
5191: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
5192: {
5193:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
5194:   const PetscInt    n  =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5195:   PetscErrorCode    ierr;
5196:   const MatScalar   *aa=a->a,*v;
5197:   PetscScalar       *x,s1,s2,x1,x2;
5198:   const PetscScalar *b;
5199:   PetscInt          jdx,idt,idx,nz,i;

5202:   VecGetArrayRead(bb,&b);
5203:   VecGetArray(xx,&x);

5205:   /* forward solve the lower triangular */
5206:   idx  = 0;
5207:   x[0] = b[0]; x[1] = b[1];
5208:   for (i=1; i<n; i++) {
5209:     v    =  aa      + 4*ai[i];
5210:     vi   =  aj      + ai[i];
5211:     nz   =  diag[i] - ai[i];
5212:     idx +=  2;
5213:     s1   =  b[idx];s2 = b[1+idx];
5214:     while (nz--) {
5215:       jdx = 2*(*vi++);
5216:       x1  = x[jdx];x2 = x[1+jdx];
5217:       s1 -= v[0]*x1 + v[2]*x2;
5218:       s2 -= v[1]*x1 + v[3]*x2;
5219:       v  += 4;
5220:     }
5221:     x[idx]   = s1;
5222:     x[1+idx] = s2;
5223:   }
5224:   /* backward solve the upper triangular */
5225:   for (i=n-1; i>=0; i--) {
5226:     v   = aa + 4*diag[i] + 4;
5227:     vi  = aj + diag[i] + 1;
5228:     nz  = ai[i+1] - diag[i] - 1;
5229:     idt = 2*i;
5230:     s1  = x[idt];  s2 = x[1+idt];
5231:     while (nz--) {
5232:       idx = 2*(*vi++);
5233:       x1  = x[idx];   x2 = x[1+idx];
5234:       s1 -= v[0]*x1 + v[2]*x2;
5235:       s2 -= v[1]*x1 + v[3]*x2;
5236:       v  += 4;
5237:     }
5238:     v        = aa +  4*diag[i];
5239:     x[idt]   = v[0]*s1 + v[2]*s2;
5240:     x[1+idt] = v[1]*s1 + v[3]*s2;
5241:   }

5243:   VecRestoreArrayRead(bb,&b);
5244:   VecRestoreArray(xx,&x);
5245:   PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
5246:   return(0);
5247: }

5251: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
5252: {
5253:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
5254:   const PetscInt    n  = a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5255:   PetscInt          i,k,nz,idx,idt,jdx;
5256:   PetscErrorCode    ierr;
5257:   const MatScalar   *aa=a->a,*v;
5258:   PetscScalar       *x,s1,s2,x1,x2;
5259:   const PetscScalar *b;

5262:   VecGetArrayRead(bb,&b);
5263:   VecGetArray(xx,&x);
5264:   /* forward solve the lower triangular */
5265:   idx  = 0;
5266:   x[0] = b[idx]; x[1] = b[1+idx];
5267:   for (i=1; i<n; i++) {
5268:     v   = aa + 4*ai[i];
5269:     vi  = aj + ai[i];
5270:     nz  = ai[i+1] - ai[i];
5271:     idx = 2*i;
5272:     s1  = b[idx];s2 = b[1+idx];
5273:     PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
5274:     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
5275:     for (k=0; k<nz; k++) {
5276:       jdx = 2*vi[k];
5277:       x1  = x[jdx];x2 = x[1+jdx];
5278:       s1 -= v[0]*x1 + v[2]*x2;
5279:       s2 -= v[1]*x1 + v[3]*x2;
5280:       v  +=  4;
5281:     }
5282:     x[idx]   = s1;
5283:     x[1+idx] = s2;
5284:   }

5286:   /* backward solve the upper triangular */
5287:   for (i=n-1; i>=0; i--) {
5288:     v   = aa + 4*(adiag[i+1]+1);
5289:     vi  = aj + adiag[i+1]+1;
5290:     nz  = adiag[i] - adiag[i+1]-1;
5291:     idt = 2*i;
5292:     s1  = x[idt];  s2 = x[1+idt];
5293:     PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
5294:     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
5295:     for (k=0; k<nz; k++) {
5296:       idx = 2*vi[k];
5297:       x1  = x[idx];   x2 = x[1+idx];
5298:       s1 -= v[0]*x1 + v[2]*x2;
5299:       s2 -= v[1]*x1 + v[3]*x2;
5300:       v  += 4;
5301:     }
5302:     /* x = inv_diagonal*x */
5303:     x[idt]   = v[0]*s1 + v[2]*s2;
5304:     x[1+idt] = v[1]*s1 + v[3]*s2;
5305:   }

5307:   VecRestoreArrayRead(bb,&b);
5308:   VecRestoreArray(xx,&x);
5309:   PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
5310:   return(0);
5311: }

5315: PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
5316: {
5317:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
5318:   IS                iscol=a->col,isrow=a->row;
5319:   PetscErrorCode    ierr;
5320:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5321:   PetscInt          i,nz;
5322:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
5323:   const MatScalar   *aa=a->a,*v;
5324:   PetscScalar       *x,s1,*t;
5325:   const PetscScalar *b;

5328:   if (!n) return(0);

5330:   VecGetArrayRead(bb,&b);
5331:   VecGetArray(xx,&x);
5332:   t    = a->solve_work;

5334:   ISGetIndices(isrow,&rout); r = rout;
5335:   ISGetIndices(iscol,&cout); c = cout + (n-1);

5337:   /* forward solve the lower triangular */
5338:   t[0] = b[*r++];
5339:   for (i=1; i<n; i++) {
5340:     v  = aa + ai[i];
5341:     vi = aj + ai[i];
5342:     nz = diag[i] - ai[i];
5343:     s1 = b[*r++];
5344:     while (nz--) {
5345:       s1 -= (*v++)*t[*vi++];
5346:     }
5347:     t[i] = s1;
5348:   }
5349:   /* backward solve the upper triangular */
5350:   for (i=n-1; i>=0; i--) {
5351:     v  = aa + diag[i] + 1;
5352:     vi = aj + diag[i] + 1;
5353:     nz = ai[i+1] - diag[i] - 1;
5354:     s1 = t[i];
5355:     while (nz--) {
5356:       s1 -= (*v++)*t[*vi++];
5357:     }
5358:     x[*c--] = t[i] = aa[diag[i]]*s1;
5359:   }

5361:   ISRestoreIndices(isrow,&rout);
5362:   ISRestoreIndices(iscol,&cout);
5363:   VecRestoreArrayRead(bb,&b);
5364:   VecRestoreArray(xx,&x);
5365:   PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);
5366:   return(0);
5367: }

5371: PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
5372: {
5373:   Mat_SeqBAIJ       *a    = (Mat_SeqBAIJ*)A->data;
5374:   IS                iscol = a->col,isrow = a->row;
5375:   PetscErrorCode    ierr;
5376:   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz;
5377:   const PetscInt    *rout,*cout,*r,*c;
5378:   PetscScalar       *x,*tmp,sum;
5379:   const PetscScalar *b;
5380:   const MatScalar   *aa = a->a,*v;

5383:   if (!n) return(0);

5385:   VecGetArrayRead(bb,&b);
5386:   VecGetArray(xx,&x);
5387:   tmp  = a->solve_work;

5389:   ISGetIndices(isrow,&rout); r = rout;
5390:   ISGetIndices(iscol,&cout); c = cout;

5392:   /* forward solve the lower triangular */
5393:   tmp[0] = b[r[0]];
5394:   v      = aa;
5395:   vi     = aj;
5396:   for (i=1; i<n; i++) {
5397:     nz  = ai[i+1] - ai[i];
5398:     sum = b[r[i]];
5399:     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
5400:     tmp[i] = sum;
5401:     v     += nz; vi += nz;
5402:   }

5404:   /* backward solve the upper triangular */
5405:   for (i=n-1; i>=0; i--) {
5406:     v   = aa + adiag[i+1]+1;
5407:     vi  = aj + adiag[i+1]+1;
5408:     nz  = adiag[i]-adiag[i+1]-1;
5409:     sum = tmp[i];
5410:     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
5411:     x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */
5412:   }

5414:   ISRestoreIndices(isrow,&rout);
5415:   ISRestoreIndices(iscol,&cout);
5416:   VecRestoreArrayRead(bb,&b);
5417:   VecRestoreArray(xx,&x);
5418:   PetscLogFlops(2*a->nz - A->cmap->n);
5419:   return(0);
5420: }

5422: /*
5423:       Special case where the matrix was ILU(0) factored in the natural
5424:    ordering. This eliminates the need for the column and row permutation.
5425: */
5428: PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
5429: {
5430:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
5431:   const PetscInt    n  = a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5432:   PetscErrorCode    ierr;
5433:   const MatScalar   *aa=a->a,*v;
5434:   PetscScalar       *x;
5435:   const PetscScalar *b;
5436:   PetscScalar       s1,x1;
5437:   PetscInt          jdx,idt,idx,nz,i;

5440:   VecGetArrayRead(bb,&b);
5441:   VecGetArray(xx,&x);

5443:   /* forward solve the lower triangular */
5444:   idx  = 0;
5445:   x[0] = b[0];
5446:   for (i=1; i<n; i++) {
5447:     v    =  aa      + ai[i];
5448:     vi   =  aj      + ai[i];
5449:     nz   =  diag[i] - ai[i];
5450:     idx +=  1;
5451:     s1   =  b[idx];
5452:     while (nz--) {
5453:       jdx = *vi++;
5454:       x1  = x[jdx];
5455:       s1 -= v[0]*x1;
5456:       v  += 1;
5457:     }
5458:     x[idx] = s1;
5459:   }
5460:   /* backward solve the upper triangular */
5461:   for (i=n-1; i>=0; i--) {
5462:     v   = aa + diag[i] + 1;
5463:     vi  = aj + diag[i] + 1;
5464:     nz  = ai[i+1] - diag[i] - 1;
5465:     idt = i;
5466:     s1  = x[idt];
5467:     while (nz--) {
5468:       idx = *vi++;
5469:       x1  = x[idx];
5470:       s1 -= v[0]*x1;
5471:       v  += 1;
5472:     }
5473:     v      = aa +  diag[i];
5474:     x[idt] = v[0]*s1;
5475:   }
5476:   VecRestoreArrayRead(bb,&b);
5477:   VecRestoreArray(xx,&x);
5478:   PetscLogFlops(2.0*(a->nz) - A->cmap->n);
5479:   return(0);
5480: }


5485: PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
5486: {
5487:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
5488:   PetscErrorCode    ierr;
5489:   const PetscInt    n = a->mbs,*ai = a->i,*aj = a->j,*adiag = a->diag,*vi;
5490:   PetscScalar       *x,sum;
5491:   const PetscScalar *b;
5492:   const MatScalar   *aa = a->a,*v;
5493:   PetscInt          i,nz;

5496:   if (!n) return(0);

5498:   VecGetArrayRead(bb,&b);
5499:   VecGetArray(xx,&x);

5501:   /* forward solve the lower triangular */
5502:   x[0] = b[0];
5503:   v    = aa;
5504:   vi   = aj;
5505:   for (i=1; i<n; i++) {
5506:     nz  = ai[i+1] - ai[i];
5507:     sum = b[i];
5508:     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
5509:     v   += nz;
5510:     vi  += nz;
5511:     x[i] = sum;
5512:   }

5514:   /* backward solve the upper triangular */
5515:   for (i=n-1; i>=0; i--) {
5516:     v   = aa + adiag[i+1] + 1;
5517:     vi  = aj + adiag[i+1] + 1;
5518:     nz  = adiag[i] - adiag[i+1]-1;
5519:     sum = x[i];
5520:     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
5521:     x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */
5522:   }

5524:   PetscLogFlops(2.0*a->nz - A->cmap->n);
5525:   VecRestoreArrayRead(bb,&b);
5526:   VecRestoreArray(xx,&x);
5527:   return(0);
5528: }

5530: /* ----------------------------------------------------------------*/
5531: extern PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscBool);

5535: /*
5536:    This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes
5537: */
5538: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
5539: {
5540:   Mat             C =B;
5541:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
5542:   PetscErrorCode  ierr;
5543:   PetscInt        i,j,k,ipvt[15];
5544:   const PetscInt  n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj;
5545:   PetscInt        nz,nzL,row;
5546:   MatScalar       *rtmp,*pc,*mwork,*pv,*vv,work[225];
5547:   const MatScalar *v,*aa=a->a;
5548:   PetscInt        bs2 = a->bs2,bs=A->rmap->bs,flg;
5549:   PetscInt        sol_ver;

5552:   PetscOptionsGetInt(((PetscObject)A)->prefix,"-sol_ver",&sol_ver,NULL);

5554:   /* generate work space needed by the factorization */
5555:   PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);
5556:   PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));

5558:   for (i=0; i<n; i++) {
5559:     /* zero rtmp */
5560:     /* L part */
5561:     nz    = bi[i+1] - bi[i];
5562:     bjtmp = bj + bi[i];
5563:     for  (j=0; j<nz; j++) {
5564:       PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
5565:     }

5567:     /* U part */
5568:     nz    = bdiag[i] - bdiag[i+1];
5569:     bjtmp = bj + bdiag[i+1]+1;
5570:     for  (j=0; j<nz; j++) {
5571:       PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
5572:     }

5574:     /* load in initial (unfactored row) */
5575:     nz    = ai[i+1] - ai[i];
5576:     ajtmp = aj + ai[i];
5577:     v     = aa + bs2*ai[i];
5578:     for (j=0; j<nz; j++) {
5579:       PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));
5580:     }

5582:     /* elimination */
5583:     bjtmp = bj + bi[i];
5584:     nzL   = bi[i+1] - bi[i];
5585:     for (k=0; k < nzL; k++) {
5586:       row = bjtmp[k];
5587:       pc  = rtmp + bs2*row;
5588:       for (flg=0,j=0; j<bs2; j++) {
5589:         if (pc[j]!=0.0) {
5590:           flg = 1;
5591:           break;
5592:         }
5593:       }
5594:       if (flg) {
5595:         pv = b->a + bs2*bdiag[row];
5596:         PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork);
5597:         /*PetscKernel_A_gets_A_times_B_15(pc,pv,mwork);*/
5598:         pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5599:         pv = b->a + bs2*(bdiag[row+1]+1);
5600:         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5601:         for (j=0; j<nz; j++) {
5602:           vv = rtmp + bs2*pj[j];
5603:           PetscKernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv);
5604:           /* PetscKernel_A_gets_A_minus_B_times_C_15(vv,pc,pv); */
5605:           pv += bs2;
5606:         }
5607:         PetscLogFlops(2*bs2*bs*(nz+1)-bs2); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5608:       }
5609:     }

5611:     /* finished row so stick it into b->a */
5612:     /* L part */
5613:     pv = b->a + bs2*bi[i];
5614:     pj = b->j + bi[i];
5615:     nz = bi[i+1] - bi[i];
5616:     for (j=0; j<nz; j++) {
5617:       PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
5618:     }

5620:     /* Mark diagonal and invert diagonal for simplier triangular solves */
5621:     pv   = b->a + bs2*bdiag[i];
5622:     pj   = b->j + bdiag[i];
5623:     PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));
5624:     /* PetscKernel_A_gets_inverse_A(bs,pv,pivots,work); */
5625:     PetscKernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftamount);

5627:     /* U part */
5628:     pv = b->a + bs2*(bdiag[i+1]+1);
5629:     pj = b->j + bdiag[i+1]+1;
5630:     nz = bdiag[i] - bdiag[i+1] - 1;
5631:     for (j=0; j<nz; j++) {
5632:       PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
5633:     }
5634:   }

5636:   PetscFree2(rtmp,mwork);

5638:   C->ops->solve          = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1;
5639:   C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering;
5640:   C->assembled           = PETSC_TRUE;

5642:   PetscLogFlops(1.333333333333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
5643:   return(0);
5644: }

5648: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info)
5649: {
5650:   Mat            C     =B;
5651:   Mat_SeqBAIJ    *a    =(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
5652:   IS             isrow = b->row,isicol = b->icol;
5654:   const PetscInt *r,*ic;
5655:   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
5656:   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
5657:   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
5658:   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
5659:   MatScalar      *v_work;
5660:   PetscBool      col_identity,row_identity,both_identity;

5663:   ISGetIndices(isrow,&r);
5664:   ISGetIndices(isicol,&ic);

5666:   PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);
5667:   PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));

5669:   /* generate work space needed by dense LU factorization */
5670:   PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);

5672:   for (i=0; i<n; i++) {
5673:     /* zero rtmp */
5674:     /* L part */
5675:     nz    = bi[i+1] - bi[i];
5676:     bjtmp = bj + bi[i];
5677:     for  (j=0; j<nz; j++) {
5678:       PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
5679:     }

5681:     /* U part */
5682:     nz    = bdiag[i] - bdiag[i+1];
5683:     bjtmp = bj + bdiag[i+1]+1;
5684:     for  (j=0; j<nz; j++) {
5685:       PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
5686:     }

5688:     /* load in initial (unfactored row) */
5689:     nz    = ai[r[i]+1] - ai[r[i]];
5690:     ajtmp = aj + ai[r[i]];
5691:     v     = aa + bs2*ai[r[i]];
5692:     for (j=0; j<nz; j++) {
5693:       PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));
5694:     }

5696:     /* elimination */
5697:     bjtmp = bj + bi[i];
5698:     nzL   = bi[i+1] - bi[i];
5699:     for (k=0; k < nzL; k++) {
5700:       row = bjtmp[k];
5701:       pc  = rtmp + bs2*row;
5702:       for (flg=0,j=0; j<bs2; j++) {
5703:         if (pc[j]!=0.0) {
5704:           flg = 1;
5705:           break;
5706:         }
5707:       }
5708:       if (flg) {
5709:         pv = b->a + bs2*bdiag[row];
5710:         PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
5711:         pj = b->j + bdiag[row+1]+1;         /* begining of U(row,:) */
5712:         pv = b->a + bs2*(bdiag[row+1]+1);
5713:         nz = bdiag[row] - bdiag[row+1] - 1;         /* num of entries inU(row,:), excluding diag */
5714:         for (j=0; j<nz; j++) {
5715:           PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
5716:         }
5717:         PetscLogFlops(2*bs2*bs*(nz+1)-bs2); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5718:       }
5719:     }

5721:     /* finished row so stick it into b->a */
5722:     /* L part */
5723:     pv = b->a + bs2*bi[i];
5724:     pj = b->j + bi[i];
5725:     nz = bi[i+1] - bi[i];
5726:     for (j=0; j<nz; j++) {
5727:       PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
5728:     }

5730:     /* Mark diagonal and invert diagonal for simplier triangular solves */
5731:     pv = b->a + bs2*bdiag[i];
5732:     pj = b->j + bdiag[i];
5733:     /* if (*pj != i)SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
5734:     PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));
5735:     PetscKernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);

5737:     /* U part */
5738:     pv = b->a + bs2*(bdiag[i+1]+1);
5739:     pj = b->j + bdiag[i+1]+1;
5740:     nz = bdiag[i] - bdiag[i+1] - 1;
5741:     for (j=0; j<nz; j++) {
5742:       PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
5743:     }
5744:   }

5746:   PetscFree(rtmp);
5747:   PetscFree3(v_work,mwork,v_pivots);
5748:   ISRestoreIndices(isicol,&ic);
5749:   ISRestoreIndices(isrow,&r);

5751:   ISIdentity(isrow,&row_identity);
5752:   ISIdentity(isicol,&col_identity);

5754:   both_identity = (PetscBool) (row_identity && col_identity);
5755:   if (both_identity) {
5756:     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
5757:   } else {
5758:     C->ops->solve = MatSolve_SeqBAIJ_N;
5759:   }
5760:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N;

5762:   C->assembled = PETSC_TRUE;

5764:   PetscLogFlops(1.333333333333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
5765:   return(0);
5766: }

5768: /*
5769:    ilu(0) with natural ordering under new data structure.
5770:    See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description
5771:    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace().
5772: */

5776: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5777: {

5779:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
5781:   PetscInt       n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
5782:   PetscInt       i,j,nz,*bi,*bj,*bdiag,bi_temp;

5785:   MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);
5786:   b    = (Mat_SeqBAIJ*)(fact)->data;

5788:   /* allocate matrix arrays for new data structure */
5789:   PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);
5790:   PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));

5792:   b->singlemalloc    = PETSC_TRUE;
5793:   b->free_a          = PETSC_TRUE;
5794:   b->free_ij         = PETSC_TRUE;
5795:   fact->preallocated = PETSC_TRUE;
5796:   fact->assembled    = PETSC_TRUE;
5797:   if (!b->diag) {
5798:     PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);
5799:     PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));
5800:   }
5801:   bdiag = b->diag;

5803:   if (n > 0) {
5804:     PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));
5805:   }

5807:   /* set bi and bj with new data structure */
5808:   bi = b->i;
5809:   bj = b->j;

5811:   /* L part */
5812:   bi[0] = 0;
5813:   for (i=0; i<n; i++) {
5814:     nz      = adiag[i] - ai[i];
5815:     bi[i+1] = bi[i] + nz;
5816:     aj      = a->j + ai[i];
5817:     for (j=0; j<nz; j++) {
5818:       *bj = aj[j]; bj++;
5819:     }
5820:   }

5822:   /* U part */
5823:   bi_temp  = bi[n];
5824:   bdiag[n] = bi[n]-1;
5825:   for (i=n-1; i>=0; i--) {
5826:     nz      = ai[i+1] - adiag[i] - 1;
5827:     bi_temp = bi_temp + nz + 1;
5828:     aj      = a->j + adiag[i] + 1;
5829:     for (j=0; j<nz; j++) {
5830:       *bj = aj[j]; bj++;
5831:     }
5832:     /* diag[i] */
5833:     *bj      = i; bj++;
5834:     bdiag[i] = bi_temp - 1;
5835:   }
5836:   return(0);
5837: }

5841: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5842: {
5843:   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5844:   IS                 isicol;
5845:   PetscErrorCode     ierr;
5846:   const PetscInt     *r,*ic;
5847:   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
5848:   PetscInt           *bi,*cols,nnz,*cols_lvl;
5849:   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
5850:   PetscInt           i,levels,diagonal_fill;
5851:   PetscBool          col_identity,row_identity,both_identity;
5852:   PetscReal          f;
5853:   PetscInt           nlnk,*lnk,*lnk_lvl=NULL;
5854:   PetscBT            lnkbt;
5855:   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
5856:   PetscFreeSpaceList free_space    =NULL,current_space=NULL;
5857:   PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
5858:   PetscBool          missing;
5859:   PetscInt           bs=A->rmap->bs,bs2=a->bs2;

5862:   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
5863:   if (bs>1) {  /* check shifttype */
5864:     if (info->shifttype == MAT_SHIFT_NONZERO || info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE)
5865:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix");
5866:   }

5868:   MatMissingDiagonal(A,&missing,&d);
5869:   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);

5871:   f             = info->fill;
5872:   levels        = (PetscInt)info->levels;
5873:   diagonal_fill = (PetscInt)info->diagonal_fill;

5875:   ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);

5877:   ISIdentity(isrow,&row_identity);
5878:   ISIdentity(iscol,&col_identity);

5880:   both_identity = (PetscBool) (row_identity && col_identity);

5882:   if (!levels && both_identity) {
5883:     /* special case: ilu(0) with natural ordering */
5884:     MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);
5885:     MatSeqBAIJSetNumericFactorization(fact,both_identity);

5887:     fact->factortype               = MAT_FACTOR_ILU;
5888:     (fact)->info.factor_mallocs    = 0;
5889:     (fact)->info.fill_ratio_given  = info->fill;
5890:     (fact)->info.fill_ratio_needed = 1.0;

5892:     b                = (Mat_SeqBAIJ*)(fact)->data;
5893:     b->row           = isrow;
5894:     b->col           = iscol;
5895:     b->icol          = isicol;
5896:     PetscObjectReference((PetscObject)isrow);
5897:     PetscObjectReference((PetscObject)iscol);
5898:     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;

5900:     PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);
5901:     return(0);
5902:   }

5904:   ISGetIndices(isrow,&r);
5905:   ISGetIndices(isicol,&ic);

5907:   /* get new row pointers */
5908:   PetscMalloc((n+1)*sizeof(PetscInt),&bi);
5909:   bi[0] = 0;
5910:   /* bdiag is location of diagonal in factor */
5911:   PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);
5912:   bdiag[0] = 0;

5914:   PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);

5916:   /* create a linked list for storing column indices of the active row */
5917:   nlnk = n + 1;
5918:   PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);

5920:   /* initial FreeSpace size is f*(ai[n]+1) */
5921:   PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);
5922:   current_space     = free_space;
5923:   PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);
5924:   current_space_lvl = free_space_lvl;

5926:   for (i=0; i<n; i++) {
5927:     nzi = 0;
5928:     /* copy current row into linked list */
5929:     nnz = ai[r[i]+1] - ai[r[i]];
5930:     if (!nnz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
5931:     cols   = aj + ai[r[i]];
5932:     lnk[i] = -1; /* marker to indicate if diagonal exists */
5933:     PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);
5934:     nzi   += nlnk;

5936:     /* make sure diagonal entry is included */
5937:     if (diagonal_fill && lnk[i] == -1) {
5938:       fm = n;
5939:       while (lnk[fm] < i) fm = lnk[fm];
5940:       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
5941:       lnk[fm]    = i;
5942:       lnk_lvl[i] = 0;
5943:       nzi++; dcount++;
5944:     }

5946:     /* add pivot rows into the active row */
5947:     nzbd = 0;
5948:     prow = lnk[n];
5949:     while (prow < i) {
5950:       nnz      = bdiag[prow];
5951:       cols     = bj_ptr[prow] + nnz + 1;
5952:       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
5953:       nnz      = bi[prow+1] - bi[prow] - nnz - 1;

5955:       PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);
5956:       nzi += nlnk;
5957:       prow = lnk[prow];
5958:       nzbd++;
5959:     }
5960:     bdiag[i] = nzbd;
5961:     bi[i+1]  = bi[i] + nzi;

5963:     /* if free space is not available, make more free space */
5964:     if (current_space->local_remaining<nzi) {
5965:       nnz  = 2*nzi*(n - i); /* estimated and max additional space needed */
5966:       PetscFreeSpaceGet(nnz,&current_space);
5967:       PetscFreeSpaceGet(nnz,&current_space_lvl);
5968:       reallocs++;
5969:     }

5971:     /* copy data into free_space and free_space_lvl, then initialize lnk */
5972:     PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);

5974:     bj_ptr[i]    = current_space->array;
5975:     bjlvl_ptr[i] = current_space_lvl->array;

5977:     /* make sure the active row i has diagonal entry */
5978:     if (*(bj_ptr[i]+bdiag[i]) != i) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);

5980:     current_space->array           += nzi;
5981:     current_space->local_used      += nzi;
5982:     current_space->local_remaining -= nzi;

5984:     current_space_lvl->array           += nzi;
5985:     current_space_lvl->local_used      += nzi;
5986:     current_space_lvl->local_remaining -= nzi;
5987:   }

5989:   ISRestoreIndices(isrow,&r);
5990:   ISRestoreIndices(isicol,&ic);

5992:   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
5993:   PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);
5994:   PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);

5996:   PetscIncompleteLLDestroy(lnk,lnkbt);
5997:   PetscFreeSpaceDestroy(free_space_lvl);
5998:   PetscFree2(bj_ptr,bjlvl_ptr);

6000: #if defined(PETSC_USE_INFO)
6001:   {
6002:     PetscReal af = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
6003:     PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);
6004:     PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);
6005:     PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);
6006:     PetscInfo(A,"for best performance.\n");
6007:     if (diagonal_fill) {
6008:       PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);
6009:     }
6010:   }
6011: #endif

6013:   /* put together the new matrix */
6014:   MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,NULL);
6015:   PetscLogObjectParent(fact,isicol);

6017:   b               = (Mat_SeqBAIJ*)(fact)->data;
6018:   b->free_a       = PETSC_TRUE;
6019:   b->free_ij      = PETSC_TRUE;
6020:   b->singlemalloc = PETSC_FALSE;

6022:   PetscMalloc((bs2*(bdiag[0]+1))*sizeof(MatScalar),&b->a);

6024:   b->j          = bj;
6025:   b->i          = bi;
6026:   b->diag       = bdiag;
6027:   b->free_diag  = PETSC_TRUE;
6028:   b->ilen       = 0;
6029:   b->imax       = 0;
6030:   b->row        = isrow;
6031:   b->col        = iscol;
6032:   PetscObjectReference((PetscObject)isrow);
6033:   PetscObjectReference((PetscObject)iscol);
6034:   b->icol       = isicol;

6036:   PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);
6037:   /* In b structure:  Free imax, ilen, old a, old j.
6038:      Allocate bdiag, solve_work, new a, new j */
6039:   PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));
6040:   b->maxnz = b->nz = bdiag[0]+1;

6042:   fact->info.factor_mallocs    = reallocs;
6043:   fact->info.fill_ratio_given  = f;
6044:   fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);

6046:   MatSeqBAIJSetNumericFactorization(fact,both_identity);
6047:   return(0);
6048: }

6050: /*
6051:      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
6052:    except that the data structure of Mat_SeqAIJ is slightly different.
6053:    Not a good example of code reuse.
6054: */
6057: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
6058: {
6059:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
6060:   IS             isicol;
6062:   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
6063:   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
6064:   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
6065:   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
6066:   PetscBool      col_identity,row_identity,both_identity,flg;
6067:   PetscReal      f;

6070:   MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);
6071:   if (flg) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);

6073:   f             = info->fill;
6074:   levels        = (PetscInt)info->levels;
6075:   diagonal_fill = (PetscInt)info->diagonal_fill;

6077:   ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);

6079:   ISIdentity(isrow,&row_identity);
6080:   ISIdentity(iscol,&col_identity);
6081:   both_identity = (PetscBool) (row_identity && col_identity);

6083:   if (!levels && both_identity) {  /* special case copy the nonzero structure */
6084:     MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);
6085:     MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);

6087:     fact->factortype = MAT_FACTOR_ILU;
6088:     b                = (Mat_SeqBAIJ*)fact->data;
6089:     b->row           = isrow;
6090:     b->col           = iscol;
6091:     PetscObjectReference((PetscObject)isrow);
6092:     PetscObjectReference((PetscObject)iscol);
6093:     b->icol          = isicol;
6094:     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;

6096:     PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);
6097:     return(0);
6098:   }

6100:   /* general case perform the symbolic factorization */
6101:   ISGetIndices(isrow,&r);
6102:   ISGetIndices(isicol,&ic);

6104:   /* get new row pointers */
6105:   PetscMalloc((n+1)*sizeof(PetscInt),&ainew);
6106:   ainew[0] = 0;
6107:   /* don't know how many column pointers are needed so estimate */
6108:   jmax = (PetscInt)(f*ai[n] + 1);
6109:   PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);
6110:   /* ajfill is level of fill for each fill entry */
6111:   PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);
6112:   /* fill is a linked list of nonzeros in active row */
6113:   PetscMalloc((n+1)*sizeof(PetscInt),&fill);
6114:   /* im is level for each filled value */
6115:   PetscMalloc((n+1)*sizeof(PetscInt),&im);
6116:   /* dloc is location of diagonal in factor */
6117:   PetscMalloc((n+1)*sizeof(PetscInt),&dloc);
6118:   dloc[0] = 0;
6119:   for (prow=0; prow<n; prow++) {

6121:     /* copy prow into linked list */
6122:     nzf = nz = ai[r[prow]+1] - ai[r[prow]];
6123:     if (!nz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
6124:     xi         = aj + ai[r[prow]];
6125:     fill[n]    = n;
6126:     fill[prow] = -1;   /* marker for diagonal entry */
6127:     while (nz--) {
6128:       fm  = n;
6129:       idx = ic[*xi++];
6130:       do {
6131:         m  = fm;
6132:         fm = fill[m];
6133:       } while (fm < idx);
6134:       fill[m]   = idx;
6135:       fill[idx] = fm;
6136:       im[idx]   = 0;
6137:     }

6139:     /* make sure diagonal entry is included */
6140:     if (diagonal_fill && fill[prow] == -1) {
6141:       fm = n;
6142:       while (fill[fm] < prow) fm = fill[fm];
6143:       fill[prow] = fill[fm];    /* insert diagonal into linked list */
6144:       fill[fm]   = prow;
6145:       im[prow]   = 0;
6146:       nzf++;
6147:       dcount++;
6148:     }

6150:     nzi = 0;
6151:     row = fill[n];
6152:     while (row < prow) {
6153:       incrlev = im[row] + 1;
6154:       nz      = dloc[row];
6155:       xi      = ajnew  + ainew[row] + nz + 1;
6156:       flev    = ajfill + ainew[row] + nz + 1;
6157:       nnz     = ainew[row+1] - ainew[row] - nz - 1;
6158:       fm      = row;
6159:       while (nnz-- > 0) {
6160:         idx = *xi++;
6161:         if (*flev + incrlev > levels) {
6162:           flev++;
6163:           continue;
6164:         }
6165:         do {
6166:           m  = fm;
6167:           fm = fill[m];
6168:         } while (fm < idx);
6169:         if (fm != idx) {
6170:           im[idx]   = *flev + incrlev;
6171:           fill[m]   = idx;
6172:           fill[idx] = fm;
6173:           fm        = idx;
6174:           nzf++;
6175:         } else if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
6176:         flev++;
6177:       }
6178:       row = fill[row];
6179:       nzi++;
6180:     }
6181:     /* copy new filled row into permanent storage */
6182:     ainew[prow+1] = ainew[prow] + nzf;
6183:     if (ainew[prow+1] > jmax) {

6185:       /* estimate how much additional space we will need */
6186:       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
6187:       /* just double the memory each time */
6188:       PetscInt maxadd = jmax;
6189:       /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
6190:       if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
6191:       jmax += maxadd;

6193:       /* allocate a longer ajnew and ajfill */
6194:       PetscMalloc(jmax*sizeof(PetscInt),&xitmp);
6195:       PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));
6196:       PetscFree(ajnew);
6197:       ajnew  = xitmp;
6198:       PetscMalloc(jmax*sizeof(PetscInt),&xitmp);
6199:       PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));
6200:       PetscFree(ajfill);
6201:       ajfill = xitmp;
6202:       reallocate++;   /* count how many reallocations are needed */
6203:     }
6204:     xitmp      = ajnew + ainew[prow];
6205:     flev       = ajfill + ainew[prow];
6206:     dloc[prow] = nzi;
6207:     fm         = fill[n];
6208:     while (nzf--) {
6209:       *xitmp++ = fm;
6210:       *flev++  = im[fm];
6211:       fm       = fill[fm];
6212:     }
6213:     /* make sure row has diagonal entry */
6214:     if (ajnew[ainew[prow]+dloc[prow]] != prow) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
6215:                                                         try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
6216:   }
6217:   PetscFree(ajfill);
6218:   ISRestoreIndices(isrow,&r);
6219:   ISRestoreIndices(isicol,&ic);
6220:   PetscFree(fill);
6221:   PetscFree(im);

6223: #if defined(PETSC_USE_INFO)
6224:   {
6225:     PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
6226:     PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);
6227:     PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
6228:     PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);
6229:     PetscInfo(A,"for best performance.\n");
6230:     if (diagonal_fill) {
6231:       PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);
6232:     }
6233:   }
6234: #endif

6236:   /* put together the new matrix */
6237:   MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,NULL);
6238:   PetscLogObjectParent(fact,isicol);
6239:   b    = (Mat_SeqBAIJ*)fact->data;

6241:   b->free_a       = PETSC_TRUE;
6242:   b->free_ij      = PETSC_TRUE;
6243:   b->singlemalloc = PETSC_FALSE;

6245:   PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);

6247:   b->j          = ajnew;
6248:   b->i          = ainew;
6249:   for (i=0; i<n; i++) dloc[i] += ainew[i];
6250:   b->diag          = dloc;
6251:   b->free_diag     = PETSC_TRUE;
6252:   b->ilen          = 0;
6253:   b->imax          = 0;
6254:   b->row           = isrow;
6255:   b->col           = iscol;
6256:   b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;

6258:   PetscObjectReference((PetscObject)isrow);
6259:   PetscObjectReference((PetscObject)iscol);
6260:   b->icol = isicol;
6261:   PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);
6262:   /* In b structure:  Free imax, ilen, old a, old j.
6263:      Allocate dloc, solve_work, new a, new j */
6264:   PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));
6265:   b->maxnz = b->nz = ainew[n];

6267:   fact->info.factor_mallocs    = reallocate;
6268:   fact->info.fill_ratio_given  = f;
6269:   fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);

6271:   MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);
6272:   return(0);
6273: }

6277: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
6278: {
6279:   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; */
6280:   /* int i,*AJ=a->j,nz=a->nz; */

6283:   /* Undo Column scaling */
6284:   /*    while (nz--) { */
6285:   /*      AJ[i] = AJ[i]/4; */
6286:   /*    } */
6287:   /* This should really invoke a push/pop logic, but we don't have that yet. */
6288:   A->ops->setunfactored = NULL;
6289:   return(0);
6290: }

6294: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
6295: {
6296:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
6297:   PetscInt       *AJ=a->j,nz=a->nz;
6298:   unsigned short *aj=(unsigned short*)AJ;

6301:   /* Is this really necessary? */
6302:   while (nz--) {
6303:     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
6304:   }
6305:   A->ops->setunfactored = NULL;
6306:   return(0);
6307: }