Actual source code: baijfact2.c

petsc-3.3-p7 2013-05-11
  2: /*
  3:     Factorization code for BAIJ format. 
  4: */

  6: #include <../src/mat/impls/baij/seq/baij.h>
  7: #include <../src/mat/blockinvert.h>
  8: #include <petscbt.h>
  9: #include <../src/mat/utils/freespace.h>

 13: PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
 14: {
 15:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
 16:   PetscErrorCode    ierr;
 17:   const PetscInt    *adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
 18:   PetscInt          i,n = a->mbs,j;
 19:   PetscInt          nz;
 20:   PetscScalar       *x,*tmp,s1;
 21:   const MatScalar   *aa = a->a,*v;
 22:   const PetscScalar *b;

 25:   VecGetArrayRead(bb,&b);
 26:   VecGetArray(xx,&x);
 27:   tmp  = a->solve_work;


 30:   /* copy the b into temp work space according to permutation */
 31:   for (i=0; i<n; i++) tmp[i] = b[i];

 33:   /* forward solve the U^T */
 34:   for (i=0; i<n; i++) {
 35:     v   = aa + adiag[i+1] + 1;
 36:     vi  = aj + adiag[i+1] + 1;
 37:     nz  = adiag[i] - adiag[i+1] - 1;
 38:     s1  = tmp[i];
 39:     s1 *= v[nz];  /* multiply by inverse of diagonal entry */
 40:     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
 41:     tmp[i] = s1;
 42:   }

 44:   /* backward solve the L^T */
 45:   for (i=n-1; i>=0; i--){
 46:     v   = aa + ai[i];
 47:     vi  = aj + ai[i];
 48:     nz  = ai[i+1] - ai[i];
 49:     s1  = tmp[i];
 50:     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
 51:   }

 53:   /* copy tmp into x according to permutation */
 54:   for (i=0; i<n; i++) x[i] = tmp[i];

 56:   VecRestoreArrayRead(bb,&b);
 57:   VecRestoreArray(xx,&x);

 59:   PetscLogFlops(2.0*a->nz-A->cmap->n);
 60:   return(0);
 61: }

 65: PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
 66: {
 67:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
 68:   PetscErrorCode    ierr;
 69:   PetscInt          i,nz;
 70:   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
 71:   const MatScalar   *aa=a->a,*v;
 72:   PetscScalar       s1,*x;

 75:   VecCopy(bb,xx);
 76:   VecGetArray(xx,&x);
 77: 
 78:   /* forward solve the U^T */
 79:   for (i=0; i<n; i++) {

 81:     v     = aa + diag[i];
 82:     /* multiply by the inverse of the block diagonal */
 83:     s1    = (*v++)*x[i];
 84:     vi    = aj + diag[i] + 1;
 85:     nz    = ai[i+1] - diag[i] - 1;
 86:     while (nz--) {
 87:       x[*vi++]  -= (*v++)*s1;
 88:     }
 89:     x[i]   = s1;
 90:   }
 91:   /* backward solve the L^T */
 92:   for (i=n-1; i>=0; i--){
 93:     v    = aa + diag[i] - 1;
 94:     vi   = aj + diag[i] - 1;
 95:     nz   = diag[i] - ai[i];
 96:     s1   = x[i];
 97:     while (nz--) {
 98:       x[*vi--]   -=  (*v--)*s1;
 99:     }
100:   }
101:   VecRestoreArray(xx,&x);
102:   PetscLogFlops(2.0*(a->nz) - A->cmap->n);
103:   return(0);
104: }

108: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
109: {
110:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
111:   PetscErrorCode    ierr;
112:   PetscInt          i,nz,idx,idt,oidx;
113:   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
114:   const MatScalar   *aa=a->a,*v;
115:   PetscScalar       s1,s2,x1,x2,*x;

118:   VecCopy(bb,xx);
119:   VecGetArray(xx,&x);

121:   /* forward solve the U^T */
122:   idx = 0;
123:   for (i=0; i<n; i++) {

125:     v     = aa + 4*diag[i];
126:     /* multiply by the inverse of the block diagonal */
127:     x1 = x[idx];   x2 = x[1+idx];
128:     s1 = v[0]*x1  +  v[1]*x2;
129:     s2 = v[2]*x1  +  v[3]*x2;
130:     v += 4;

132:     vi    = aj + diag[i] + 1;
133:     nz    = ai[i+1] - diag[i] - 1;
134:     while (nz--) {
135:       oidx = 2*(*vi++);
136:       x[oidx]   -= v[0]*s1  +  v[1]*s2;
137:       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
138:       v  += 4;
139:     }
140:     x[idx]   = s1;x[1+idx] = s2;
141:     idx += 2;
142:   }
143:   /* backward solve the L^T */
144:   for (i=n-1; i>=0; i--){
145:     v    = aa + 4*diag[i] - 4;
146:     vi   = aj + diag[i] - 1;
147:     nz   = diag[i] - ai[i];
148:     idt  = 2*i;
149:     s1   = x[idt];  s2 = x[1+idt];
150:     while (nz--) {
151:       idx   = 2*(*vi--);
152:       x[idx]   -=  v[0]*s1 +  v[1]*s2;
153:       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
154:       v -= 4;
155:     }
156:   }
157:   VecRestoreArray(xx,&x);
158:   PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);
159:   return(0);
160: }

164: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
165: {
166:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
167:   PetscErrorCode    ierr;
168:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
169:   PetscInt          nz,idx,idt,j,i,oidx;
170:   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
171:   const MatScalar   *aa=a->a,*v;
172:   PetscScalar       s1,s2,x1,x2,*x;

175:   VecCopy(bb,xx);
176:   VecGetArray(xx,&x);

178:   /* forward solve the U^T */
179:   idx = 0;
180:   for (i=0; i<n; i++) {
181:     v     = aa + bs2*diag[i];
182:     /* multiply by the inverse of the block diagonal */
183:     x1 = x[idx];   x2 = x[1+idx];
184:     s1 = v[0]*x1  +  v[1]*x2;
185:     s2 = v[2]*x1  +  v[3]*x2;
186:     v -= bs2;

188:     vi    = aj + diag[i] - 1;
189:     nz    = diag[i] - diag[i+1] - 1;
190:     for(j=0;j>-nz;j--){
191:       oidx = bs*vi[j];
192:       x[oidx]   -= v[0]*s1  +  v[1]*s2;
193:       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
194:       v  -= bs2;
195:     }
196:     x[idx]   = s1;x[1+idx] = s2;
197:     idx += bs;
198:   }
199:   /* backward solve the L^T */
200:   for (i=n-1; i>=0; i--){
201:     v    = aa + bs2*ai[i];
202:     vi   = aj + ai[i];
203:     nz   = ai[i+1] - ai[i];
204:     idt  = bs*i;
205:     s1   = x[idt];  s2 = x[1+idt];
206:     for(j=0;j<nz;j++){
207:       idx   = bs*vi[j];
208:       x[idx]   -=  v[0]*s1 +  v[1]*s2;
209:       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
210:       v += bs2;
211:     }
212:   }
213:   VecRestoreArray(xx,&x);
214:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
215:   return(0);
216: }

220: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
221: {
222:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
223:   PetscErrorCode    ierr;
224:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
225:   PetscInt          i,nz,idx,idt,oidx;
226:   const MatScalar   *aa=a->a,*v;
227:   PetscScalar       s1,s2,s3,x1,x2,x3,*x;

230:   VecCopy(bb,xx);
231:   VecGetArray(xx,&x);

233:   /* forward solve the U^T */
234:   idx = 0;
235:   for (i=0; i<n; i++) {

237:     v     = aa + 9*diag[i];
238:     /* multiply by the inverse of the block diagonal */
239:     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
240:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
241:     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
242:     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
243:     v += 9;

245:     vi    = aj + diag[i] + 1;
246:     nz    = ai[i+1] - diag[i] - 1;
247:     while (nz--) {
248:       oidx = 3*(*vi++);
249:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
250:       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
251:       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
252:       v  += 9;
253:     }
254:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
255:     idx += 3;
256:   }
257:   /* backward solve the L^T */
258:   for (i=n-1; i>=0; i--){
259:     v    = aa + 9*diag[i] - 9;
260:     vi   = aj + diag[i] - 1;
261:     nz   = diag[i] - ai[i];
262:     idt  = 3*i;
263:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
264:     while (nz--) {
265:       idx   = 3*(*vi--);
266:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
267:       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
268:       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
269:       v -= 9;
270:     }
271:   }
272:   VecRestoreArray(xx,&x);
273:   PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);
274:   return(0);
275: }

279: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
280: {
281:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
282:   PetscErrorCode    ierr;
283:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
284:   PetscInt          nz,idx,idt,j,i,oidx;
285:   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
286:   const MatScalar   *aa=a->a,*v;
287:   PetscScalar       s1,s2,s3,x1,x2,x3,*x;

290:   VecCopy(bb,xx);
291:   VecGetArray(xx,&x);

293:   /* forward solve the U^T */
294:   idx = 0;
295:   for (i=0; i<n; i++) {
296:     v     = aa + bs2*diag[i];
297:     /* multiply by the inverse of the block diagonal */
298:     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];
299:     s1 = v[0]*x1  +  v[1]*x2  + v[2]*x3;
300:     s2 = v[3]*x1  +  v[4]*x2  + v[5]*x3;
301:     s3 = v[6]*x1  +  v[7]*x2  + v[8]*x3;
302:     v -= bs2;

304:     vi    = aj + diag[i] - 1;
305:     nz    = diag[i] - diag[i+1] - 1;
306:     for(j=0;j>-nz;j--){
307:       oidx = bs*vi[j];
308:       x[oidx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
309:       x[oidx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
310:       x[oidx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
311:       v  -= bs2;
312:     }
313:     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;
314:     idx += bs;
315:   }
316:   /* backward solve the L^T */
317:   for (i=n-1; i>=0; i--){
318:     v    = aa + bs2*ai[i];
319:     vi   = aj + ai[i];
320:     nz   = ai[i+1] - ai[i];
321:     idt  = bs*i;
322:     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];
323:     for(j=0;j<nz;j++){
324:       idx   = bs*vi[j];
325:       x[idx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
326:       x[idx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
327:       x[idx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
328:       v += bs2;
329:     }
330:   }
331:   VecRestoreArray(xx,&x);
332:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
333:   return(0);
334: }

338: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
339: {
340:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
341:   PetscErrorCode    ierr;
342:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
343:   PetscInt          i,nz,idx,idt,oidx;
344:   const MatScalar   *aa=a->a,*v;
345:   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x;

348:   VecCopy(bb,xx);
349:   VecGetArray(xx,&x);

351:   /* forward solve the U^T */
352:   idx = 0;
353:   for (i=0; i<n; i++) {

355:     v     = aa + 16*diag[i];
356:     /* multiply by the inverse of the block diagonal */
357:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
358:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
359:     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
360:     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
361:     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
362:     v += 16;

364:     vi    = aj + diag[i] + 1;
365:     nz    = ai[i+1] - diag[i] - 1;
366:     while (nz--) {
367:       oidx = 4*(*vi++);
368:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
369:       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
370:       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
371:       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
372:       v  += 16;
373:     }
374:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
375:     idx += 4;
376:   }
377:   /* backward solve the L^T */
378:   for (i=n-1; i>=0; i--){
379:     v    = aa + 16*diag[i] - 16;
380:     vi   = aj + diag[i] - 1;
381:     nz   = diag[i] - ai[i];
382:     idt  = 4*i;
383:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
384:     while (nz--) {
385:       idx   = 4*(*vi--);
386:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
387:       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
388:       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
389:       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
390:       v -= 16;
391:     }
392:   }
393:   VecRestoreArray(xx,&x);
394:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
395:   return(0);
396: }

400: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
401: {
402:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
403:   PetscErrorCode    ierr;
404:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
405:   PetscInt          nz,idx,idt,j,i,oidx;
406:   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
407:   const MatScalar   *aa=a->a,*v;
408:   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x;

411:   VecCopy(bb,xx);
412:   VecGetArray(xx,&x);

414:   /* forward solve the U^T */
415:   idx = 0;
416:   for (i=0; i<n; i++) {
417:     v     = aa + bs2*diag[i];
418:     /* multiply by the inverse of the block diagonal */
419:     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
420:     s1 =  v[0]*x1  +  v[1]*x2  + v[2]*x3  + v[3]*x4;
421:     s2 =  v[4]*x1  +  v[5]*x2  + v[6]*x3  + v[7]*x4;
422:     s3 =  v[8]*x1  +  v[9]*x2  + v[10]*x3 + v[11]*x4;
423:     s4 =  v[12]*x1 +  v[13]*x2 + v[14]*x3 + v[15]*x4;
424:     v -= bs2;

426:     vi    = aj + diag[i] - 1;
427:     nz    = diag[i] - diag[i+1] - 1;
428:     for(j=0;j>-nz;j--){
429:       oidx = bs*vi[j];
430:       x[oidx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
431:       x[oidx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
432:       x[oidx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
433:       x[oidx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
434:       v  -= bs2;
435:     }
436:     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4;
437:     idx += bs;
438:   }
439:   /* backward solve the L^T */
440:   for (i=n-1; i>=0; i--){
441:     v    = aa + bs2*ai[i];
442:     vi   = aj + ai[i];
443:     nz   = ai[i+1] - ai[i];
444:     idt  = bs*i;
445:     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];
446:     for(j=0;j<nz;j++){
447:       idx   = bs*vi[j];
448:       x[idx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
449:       x[idx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
450:       x[idx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
451:       x[idx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
452:       v += bs2;
453:     }
454:   }
455:   VecRestoreArray(xx,&x);
456:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
457:   return(0);
458: }

462: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
463: {
464:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
465:   PetscErrorCode    ierr;
466:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
467:   PetscInt          i,nz,idx,idt,oidx;
468:   const MatScalar   *aa=a->a,*v;
469:   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;

472:   VecCopy(bb,xx);
473:   VecGetArray(xx,&x);

475:   /* forward solve the U^T */
476:   idx = 0;
477:   for (i=0; i<n; i++) {

479:     v     = aa + 25*diag[i];
480:     /* multiply by the inverse of the block diagonal */
481:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
482:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
483:     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
484:     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
485:     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
486:     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
487:     v += 25;

489:     vi    = aj + diag[i] + 1;
490:     nz    = ai[i+1] - diag[i] - 1;
491:     while (nz--) {
492:       oidx = 5*(*vi++);
493:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
494:       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
495:       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
496:       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
497:       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
498:       v  += 25;
499:     }
500:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
501:     idx += 5;
502:   }
503:   /* backward solve the L^T */
504:   for (i=n-1; i>=0; i--){
505:     v    = aa + 25*diag[i] - 25;
506:     vi   = aj + diag[i] - 1;
507:     nz   = diag[i] - ai[i];
508:     idt  = 5*i;
509:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
510:     while (nz--) {
511:       idx   = 5*(*vi--);
512:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
513:       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
514:       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
515:       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
516:       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
517:       v -= 25;
518:     }
519:   }
520:   VecRestoreArray(xx,&x);
521:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
522:   return(0);
523: }

527: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
528: {
529:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
531:   const PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
532:   PetscInt       nz,idx,idt,j,i,oidx;
533:   const PetscInt       bs=A->rmap->bs,bs2=a->bs2;
534:   const MatScalar      *aa=a->a,*v;
535:   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;

538:   VecCopy(bb,xx);
539:   VecGetArray(xx,&x);

541:   /* forward solve the U^T */
542:   idx = 0;
543:   for (i=0; i<n; i++) {
544:     v     = aa + bs2*diag[i];
545:     /* multiply by the inverse of the block diagonal */
546:     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
547:     x5 = x[4+idx];
548:     s1 =  v[0]*x1   +  v[1]*x2   + v[2]*x3   + v[3]*x4   + v[4]*x5;
549:     s2 =  v[5]*x1   +  v[6]*x2   + v[7]*x3   + v[8]*x4   + v[9]*x5;
550:     s3 =  v[10]*x1  +  v[11]*x2  + v[12]*x3  + v[13]*x4  + v[14]*x5;
551:     s4 =  v[15]*x1  +  v[16]*x2  + v[17]*x3  + v[18]*x4  + v[19]*x5;
552:     s5 =  v[20]*x1  +  v[21]*x2  + v[22]*x3  + v[23]*x4   + v[24]*x5;
553:     v -= bs2;

555:     vi    = aj + diag[i] - 1;
556:     nz    = diag[i] - diag[i+1] - 1;
557:     for(j=0;j>-nz;j--){
558:       oidx = bs*vi[j];
559:       x[oidx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
560:       x[oidx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
561:       x[oidx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
562:       x[oidx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
563:       x[oidx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
564:       v  -= bs2;
565:     }
566:     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
567:     idx += bs;
568:   }
569:   /* backward solve the L^T */
570:   for (i=n-1; i>=0; i--){
571:     v    = aa + bs2*ai[i];
572:     vi   = aj + ai[i];
573:     nz   = ai[i+1] - ai[i];
574:     idt  = bs*i;
575:     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
576:     for(j=0;j<nz;j++){
577:       idx   = bs*vi[j];
578:       x[idx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
579:       x[idx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
580:       x[idx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
581:       x[idx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
582:       x[idx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
583:       v += bs2;
584:     }
585:   }
586:   VecRestoreArray(xx,&x);
587:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
588:   return(0);
589: }

593: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
594: {
595:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
596:   PetscErrorCode    ierr;
597:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
598:   PetscInt          i,nz,idx,idt,oidx;
599:   const MatScalar   *aa=a->a,*v;
600:   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;

603:   VecCopy(bb,xx);
604:   VecGetArray(xx,&x);

606:   /* forward solve the U^T */
607:   idx = 0;
608:   for (i=0; i<n; i++) {

610:     v     = aa + 36*diag[i];
611:     /* multiply by the inverse of the block diagonal */
612:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
613:     x6    = x[5+idx];
614:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
615:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
616:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
617:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
618:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
619:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
620:     v += 36;

622:     vi    = aj + diag[i] + 1;
623:     nz    = ai[i+1] - diag[i] - 1;
624:     while (nz--) {
625:       oidx = 6*(*vi++);
626:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
627:       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
628:       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
629:       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
630:       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
631:       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
632:       v  += 36;
633:     }
634:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
635:     x[5+idx] = s6;
636:     idx += 6;
637:   }
638:   /* backward solve the L^T */
639:   for (i=n-1; i>=0; i--){
640:     v    = aa + 36*diag[i] - 36;
641:     vi   = aj + diag[i] - 1;
642:     nz   = diag[i] - ai[i];
643:     idt  = 6*i;
644:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
645:     s6 = x[5+idt];
646:     while (nz--) {
647:       idx   = 6*(*vi--);
648:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
649:       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
650:       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
651:       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
652:       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
653:       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
654:       v -= 36;
655:     }
656:   }
657:   VecRestoreArray(xx,&x);
658:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
659:   return(0);
660: }

664: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
665: {
666:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
667:   PetscErrorCode    ierr;
668:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
669:   PetscInt          nz,idx,idt,j,i,oidx;
670:   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
671:   const MatScalar   *aa=a->a,*v;
672:   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;

675:   VecCopy(bb,xx);
676:   VecGetArray(xx,&x);

678:   /* forward solve the U^T */
679:   idx = 0;
680:   for (i=0; i<n; i++) {
681:     v     = aa + bs2*diag[i];
682:     /* multiply by the inverse of the block diagonal */
683:     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
684:     x5 = x[4+idx]; x6 = x[5+idx];
685:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
686:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
687:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
688:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
689:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
690:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
691:     v -= bs2;

693:     vi    = aj + diag[i] - 1;
694:     nz    = diag[i] - diag[i+1] - 1;
695:     for(j=0;j>-nz;j--){
696:       oidx = bs*vi[j];
697:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
698:       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
699:       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
700:       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
701:       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
702:       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
703:       v  -= bs2;
704:     }
705:     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
706:     x[5+idx] = s6;
707:     idx += bs;
708:   }
709:   /* backward solve the L^T */
710:   for (i=n-1; i>=0; i--){
711:     v    = aa + bs2*ai[i];
712:     vi   = aj + ai[i];
713:     nz   = ai[i+1] - ai[i];
714:     idt  = bs*i;
715:     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
716:     s6   = x[5+idt];
717:     for(j=0;j<nz;j++){
718:       idx   = bs*vi[j];
719:       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
720:       x[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
721:       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
722:       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
723:       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
724:       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
725:       v += bs2;
726:     }
727:   }
728:   VecRestoreArray(xx,&x);
729:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
730:   return(0);
731: }

735: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
736: {
737:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
738:   PetscErrorCode    ierr;
739:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
740:   PetscInt          i,nz,idx,idt,oidx;
741:   const MatScalar   *aa=a->a,*v;
742:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;

745:   VecCopy(bb,xx);
746:   VecGetArray(xx,&x);

748:   /* forward solve the U^T */
749:   idx = 0;
750:   for (i=0; i<n; i++) {

752:     v     = aa + 49*diag[i];
753:     /* multiply by the inverse of the block diagonal */
754:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
755:     x6    = x[5+idx]; x7 = x[6+idx];
756:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
757:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
758:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
759:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
760:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
761:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
762:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
763:     v += 49;

765:     vi    = aj + diag[i] + 1;
766:     nz    = ai[i+1] - diag[i] - 1;
767:     while (nz--) {
768:       oidx = 7*(*vi++);
769:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
770:       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
771:       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
772:       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
773:       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
774:       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
775:       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
776:       v  += 49;
777:     }
778:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
779:     x[5+idx] = s6;x[6+idx] = s7;
780:     idx += 7;
781:   }
782:   /* backward solve the L^T */
783:   for (i=n-1; i>=0; i--){
784:     v    = aa + 49*diag[i] - 49;
785:     vi   = aj + diag[i] - 1;
786:     nz   = diag[i] - ai[i];
787:     idt  = 7*i;
788:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
789:     s6 = x[5+idt];s7 = x[6+idt];
790:     while (nz--) {
791:       idx   = 7*(*vi--);
792:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
793:       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
794:       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
795:       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
796:       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
797:       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
798:       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
799:       v -= 49;
800:     }
801:   }
802:   VecRestoreArray(xx,&x);
803:   PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
804:   return(0);
805: }
808: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
809: {
810:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
811:   PetscErrorCode    ierr;
812:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
813:   PetscInt          nz,idx,idt,j,i,oidx;
814:   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
815:   const MatScalar   *aa=a->a,*v;
816:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;

819:   VecCopy(bb,xx);
820:   VecGetArray(xx,&x);

822:   /* forward solve the U^T */
823:   idx = 0;
824:   for (i=0; i<n; i++) {
825:     v     = aa + bs2*diag[i];
826:     /* multiply by the inverse of the block diagonal */
827:     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
828:     x5 = x[4+idx]; x6 = x[5+idx];  x7 = x[6+idx];
829:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
830:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
831:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
832:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
833:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
834:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
835:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
836:     v -= bs2;
837:     vi    = aj + diag[i] - 1;
838:     nz    = diag[i] - diag[i+1] - 1;
839:     for(j=0;j>-nz;j--){
840:       oidx = bs*vi[j];
841:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
842:       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
843:       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
844:       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
845:       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
846:       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
847:       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
848:       v  -= bs2;
849:     }
850:     x[idx]   = s1;  x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
851:     x[5+idx] = s6;  x[6+idx] = s7;
852:     idx += bs;
853:   }
854:   /* backward solve the L^T */
855:   for (i=n-1; i>=0; i--){
856:     v    = aa + bs2*ai[i];
857:     vi   = aj + ai[i];
858:     nz   = ai[i+1] - ai[i];
859:     idt  = bs*i;
860:     s1   = x[idt];    s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
861:     s6   = x[5+idt];  s7 = x[6+idt];
862:     for(j=0;j<nz;j++){
863:       idx   = bs*vi[j];
864:       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
865:       x[idx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
866:       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
867:       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
868:       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
869:       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
870:       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
871:       v += bs2;
872:     }
873:   }
874:   VecRestoreArray(xx,&x);
875:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
876:   return(0);
877: }

879: /*---------------------------------------------------------------------------------------------*/
882: PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
883: {
884:   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
885:   IS                iscol = a->col,isrow = a->row;
886:   PetscErrorCode    ierr;
887:   const PetscInt    *rout,*cout,*r,*c,*adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
888:   PetscInt          i,n = a->mbs,j;
889:   PetscInt          nz;
890:   PetscScalar       *x,*tmp,s1;
891:   const MatScalar   *aa = a->a,*v;
892:   const PetscScalar *b;

895:   VecGetArrayRead(bb,&b);
896:   VecGetArray(xx,&x);
897:   tmp  = a->solve_work;

899:   ISGetIndices(isrow,&rout); r = rout;
900:   ISGetIndices(iscol,&cout); c = cout;

902:   /* copy the b into temp work space according to permutation */
903:   for (i=0; i<n; i++) tmp[i] = b[c[i]];

905:   /* forward solve the U^T */
906:   for (i=0; i<n; i++) {
907:     v   = aa + adiag[i+1] + 1;
908:     vi  = aj + adiag[i+1] + 1;
909:     nz  = adiag[i] - adiag[i+1] - 1;
910:     s1  = tmp[i];
911:     s1 *= v[nz];  /* multiply by inverse of diagonal entry */
912:     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
913:     tmp[i] = s1;
914:   }

916:   /* backward solve the L^T */
917:   for (i=n-1; i>=0; i--){
918:     v   = aa + ai[i];
919:     vi  = aj + ai[i];
920:     nz  = ai[i+1] - ai[i];
921:     s1  = tmp[i];
922:     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
923:   }

925:   /* copy tmp into x according to permutation */
926:   for (i=0; i<n; i++) x[r[i]] = tmp[i];

928:   ISRestoreIndices(isrow,&rout);
929:   ISRestoreIndices(iscol,&cout);
930:   VecRestoreArrayRead(bb,&b);
931:   VecRestoreArray(xx,&x);

933:   PetscLogFlops(2.0*a->nz-A->cmap->n);
934:   return(0);
935: }

939: PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
940: {
941:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
942:   IS                iscol=a->col,isrow=a->row;
943:   PetscErrorCode    ierr;
944:   const PetscInt    *r,*c,*rout,*cout;
945:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
946:   PetscInt          i,nz;
947:   const MatScalar   *aa=a->a,*v;
948:   PetscScalar       s1,*x,*t;
949:   const PetscScalar *b;

952:   VecGetArrayRead(bb,&b);
953:   VecGetArray(xx,&x);
954:   t  = a->solve_work;

956:   ISGetIndices(isrow,&rout); r = rout;
957:   ISGetIndices(iscol,&cout); c = cout;

959:   /* copy the b into temp work space according to permutation */
960:   for (i=0; i<n; i++) {
961:     t[i] = b[c[i]];
962:   }

964:   /* forward solve the U^T */
965:   for (i=0; i<n; i++) {

967:     v     = aa + diag[i];
968:     /* multiply by the inverse of the block diagonal */
969:     s1    = (*v++)*t[i];
970:     vi    = aj + diag[i] + 1;
971:     nz    = ai[i+1] - diag[i] - 1;
972:     while (nz--) {
973:       t[*vi++]  -= (*v++)*s1;
974:     }
975:     t[i]   = s1;
976:   }
977:   /* backward solve the L^T */
978:   for (i=n-1; i>=0; i--){
979:     v    = aa + diag[i] - 1;
980:     vi   = aj + diag[i] - 1;
981:     nz   = diag[i] - ai[i];
982:     s1   = t[i];
983:     while (nz--) {
984:       t[*vi--]   -=  (*v--)*s1;
985:     }
986:   }

988:   /* copy t into x according to permutation */
989:   for (i=0; i<n; i++) {
990:     x[r[i]]   = t[i];
991:   }

993:   ISRestoreIndices(isrow,&rout);
994:   ISRestoreIndices(iscol,&cout);
995:   VecRestoreArrayRead(bb,&b);
996:   VecRestoreArray(xx,&x);
997:   PetscLogFlops(2.0*(a->nz) - A->cmap->n);
998:   return(0);
999: }

1003: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1004: {
1005:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1006:   IS                iscol=a->col,isrow=a->row;
1007:   PetscErrorCode    ierr;
1008:   const PetscInt    *r,*c,*rout,*cout;
1009:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1010:   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1011:   const MatScalar   *aa=a->a,*v;
1012:   PetscScalar       s1,s2,x1,x2,*x,*t;
1013:   const PetscScalar *b;

1016:   VecGetArrayRead(bb,&b);
1017:   VecGetArray(xx,&x);
1018:   t  = a->solve_work;

1020:   ISGetIndices(isrow,&rout); r = rout;
1021:   ISGetIndices(iscol,&cout); c = cout;

1023:   /* copy the b into temp work space according to permutation */
1024:   ii = 0;
1025:   for (i=0; i<n; i++) {
1026:     ic      = 2*c[i];
1027:     t[ii]   = b[ic];
1028:     t[ii+1] = b[ic+1];
1029:     ii += 2;
1030:   }

1032:   /* forward solve the U^T */
1033:   idx = 0;
1034:   for (i=0; i<n; i++) {

1036:     v     = aa + 4*diag[i];
1037:     /* multiply by the inverse of the block diagonal */
1038:     x1    = t[idx];   x2 = t[1+idx];
1039:     s1 = v[0]*x1  +  v[1]*x2;
1040:     s2 = v[2]*x1  +  v[3]*x2;
1041:     v += 4;

1043:     vi    = aj + diag[i] + 1;
1044:     nz    = ai[i+1] - diag[i] - 1;
1045:     while (nz--) {
1046:       oidx = 2*(*vi++);
1047:       t[oidx]   -= v[0]*s1  +  v[1]*s2;
1048:       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
1049:       v  += 4;
1050:     }
1051:     t[idx]   = s1;t[1+idx] = s2;
1052:     idx += 2;
1053:   }
1054:   /* backward solve the L^T */
1055:   for (i=n-1; i>=0; i--){
1056:     v    = aa + 4*diag[i] - 4;
1057:     vi   = aj + diag[i] - 1;
1058:     nz   = diag[i] - ai[i];
1059:     idt  = 2*i;
1060:     s1 = t[idt];  s2 = t[1+idt];
1061:     while (nz--) {
1062:       idx   = 2*(*vi--);
1063:       t[idx]   -=  v[0]*s1 +  v[1]*s2;
1064:       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
1065:       v -= 4;
1066:     }
1067:   }

1069:   /* copy t into x according to permutation */
1070:   ii = 0;
1071:   for (i=0; i<n; i++) {
1072:     ir      = 2*r[i];
1073:     x[ir]   = t[ii];
1074:     x[ir+1] = t[ii+1];
1075:     ii += 2;
1076:   }

1078:   ISRestoreIndices(isrow,&rout);
1079:   ISRestoreIndices(iscol,&cout);
1080:   VecRestoreArrayRead(bb,&b);
1081:   VecRestoreArray(xx,&x);
1082:   PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
1083:   return(0);
1084: }

1088: PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
1089: {
1090:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1091:   PetscErrorCode    ierr;
1092:   IS                iscol=a->col,isrow=a->row;
1093:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1094:   const PetscInt    *r,*c,*rout,*cout;
1095:   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1096:   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1097:   const MatScalar   *aa=a->a,*v;
1098:   PetscScalar       s1,s2,x1,x2,*x,*t;
1099:   const PetscScalar *b;

1102:   VecGetArrayRead(bb,&b);
1103:   VecGetArray(xx,&x);
1104:   t = a->solve_work;

1106:   ISGetIndices(isrow,&rout); r = rout;
1107:   ISGetIndices(iscol,&cout); c = cout;

1109:   /* copy b into temp work space according to permutation */
1110:   for(i=0;i<n;i++){
1111:     ii = bs*i; ic = bs*c[i];
1112:     t[ii] = b[ic]; t[ii+1] = b[ic+1];
1113:   }

1115:   /* forward solve the U^T */
1116:   idx = 0;
1117:   for (i=0; i<n; i++) {
1118:     v     = aa + bs2*diag[i];
1119:     /* multiply by the inverse of the block diagonal */
1120:     x1 = t[idx];   x2 = t[1+idx];
1121:     s1 = v[0]*x1  +  v[1]*x2;
1122:     s2 = v[2]*x1  +  v[3]*x2;
1123:     v -= bs2;

1125:     vi    = aj + diag[i] - 1;
1126:     nz    = diag[i] - diag[i+1] - 1;
1127:     for(j=0;j>-nz;j--){
1128:       oidx = bs*vi[j];
1129:       t[oidx]   -= v[0]*s1  +  v[1]*s2;
1130:       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
1131:       v  -= bs2;
1132:     }
1133:     t[idx]   = s1;t[1+idx] = s2;
1134:     idx += bs;
1135:   }
1136:   /* backward solve the L^T */
1137:   for (i=n-1; i>=0; i--){
1138:     v    = aa + bs2*ai[i];
1139:     vi   = aj + ai[i];
1140:     nz   = ai[i+1] - ai[i];
1141:     idt  = bs*i;
1142:     s1   = t[idt];  s2 = t[1+idt];
1143:     for(j=0;j<nz;j++){
1144:       idx   = bs*vi[j];
1145:       t[idx]   -=  v[0]*s1 +  v[1]*s2;
1146:       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
1147:       v += bs2;
1148:     }
1149:   }

1151:   /* copy t into x according to permutation */
1152:   for(i=0;i<n;i++){
1153:     ii = bs*i;  ir = bs*r[i];
1154:     x[ir] = t[ii];  x[ir+1] = t[ii+1];
1155:   }

1157:   ISRestoreIndices(isrow,&rout);
1158:   ISRestoreIndices(iscol,&cout);
1159:   VecRestoreArrayRead(bb,&b);
1160:   VecRestoreArray(xx,&x);
1161:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1162:   return(0);
1163: }

1167: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1168: {
1169:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1170:   IS                iscol=a->col,isrow=a->row;
1171:   PetscErrorCode    ierr;
1172:   const PetscInt    *r,*c,*rout,*cout;
1173:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1174:   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1175:   const MatScalar   *aa=a->a,*v;
1176:   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1177:   const PetscScalar *b;

1180:   VecGetArrayRead(bb,&b);
1181:   VecGetArray(xx,&x);
1182:   t  = a->solve_work;

1184:   ISGetIndices(isrow,&rout); r = rout;
1185:   ISGetIndices(iscol,&cout); c = cout;

1187:   /* copy the b into temp work space according to permutation */
1188:   ii = 0;
1189:   for (i=0; i<n; i++) {
1190:     ic      = 3*c[i];
1191:     t[ii]   = b[ic];
1192:     t[ii+1] = b[ic+1];
1193:     t[ii+2] = b[ic+2];
1194:     ii += 3;
1195:   }

1197:   /* forward solve the U^T */
1198:   idx = 0;
1199:   for (i=0; i<n; i++) {

1201:     v     = aa + 9*diag[i];
1202:     /* multiply by the inverse of the block diagonal */
1203:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1204:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1205:     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1206:     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1207:     v += 9;

1209:     vi    = aj + diag[i] + 1;
1210:     nz    = ai[i+1] - diag[i] - 1;
1211:     while (nz--) {
1212:       oidx = 3*(*vi++);
1213:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1214:       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1215:       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1216:       v  += 9;
1217:     }
1218:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
1219:     idx += 3;
1220:   }
1221:   /* backward solve the L^T */
1222:   for (i=n-1; i>=0; i--){
1223:     v    = aa + 9*diag[i] - 9;
1224:     vi   = aj + diag[i] - 1;
1225:     nz   = diag[i] - ai[i];
1226:     idt  = 3*i;
1227:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
1228:     while (nz--) {
1229:       idx   = 3*(*vi--);
1230:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
1231:       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
1232:       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1233:       v -= 9;
1234:     }
1235:   }

1237:   /* copy t into x according to permutation */
1238:   ii = 0;
1239:   for (i=0; i<n; i++) {
1240:     ir      = 3*r[i];
1241:     x[ir]   = t[ii];
1242:     x[ir+1] = t[ii+1];
1243:     x[ir+2] = t[ii+2];
1244:     ii += 3;
1245:   }

1247:   ISRestoreIndices(isrow,&rout);
1248:   ISRestoreIndices(iscol,&cout);
1249:   VecRestoreArrayRead(bb,&b);
1250:   VecRestoreArray(xx,&x);
1251:   PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
1252:   return(0);
1253: }

1257: PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1258: {
1259:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1260:   PetscErrorCode    ierr;
1261:   IS                iscol=a->col,isrow=a->row;
1262:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1263:   const PetscInt    *r,*c,*rout,*cout;
1264:   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1265:   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1266:   const MatScalar   *aa=a->a,*v;
1267:   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1268:   const PetscScalar *b;

1271:   VecGetArrayRead(bb,&b);
1272:   VecGetArray(xx,&x);
1273:   t = a->solve_work;

1275:   ISGetIndices(isrow,&rout); r = rout;
1276:   ISGetIndices(iscol,&cout); c = cout;

1278:   /* copy b into temp work space according to permutation */
1279:   for(i=0;i<n;i++){
1280:     ii = bs*i; ic = bs*c[i];
1281:     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2];
1282:   }

1284:   /* forward solve the U^T */
1285:   idx = 0;
1286:   for (i=0; i<n; i++) {
1287:     v     = aa + bs2*diag[i];
1288:     /* multiply by the inverse of the block diagonal */
1289:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1290:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1291:     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1292:     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1293:     v -= bs2;

1295:     vi    = aj + diag[i] - 1;
1296:     nz    = diag[i] - diag[i+1] - 1;
1297:     for(j=0;j>-nz;j--){
1298:       oidx = bs*vi[j];
1299:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1300:       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1301:       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1302:       v  -= bs2;
1303:     }
1304:     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;
1305:     idx += bs;
1306:   }
1307:   /* backward solve the L^T */
1308:   for (i=n-1; i>=0; i--){
1309:     v    = aa + bs2*ai[i];
1310:     vi   = aj + ai[i];
1311:     nz   = ai[i+1] - ai[i];
1312:     idt  = bs*i;
1313:     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];
1314:     for(j=0;j<nz;j++){
1315:       idx   = bs*vi[j];
1316:       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1317:       t[idx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1318:       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1319:       v += bs2;
1320:     }
1321:   }

1323:   /* copy t into x according to permutation */
1324:   for(i=0;i<n;i++){
1325:     ii = bs*i;  ir = bs*r[i];
1326:     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];
1327:   }

1329:   ISRestoreIndices(isrow,&rout);
1330:   ISRestoreIndices(iscol,&cout);
1331:   VecRestoreArrayRead(bb,&b);
1332:   VecRestoreArray(xx,&x);
1333:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1334:   return(0);
1335: }

1339: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
1340: {
1341:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1342:   IS                iscol=a->col,isrow=a->row;
1343:   PetscErrorCode    ierr;
1344:   const PetscInt    *r,*c,*rout,*cout;
1345:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1346:   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1347:   const MatScalar   *aa=a->a,*v;
1348:   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1349:   const PetscScalar *b;

1352:   VecGetArrayRead(bb,&b);
1353:   VecGetArray(xx,&x);
1354:   t  = a->solve_work;

1356:   ISGetIndices(isrow,&rout); r = rout;
1357:   ISGetIndices(iscol,&cout); c = cout;

1359:   /* copy the b into temp work space according to permutation */
1360:   ii = 0;
1361:   for (i=0; i<n; i++) {
1362:     ic      = 4*c[i];
1363:     t[ii]   = b[ic];
1364:     t[ii+1] = b[ic+1];
1365:     t[ii+2] = b[ic+2];
1366:     t[ii+3] = b[ic+3];
1367:     ii += 4;
1368:   }

1370:   /* forward solve the U^T */
1371:   idx = 0;
1372:   for (i=0; i<n; i++) {

1374:     v     = aa + 16*diag[i];
1375:     /* multiply by the inverse of the block diagonal */
1376:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
1377:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1378:     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1379:     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1380:     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1381:     v += 16;

1383:     vi    = aj + diag[i] + 1;
1384:     nz    = ai[i+1] - diag[i] - 1;
1385:     while (nz--) {
1386:       oidx = 4*(*vi++);
1387:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1388:       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1389:       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1390:       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1391:       v  += 16;
1392:     }
1393:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
1394:     idx += 4;
1395:   }
1396:   /* backward solve the L^T */
1397:   for (i=n-1; i>=0; i--){
1398:     v    = aa + 16*diag[i] - 16;
1399:     vi   = aj + diag[i] - 1;
1400:     nz   = diag[i] - ai[i];
1401:     idt  = 4*i;
1402:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
1403:     while (nz--) {
1404:       idx   = 4*(*vi--);
1405:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1406:       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1407:       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1408:       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1409:       v -= 16;
1410:     }
1411:   }

1413:   /* copy t into x according to permutation */
1414:   ii = 0;
1415:   for (i=0; i<n; i++) {
1416:     ir      = 4*r[i];
1417:     x[ir]   = t[ii];
1418:     x[ir+1] = t[ii+1];
1419:     x[ir+2] = t[ii+2];
1420:     x[ir+3] = t[ii+3];
1421:     ii += 4;
1422:   }

1424:   ISRestoreIndices(isrow,&rout);
1425:   ISRestoreIndices(iscol,&cout);
1426:   VecRestoreArrayRead(bb,&b);
1427:   VecRestoreArray(xx,&x);
1428:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
1429:   return(0);
1430: }

1434: PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1435: {
1436:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1437:   PetscErrorCode    ierr;
1438:   IS                iscol=a->col,isrow=a->row;
1439:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1440:   const PetscInt    *r,*c,*rout,*cout;
1441:   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1442:   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1443:   const MatScalar   *aa=a->a,*v;
1444:   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1445:   const PetscScalar *b;

1448:   VecGetArrayRead(bb,&b);
1449:   VecGetArray(xx,&x);
1450:   t = a->solve_work;

1452:   ISGetIndices(isrow,&rout); r = rout;
1453:   ISGetIndices(iscol,&cout); c = cout;

1455:   /* copy b into temp work space according to permutation */
1456:   for(i=0;i<n;i++){
1457:     ii = bs*i; ic = bs*c[i];
1458:     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1459:   }

1461:   /* forward solve the U^T */
1462:   idx = 0;
1463:   for (i=0; i<n; i++) {
1464:     v     = aa + bs2*diag[i];
1465:     /* multiply by the inverse of the block diagonal */
1466:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];  x4 = t[3+idx];
1467:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1468:     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1469:     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1470:     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1471:     v -= bs2;

1473:     vi    = aj + diag[i] - 1;
1474:     nz    = diag[i] - diag[i+1] - 1;
1475:     for(j=0;j>-nz;j--){
1476:       oidx = bs*vi[j];
1477:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1478:       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1479:       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1480:       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1481:       v  -= bs2;
1482:     }
1483:     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4;
1484:     idx += bs;
1485:   }
1486:   /* backward solve the L^T */
1487:   for (i=n-1; i>=0; i--){
1488:     v    = aa + bs2*ai[i];
1489:     vi   = aj + ai[i];
1490:     nz   = ai[i+1] - ai[i];
1491:     idt  = bs*i;
1492:     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt];
1493:     for(j=0;j<nz;j++){
1494:       idx   = bs*vi[j];
1495:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3  +  v[3]*s4;
1496:       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3  +  v[7]*s4;
1497:       t[idx+2] -=  v[8]*s1 +  v[9]*s2 +  v[10]*s3 + v[11]*s4;
1498:       t[idx+3] -= v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
1499:       v += bs2;
1500:     }
1501:   }

1503:   /* copy t into x according to permutation */
1504:   for(i=0;i<n;i++){
1505:     ii = bs*i;  ir = bs*r[i];
1506:     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1507:   }

1509:   ISRestoreIndices(isrow,&rout);
1510:   ISRestoreIndices(iscol,&cout);
1511:   VecRestoreArrayRead(bb,&b);
1512:   VecRestoreArray(xx,&x);
1513:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1514:   return(0);
1515: }

1519: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
1520: {
1521:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1522:   IS                iscol=a->col,isrow=a->row;
1523:   PetscErrorCode    ierr;
1524:   const PetscInt    *r,*c,*rout,*cout;
1525:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1526:   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1527:   const MatScalar   *aa=a->a,*v;
1528:   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1529:   const PetscScalar *b;

1532:   VecGetArrayRead(bb,&b);
1533:   VecGetArray(xx,&x);
1534:   t  = a->solve_work;

1536:   ISGetIndices(isrow,&rout); r = rout;
1537:   ISGetIndices(iscol,&cout); c = cout;

1539:   /* copy the b into temp work space according to permutation */
1540:   ii = 0;
1541:   for (i=0; i<n; i++) {
1542:     ic      = 5*c[i];
1543:     t[ii]   = b[ic];
1544:     t[ii+1] = b[ic+1];
1545:     t[ii+2] = b[ic+2];
1546:     t[ii+3] = b[ic+3];
1547:     t[ii+4] = b[ic+4];
1548:     ii += 5;
1549:   }

1551:   /* forward solve the U^T */
1552:   idx = 0;
1553:   for (i=0; i<n; i++) {

1555:     v     = aa + 25*diag[i];
1556:     /* multiply by the inverse of the block diagonal */
1557:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1558:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1559:     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1560:     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1561:     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1562:     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1563:     v += 25;

1565:     vi    = aj + diag[i] + 1;
1566:     nz    = ai[i+1] - diag[i] - 1;
1567:     while (nz--) {
1568:       oidx = 5*(*vi++);
1569:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1570:       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1571:       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1572:       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1573:       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1574:       v  += 25;
1575:     }
1576:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1577:     idx += 5;
1578:   }
1579:   /* backward solve the L^T */
1580:   for (i=n-1; i>=0; i--){
1581:     v    = aa + 25*diag[i] - 25;
1582:     vi   = aj + diag[i] - 1;
1583:     nz   = diag[i] - ai[i];
1584:     idt  = 5*i;
1585:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1586:     while (nz--) {
1587:       idx   = 5*(*vi--);
1588:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1589:       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1590:       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1591:       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1592:       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1593:       v -= 25;
1594:     }
1595:   }

1597:   /* copy t into x according to permutation */
1598:   ii = 0;
1599:   for (i=0; i<n; i++) {
1600:     ir      = 5*r[i];
1601:     x[ir]   = t[ii];
1602:     x[ir+1] = t[ii+1];
1603:     x[ir+2] = t[ii+2];
1604:     x[ir+3] = t[ii+3];
1605:     x[ir+4] = t[ii+4];
1606:     ii += 5;
1607:   }

1609:   ISRestoreIndices(isrow,&rout);
1610:   ISRestoreIndices(iscol,&cout);
1611:   VecRestoreArrayRead(bb,&b);
1612:   VecRestoreArray(xx,&x);
1613:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
1614:   return(0);
1615: }

1619: PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1620: {
1621:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1622:   PetscErrorCode    ierr;
1623:   IS                iscol=a->col,isrow=a->row;
1624:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1625:   const PetscInt    *r,*c,*rout,*cout;
1626:   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1627:   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1628:   const MatScalar   *aa=a->a,*v;
1629:   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1630:   const PetscScalar *b;

1633:   VecGetArrayRead(bb,&b);
1634:   VecGetArray(xx,&x);
1635:   t = a->solve_work;

1637:   ISGetIndices(isrow,&rout); r = rout;
1638:   ISGetIndices(iscol,&cout); c = cout;

1640:   /* copy b into temp work space according to permutation */
1641:   for(i=0;i<n;i++){
1642:     ii = bs*i; ic = bs*c[i];
1643:     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1644:     t[ii+4] = b[ic+4];
1645:   }

1647:   /* forward solve the U^T */
1648:   idx = 0;
1649:   for (i=0; i<n; i++) {
1650:     v     = aa + bs2*diag[i];
1651:     /* multiply by the inverse of the block diagonal */
1652:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1653:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1654:     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1655:     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1656:     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1657:     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1658:     v -= bs2;

1660:     vi    = aj + diag[i] - 1;
1661:     nz    = diag[i] - diag[i+1] - 1;
1662:     for(j=0;j>-nz;j--){
1663:       oidx = bs*vi[j];
1664:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1665:       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1666:       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1667:       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1668:       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1669:       v  -= bs2;
1670:     }
1671:     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1672:     idx += bs;
1673:   }
1674:   /* backward solve the L^T */
1675:   for (i=n-1; i>=0; i--){
1676:     v    = aa + bs2*ai[i];
1677:     vi   = aj + ai[i];
1678:     nz   = ai[i+1] - ai[i];
1679:     idt  = bs*i;
1680:     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1681:     for(j=0;j<nz;j++){
1682:       idx   = bs*vi[j];
1683:       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1684:       t[idx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1685:       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1686:       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1687:       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1688:       v += bs2;
1689:     }
1690:   }

1692:   /* copy t into x according to permutation */
1693:   for(i=0;i<n;i++){
1694:     ii = bs*i;  ir = bs*r[i];
1695:     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1696:     x[ir+4] = t[ii+4];
1697:   }

1699:   ISRestoreIndices(isrow,&rout);
1700:   ISRestoreIndices(iscol,&cout);
1701:   VecRestoreArrayRead(bb,&b);
1702:   VecRestoreArray(xx,&x);
1703:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1704:   return(0);
1705: }

1709: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
1710: {
1711:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1712:   IS                iscol=a->col,isrow=a->row;
1713:   PetscErrorCode    ierr;
1714:   const PetscInt    *r,*c,*rout,*cout;
1715:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1716:   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1717:   const MatScalar   *aa=a->a,*v;
1718:   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1719:   const PetscScalar *b;

1722:   VecGetArrayRead(bb,&b);
1723:   VecGetArray(xx,&x);
1724:   t  = a->solve_work;

1726:   ISGetIndices(isrow,&rout); r = rout;
1727:   ISGetIndices(iscol,&cout); c = cout;

1729:   /* copy the b into temp work space according to permutation */
1730:   ii = 0;
1731:   for (i=0; i<n; i++) {
1732:     ic      = 6*c[i];
1733:     t[ii]   = b[ic];
1734:     t[ii+1] = b[ic+1];
1735:     t[ii+2] = b[ic+2];
1736:     t[ii+3] = b[ic+3];
1737:     t[ii+4] = b[ic+4];
1738:     t[ii+5] = b[ic+5];
1739:     ii += 6;
1740:   }

1742:   /* forward solve the U^T */
1743:   idx = 0;
1744:   for (i=0; i<n; i++) {

1746:     v     = aa + 36*diag[i];
1747:     /* multiply by the inverse of the block diagonal */
1748:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1749:     x6    = t[5+idx];
1750:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1751:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1752:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1753:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1754:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1755:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1756:     v += 36;

1758:     vi    = aj + diag[i] + 1;
1759:     nz    = ai[i+1] - diag[i] - 1;
1760:     while (nz--) {
1761:       oidx = 6*(*vi++);
1762:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1763:       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1764:       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1765:       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1766:       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1767:       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1768:       v  += 36;
1769:     }
1770:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1771:     t[5+idx] = s6;
1772:     idx += 6;
1773:   }
1774:   /* backward solve the L^T */
1775:   for (i=n-1; i>=0; i--){
1776:     v    = aa + 36*diag[i] - 36;
1777:     vi   = aj + diag[i] - 1;
1778:     nz   = diag[i] - ai[i];
1779:     idt  = 6*i;
1780:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1781:     s6 = t[5+idt];
1782:     while (nz--) {
1783:       idx   = 6*(*vi--);
1784:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1785:       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1786:       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1787:       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1788:       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1789:       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1790:       v -= 36;
1791:     }
1792:   }

1794:   /* copy t into x according to permutation */
1795:   ii = 0;
1796:   for (i=0; i<n; i++) {
1797:     ir      = 6*r[i];
1798:     x[ir]   = t[ii];
1799:     x[ir+1] = t[ii+1];
1800:     x[ir+2] = t[ii+2];
1801:     x[ir+3] = t[ii+3];
1802:     x[ir+4] = t[ii+4];
1803:     x[ir+5] = t[ii+5];
1804:     ii += 6;
1805:   }

1807:   ISRestoreIndices(isrow,&rout);
1808:   ISRestoreIndices(iscol,&cout);
1809:   VecRestoreArrayRead(bb,&b);
1810:   VecRestoreArray(xx,&x);
1811:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
1812:   return(0);
1813: }

1817: PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1818: {
1819:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1820:   PetscErrorCode    ierr;
1821:   IS                iscol=a->col,isrow=a->row;
1822:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1823:   const PetscInt    *r,*c,*rout,*cout;
1824:   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1825:   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1826:   const MatScalar   *aa=a->a,*v;
1827:   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1828:   const PetscScalar *b;

1831:   VecGetArrayRead(bb,&b);
1832:   VecGetArray(xx,&x);
1833:   t = a->solve_work;

1835:   ISGetIndices(isrow,&rout); r = rout;
1836:   ISGetIndices(iscol,&cout); c = cout;

1838:   /* copy b into temp work space according to permutation */
1839:   for(i=0;i<n;i++){
1840:     ii = bs*i; ic = bs*c[i];
1841:     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1842:     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];
1843:   }

1845:   /* forward solve the U^T */
1846:   idx = 0;
1847:   for (i=0; i<n; i++) {
1848:     v     = aa + bs2*diag[i];
1849:     /* multiply by the inverse of the block diagonal */
1850:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1851:     x6    = t[5+idx];
1852:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1853:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1854:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1855:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1856:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1857:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1858:     v -= bs2;

1860:     vi    = aj + diag[i] - 1;
1861:     nz    = diag[i] - diag[i+1] - 1;
1862:     for(j=0;j>-nz;j--){
1863:       oidx = bs*vi[j];
1864:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1865:       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1866:       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1867:       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1868:       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1869:       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1870:       v  -= bs2;
1871:     }
1872:     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1873:     t[5+idx] = s6;
1874:     idx += bs;
1875:   }
1876:   /* backward solve the L^T */
1877:   for (i=n-1; i>=0; i--){
1878:     v    = aa + bs2*ai[i];
1879:     vi   = aj + ai[i];
1880:     nz   = ai[i+1] - ai[i];
1881:     idt  = bs*i;
1882:     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1883:     s6   = t[5+idt];
1884:    for(j=0;j<nz;j++){
1885:       idx   = bs*vi[j];
1886:       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1887:       t[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1888:       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1889:       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1890:       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1891:       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1892:       v += bs2;
1893:     }
1894:   }

1896:   /* copy t into x according to permutation */
1897:   for(i=0;i<n;i++){
1898:     ii = bs*i;  ir = bs*r[i];
1899:     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1900:     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];
1901:   }

1903:   ISRestoreIndices(isrow,&rout);
1904:   ISRestoreIndices(iscol,&cout);
1905:   VecRestoreArrayRead(bb,&b);
1906:   VecRestoreArray(xx,&x);
1907:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1908:   return(0);
1909: }

1913: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
1914: {
1915:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1916:   IS                iscol=a->col,isrow=a->row;
1917:   PetscErrorCode    ierr;
1918:   const PetscInt    *r,*c,*rout,*cout;
1919:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1920:   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1921:   const MatScalar   *aa=a->a,*v;
1922:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1923:   const PetscScalar *b;

1926:   VecGetArrayRead(bb,&b);
1927:   VecGetArray(xx,&x);
1928:   t  = a->solve_work;

1930:   ISGetIndices(isrow,&rout); r = rout;
1931:   ISGetIndices(iscol,&cout); c = cout;

1933:   /* copy the b into temp work space according to permutation */
1934:   ii = 0;
1935:   for (i=0; i<n; i++) {
1936:     ic      = 7*c[i];
1937:     t[ii]   = b[ic];
1938:     t[ii+1] = b[ic+1];
1939:     t[ii+2] = b[ic+2];
1940:     t[ii+3] = b[ic+3];
1941:     t[ii+4] = b[ic+4];
1942:     t[ii+5] = b[ic+5];
1943:     t[ii+6] = b[ic+6];
1944:     ii += 7;
1945:   }

1947:   /* forward solve the U^T */
1948:   idx = 0;
1949:   for (i=0; i<n; i++) {

1951:     v     = aa + 49*diag[i];
1952:     /* multiply by the inverse of the block diagonal */
1953:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1954:     x6    = t[5+idx]; x7 = t[6+idx];
1955:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1956:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1957:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1958:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1959:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1960:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1961:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1962:     v += 49;

1964:     vi    = aj + diag[i] + 1;
1965:     nz    = ai[i+1] - diag[i] - 1;
1966:     while (nz--) {
1967:       oidx = 7*(*vi++);
1968:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1969:       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1970:       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1971:       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1972:       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1973:       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1974:       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1975:       v  += 49;
1976:     }
1977:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1978:     t[5+idx] = s6;t[6+idx] = s7;
1979:     idx += 7;
1980:   }
1981:   /* backward solve the L^T */
1982:   for (i=n-1; i>=0; i--){
1983:     v    = aa + 49*diag[i] - 49;
1984:     vi   = aj + diag[i] - 1;
1985:     nz   = diag[i] - ai[i];
1986:     idt  = 7*i;
1987:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1988:     s6 = t[5+idt];s7 = t[6+idt];
1989:     while (nz--) {
1990:       idx   = 7*(*vi--);
1991:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1992:       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1993:       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1994:       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1995:       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1996:       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1997:       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1998:       v -= 49;
1999:     }
2000:   }

2002:   /* copy t into x according to permutation */
2003:   ii = 0;
2004:   for (i=0; i<n; i++) {
2005:     ir      = 7*r[i];
2006:     x[ir]   = t[ii];
2007:     x[ir+1] = t[ii+1];
2008:     x[ir+2] = t[ii+2];
2009:     x[ir+3] = t[ii+3];
2010:     x[ir+4] = t[ii+4];
2011:     x[ir+5] = t[ii+5];
2012:     x[ir+6] = t[ii+6];
2013:     ii += 7;
2014:   }

2016:   ISRestoreIndices(isrow,&rout);
2017:   ISRestoreIndices(iscol,&cout);
2018:   VecRestoreArrayRead(bb,&b);
2019:   VecRestoreArray(xx,&x);
2020:   PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
2021:   return(0);
2022: }
2025: PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
2026: {
2027:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2028:   PetscErrorCode    ierr;
2029:   IS                iscol=a->col,isrow=a->row;
2030:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
2031:   const PetscInt    *r,*c,*rout,*cout;
2032:   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
2033:   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2034:   const MatScalar   *aa=a->a,*v;
2035:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2036:   const PetscScalar *b;

2039:   VecGetArrayRead(bb,&b);
2040:   VecGetArray(xx,&x);
2041:   t = a->solve_work;

2043:   ISGetIndices(isrow,&rout); r = rout;
2044:   ISGetIndices(iscol,&cout); c = cout;

2046:   /* copy b into temp work space according to permutation */
2047:   for(i=0;i<n;i++){
2048:     ii = bs*i; ic = bs*c[i];
2049:     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
2050:     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];  t[ii+6] = b[ic+6];
2051:   }

2053:   /* forward solve the U^T */
2054:   idx = 0;
2055:   for (i=0; i<n; i++) {
2056:     v     = aa + bs2*diag[i];
2057:     /* multiply by the inverse of the block diagonal */
2058:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2059:     x6    = t[5+idx]; x7 = t[6+idx];
2060:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
2061:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
2062:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
2063:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
2064:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
2065:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
2066:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
2067:     v -= bs2;

2069:     vi    = aj + diag[i] - 1;
2070:     nz    = diag[i] - diag[i+1] - 1;
2071:     for(j=0;j>-nz;j--){
2072:       oidx = bs*vi[j];
2073:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2074:       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2075:       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2076:       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2077:       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2078:       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2079:       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2080:       v  -= bs2;
2081:     }
2082:     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
2083:     t[5+idx] = s6;  t[6+idx] = s7;
2084:     idx += bs;
2085:   }
2086:   /* backward solve the L^T */
2087:   for (i=n-1; i>=0; i--){
2088:     v    = aa + bs2*ai[i];
2089:     vi   = aj + ai[i];
2090:     nz   = ai[i+1] - ai[i];
2091:     idt  = bs*i;
2092:     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
2093:     s6   = t[5+idt];  s7 = t[6+idt];
2094:    for(j=0;j<nz;j++){
2095:       idx   = bs*vi[j];
2096:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2097:       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2098:       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2099:       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2100:       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2101:       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2102:       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2103:       v += bs2;
2104:     }
2105:   }

2107:   /* copy t into x according to permutation */
2108:   for(i=0;i<n;i++){
2109:     ii = bs*i;  ir = bs*r[i];
2110:     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
2111:     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];  x[ir+6] = t[ii+6];
2112:   }

2114:   ISRestoreIndices(isrow,&rout);
2115:   ISRestoreIndices(iscol,&cout);
2116:   VecRestoreArrayRead(bb,&b);
2117:   VecRestoreArray(xx,&x);
2118:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
2119:   return(0);
2120: }

2122: /* ----------------------------------------------------------- */
2125: PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2126: {
2127:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2128:   IS                iscol=a->col,isrow=a->row;
2129:   PetscErrorCode    ierr;
2130:   const PetscInt    *r,*c,*rout,*cout;
2131:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi;
2132:   PetscInt          i,nz;
2133:   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2134:   const MatScalar   *aa=a->a,*v;
2135:   PetscScalar       *x,*s,*t,*ls;
2136:   const PetscScalar *b;

2139:   VecGetArrayRead(bb,&b);
2140:   VecGetArray(xx,&x);
2141:   t  = a->solve_work;

2143:   ISGetIndices(isrow,&rout); r = rout;
2144:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2146:   /* forward solve the lower triangular */
2147:   PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));
2148:   for (i=1; i<n; i++) {
2149:     v   = aa + bs2*ai[i];
2150:     vi  = aj + ai[i];
2151:     nz  = a->diag[i] - ai[i];
2152:     s = t + bs*i;
2153:     PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));
2154:     while (nz--) {
2155:       PetscKernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
2156:       v += bs2;
2157:     }
2158:   }
2159:   /* backward solve the upper triangular */
2160:   ls = a->solve_work + A->cmap->n;
2161:   for (i=n-1; i>=0; i--){
2162:     v   = aa + bs2*(a->diag[i] + 1);
2163:     vi  = aj + a->diag[i] + 1;
2164:     nz  = ai[i+1] - a->diag[i] - 1;
2165:     PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
2166:     while (nz--) {
2167:       PetscKernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
2168:       v += bs2;
2169:     }
2170:     PetscKernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2171:     PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));
2172:   }

2174:   ISRestoreIndices(isrow,&rout);
2175:   ISRestoreIndices(iscol,&cout);
2176:   VecRestoreArrayRead(bb,&b);
2177:   VecRestoreArray(xx,&x);
2178:   PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
2179:   return(0);
2180: }

2182: /* ----------------------------------------------------------- */
2185: PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2186: {
2187:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2188:   IS                iscol=a->col,isrow=a->row;
2189:   PetscErrorCode    ierr;
2190:   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
2191:   PetscInt          i,nz,j;
2192:   const PetscInt    n=a->mbs,bs=A->rmap->bs,bs2=a->bs2;
2193:   const MatScalar   *aa=a->a,*v;
2194:   PetscScalar       *x,*t,*ls;
2195:   const PetscScalar *b;
2197:   VecGetArrayRead(bb,&b);
2198:   VecGetArray(xx,&x);
2199:   t    = a->solve_work;

2201:   ISGetIndices(isrow,&rout); r = rout;
2202:   ISGetIndices(iscol,&cout); c = cout;

2204:   /* copy the b into temp work space according to permutation */
2205:   for (i=0; i<n; i++) {
2206:     for (j=0; j<bs; j++) {
2207:       t[i*bs+j] = b[c[i]*bs+j];
2208:     }
2209:   }


2212:   /* forward solve the upper triangular transpose */
2213:   ls = a->solve_work + A->cmap->n;
2214:   for (i=0; i<n; i++){
2215:     PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
2216:     PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2217:     v   = aa + bs2*(a->diag[i] + 1);
2218:     vi  = aj + a->diag[i] + 1;
2219:     nz  = ai[i+1] - a->diag[i] - 1;
2220:     while (nz--) {
2221:       PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2222:       v += bs2;
2223:     }
2224:   }

2226:   /* backward solve the lower triangular transpose */
2227:   for (i=n-1; i>=0; i--) {
2228:     v   = aa + bs2*ai[i];
2229:     vi  = aj + ai[i];
2230:     nz  = a->diag[i] - ai[i];
2231:     while (nz--) {
2232:       PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2233:       v += bs2;
2234:     }
2235:   }

2237:   /* copy t into x according to permutation */
2238:   for (i=0; i<n; i++) {
2239:     for (j=0; j<bs; j++) {
2240:       x[bs*r[i]+j]   = t[bs*i+j];
2241:     }
2242:   }

2244:   ISRestoreIndices(isrow,&rout);
2245:   ISRestoreIndices(iscol,&cout);
2246:   VecRestoreArrayRead(bb,&b);
2247:   VecRestoreArray(xx,&x);
2248:   PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
2249:   return(0);
2250: }

2254: PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
2255: {
2256:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2257:   IS                iscol=a->col,isrow=a->row;
2258:   PetscErrorCode    ierr;
2259:   const PetscInt    *r,*c,*rout,*cout;
2260:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag;
2261:   PetscInt          i,j,nz;
2262:   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2263:   const MatScalar   *aa=a->a,*v;
2264:   PetscScalar       *x,*t,*ls;
2265:   const PetscScalar *b;

2268:   VecGetArrayRead(bb,&b);
2269:   VecGetArray(xx,&x);
2270:   t    = a->solve_work;

2272:   ISGetIndices(isrow,&rout); r = rout;
2273:   ISGetIndices(iscol,&cout); c = cout;

2275:   /* copy the b into temp work space according to permutation */
2276:   for (i=0; i<n; i++) {
2277:     for (j=0; j<bs; j++) {
2278:       t[i*bs+j] = b[c[i]*bs+j];
2279:     }
2280:   }


2283:   /* forward solve the upper triangular transpose */
2284:   ls = a->solve_work + A->cmap->n;
2285:   for (i=0; i<n; i++){
2286:     PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
2287:     PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs);
2288:     v   = aa + bs2*(diag[i] - 1);
2289:     vi  = aj + diag[i] - 1;
2290:     nz  = diag[i] - diag[i+1] - 1;
2291:     for(j=0;j>-nz;j--){
2292:       PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2293:       v -= bs2;
2294:     }
2295:   }

2297:   /* backward solve the lower triangular transpose */
2298:   for (i=n-1; i>=0; i--) {
2299:     v   = aa + bs2*ai[i];
2300:     vi  = aj + ai[i];
2301:     nz  = ai[i+1] - ai[i];
2302:     for(j=0;j<nz;j++){
2303:       PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2304:       v += bs2;
2305:     }
2306:   }

2308:   /* copy t into x according to permutation */
2309:   for (i=0; i<n; i++) {
2310:     for (j=0; j<bs; j++) {
2311:       x[bs*r[i]+j]   = t[bs*i+j];
2312:     }
2313:   }

2315:   ISRestoreIndices(isrow,&rout);
2316:   ISRestoreIndices(iscol,&cout);
2317:   VecRestoreArrayRead(bb,&b);
2318:   VecRestoreArray(xx,&x);
2319:   PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
2320:   return(0);
2321: }

2323: /* bs = 15 for PFLOTRAN. Block operations are done by accessing all the columns   of the block at once */

2327: PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx)
2328: {
2329:   Mat_SeqBAIJ      *a=(Mat_SeqBAIJ *)A->data;
2330:   PetscErrorCode    ierr;
2331:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
2332:   PetscInt          i,nz,idx,idt,m;
2333:   const MatScalar   *aa=a->a,*v;
2334:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15;
2335:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
2336:   PetscScalar       *x;
2337:   const PetscScalar *b;

2340:   VecGetArrayRead(bb,&b);
2341:   VecGetArray(xx,&x);

2343:   /* forward solve the lower triangular */
2344:   idx    = 0;
2345:   x[0]  = b[idx];    x[1]  = b[1+idx];  x[2]  = b[2+idx];  x[3]  = b[3+idx];  x[4]  = b[4+idx];
2346:   x[5]  = b[5+idx];  x[6]  = b[6+idx];  x[7]  = b[7+idx];  x[8]  = b[8+idx];  x[9]  = b[9+idx];
2347:   x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx];

2349:   for (i=1; i<n; i++) {
2350:     v     = aa + bs2*ai[i];
2351:     vi    = aj + ai[i];
2352:     nz    = ai[i+1] - ai[i];
2353:     idt   = bs*i;
2354:     s1   = b[idt];    s2  = b[1+idt];  s3  = b[2+idt];  s4  = b[3+idt];  s5  = b[4+idt];
2355:     s6   = b[5+idt];  s7  = b[6+idt];  s8  = b[7+idt];  s9  = b[8+idt];  s10 = b[9+idt];
2356:     s11  = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt];
2357:     for(m=0;m<nz;m++){
2358:       idx   = bs*vi[m];
2359:       x1   = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
2360:       x6   = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
2361:       x11  = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];

2363: 
2364:       s1 -=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2365:       s2 -=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2366:       s3 -=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2367:       s4 -=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2368:       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2369:       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2370:       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2371:       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2372:       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2373:       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2374:       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2375:       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2376:       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2377:       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2378:       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
2379: 
2380:       v += bs2;
2381:     }
2382:     x[idt]    = s1;  x[1+idt]  = s2;  x[2+idt]  = s3;  x[3+idt]  = s4;  x[4+idt]  = s5;
2383:     x[5+idt]  = s6;  x[6+idt]  = s7;  x[7+idt]  = s8;  x[8+idt]  = s9;  x[9+idt]  = s10;
2384:     x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15;
2385: 
2386:   }
2387:   /* backward solve the upper triangular */
2388:   for (i=n-1; i>=0; i--){
2389:     v    = aa + bs2*(adiag[i+1]+1);
2390:     vi   = aj + adiag[i+1]+1;
2391:     nz   = adiag[i] - adiag[i+1] - 1;
2392:     idt  = bs*i;
2393:     s1   = x[idt];     s2  = x[1+idt];  s3  = x[2+idt];  s4  = x[3+idt];  s5  = x[4+idt];
2394:     s6   = x[5+idt];   s7  = x[6+idt];  s8  = x[7+idt];  s9  = x[8+idt];  s10 = x[9+idt];
2395:     s11  = x[10+idt]; s12  = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt];
2396: 
2397:     for(m=0;m<nz;m++){
2398:       idx   = bs*vi[m];
2399:       x1   = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
2400:       x6   = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
2401:       x11  = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];

2403:       s1  -= v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2404:       s2  -= v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2405:       s3  -= v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2406:       s4  -= v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2407:       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2408:       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2409:       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2410:       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2411:       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2412:       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2413:       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2414:       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2415:       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2416:       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2417:       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;

2419:       v += bs2;
2420:     }

2422:     x[idt] = v[0]*s1  + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7  + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15;
2423:     x[1+idt] = v[1]*s1  + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7  + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15;
2424:     x[2+idt] = v[2]*s1  + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7  + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15;
2425:     x[3+idt] = v[3]*s1  + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7  + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15;
2426:     x[4+idt] = v[4]*s1  + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7  + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15;
2427:     x[5+idt] = v[5]*s1  + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7  + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15;
2428:     x[6+idt] = v[6]*s1  + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7  + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15;
2429:     x[7+idt] = v[7]*s1  + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7  + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15;
2430:     x[8+idt] = v[8]*s1  + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7  + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15;
2431:     x[9+idt] = v[9]*s1  + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7  + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15;
2432:     x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15;
2433:     x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15;
2434:     x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15;
2435:     x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15;
2436:     x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15;

2438:   }

2440:   VecRestoreArrayRead(bb,&b);
2441:   VecRestoreArray(xx,&x);
2442:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
2443:   return(0);
2444: }

2446: /* bs = 15 for PFLOTRAN. Block operations are done by accessing one column at a time */
2447: /* Default MatSolve for block size 15 */

2451: PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx)
2452: {
2453:   Mat_SeqBAIJ      *a=(Mat_SeqBAIJ *)A->data;
2454:   PetscErrorCode    ierr;
2455:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
2456:   PetscInt          i,k,nz,idx,idt,m;
2457:   const MatScalar   *aa=a->a,*v;
2458:   PetscScalar       s[15];
2459:   PetscScalar       *x,xv;
2460:   const PetscScalar *b;

2463:   VecGetArrayRead(bb,&b);
2464:   VecGetArray(xx,&x);

2466:   /* forward solve the lower triangular */
2467:   for (i=0; i<n; i++) {
2468:     v     = aa + bs2*ai[i];
2469:     vi    = aj + ai[i];
2470:     nz    = ai[i+1] - ai[i];
2471:     idt   = bs*i;
2472:     x[idt]   = b[idt];    x[1+idt]  = b[1+idt];  x[2+idt]  = b[2+idt];  x[3+idt]  = b[3+idt];  x[4+idt]  = b[4+idt];
2473:     x[5+idt]   = b[5+idt];  x[6+idt]  = b[6+idt];  x[7+idt]  = b[7+idt];  x[8+idt]  = b[8+idt];  x[9+idt] = b[9+idt];
2474:     x[10+idt]  = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; x[14+idt] = b[14+idt];
2475:     for(m=0;m<nz;m++){
2476:       idx   = bs*vi[m];
2477:       for(k=0;k<15;k++){
2478:         xv        = x[k + idx];
2479:         x[idt]    -= v[0]*xv;
2480:         x[1+idt]  -= v[1]*xv;
2481:         x[2+idt]  -= v[2]*xv;
2482:         x[3+idt]  -= v[3]*xv;
2483:         x[4+idt]  -= v[4]*xv;
2484:         x[5+idt]  -= v[5]*xv;
2485:         x[6+idt]  -= v[6]*xv;
2486:         x[7+idt]  -= v[7]*xv;
2487:         x[8+idt]  -= v[8]*xv;
2488:         x[9+idt]  -= v[9]*xv;
2489:         x[10+idt] -= v[10]*xv;
2490:         x[11+idt] -= v[11]*xv;
2491:         x[12+idt] -= v[12]*xv;
2492:         x[13+idt] -= v[13]*xv;
2493:         x[14+idt] -= v[14]*xv;
2494:         v += 15;
2495:       }
2496:     }
2497:   }
2498:   /* backward solve the upper triangular */
2499:   for (i=n-1; i>=0; i--){
2500:     v    = aa + bs2*(adiag[i+1]+1);
2501:     vi   = aj + adiag[i+1]+1;
2502:     nz   = adiag[i] - adiag[i+1] - 1;
2503:     idt  = bs*i;
2504:     s[0]   = x[idt];    s[1]  = x[1+idt];  s[2]  = x[2+idt];  s[3]  = x[3+idt];  s[4]  = x[4+idt];
2505:     s[5]   = x[5+idt];  s[6]  = x[6+idt];  s[7]  = x[7+idt];  s[8]  = x[8+idt];  s[9]  = x[9+idt];
2506:     s[10]  = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt];
2507: 
2508:     for(m=0;m<nz;m++){
2509:       idx   = bs*vi[m];
2510:       for(k=0;k<15;k++){
2511:         xv = x[k + idx];
2512:         s[0]  -= v[0]*xv;
2513:         s[1]  -= v[1]*xv;
2514:         s[2]  -= v[2]*xv;
2515:         s[3]  -= v[3]*xv;
2516:         s[4]  -= v[4]*xv;
2517:         s[5]  -= v[5]*xv;
2518:         s[6]  -= v[6]*xv;
2519:         s[7]  -= v[7]*xv;
2520:         s[8]  -= v[8]*xv;
2521:         s[9]  -= v[9]*xv;
2522:         s[10] -= v[10]*xv;
2523:         s[11] -= v[11]*xv;
2524:         s[12] -= v[12]*xv;
2525:         s[13] -= v[13]*xv;
2526:         s[14] -= v[14]*xv;
2527:         v += 15;
2528:       }
2529:     }
2530:     PetscMemzero(x+idt,bs*sizeof(MatScalar));
2531:     for(k=0;k<15;k++){
2532:       x[idt]    += v[0]*s[k];
2533:       x[1+idt]  += v[1]*s[k];
2534:       x[2+idt]  += v[2]*s[k];
2535:       x[3+idt]  += v[3]*s[k];
2536:       x[4+idt]  += v[4]*s[k];
2537:       x[5+idt]  += v[5]*s[k];
2538:       x[6+idt]  += v[6]*s[k];
2539:       x[7+idt]  += v[7]*s[k];
2540:       x[8+idt]  += v[8]*s[k];
2541:       x[9+idt]  += v[9]*s[k];
2542:       x[10+idt] += v[10]*s[k];
2543:       x[11+idt] += v[11]*s[k];
2544:       x[12+idt] += v[12]*s[k];
2545:       x[13+idt] += v[13]*s[k];
2546:       x[14+idt] += v[14]*s[k];
2547:       v += 15;
2548:     }
2549:   }
2550:   VecRestoreArrayRead(bb,&b);
2551:   VecRestoreArray(xx,&x);
2552:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
2553:   return(0);
2554: }


2559: PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
2560: {
2561:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2562:   IS                iscol=a->col,isrow=a->row;
2563:   PetscErrorCode    ierr;
2564:   const PetscInt    *r,*c,*ai=a->i,*aj=a->j;
2565:   const PetscInt    *rout,*cout,*diag = a->diag,*vi,n=a->mbs;
2566:   PetscInt          i,nz,idx,idt,idc;
2567:   const MatScalar   *aa=a->a,*v;
2568:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2569:   const PetscScalar *b;

2572:   VecGetArrayRead(bb,&b);
2573:   VecGetArray(xx,&x);
2574:   t  = a->solve_work;

2576:   ISGetIndices(isrow,&rout); r = rout;
2577:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2579:   /* forward solve the lower triangular */
2580:   idx    = 7*(*r++);
2581:   t[0] = b[idx];   t[1] = b[1+idx];
2582:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2583:   t[5] = b[5+idx]; t[6] = b[6+idx];

2585:   for (i=1; i<n; i++) {
2586:     v     = aa + 49*ai[i];
2587:     vi    = aj + ai[i];
2588:     nz    = diag[i] - ai[i];
2589:     idx   = 7*(*r++);
2590:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2591:     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2592:     while (nz--) {
2593:       idx   = 7*(*vi++);
2594:       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2595:       x4    = t[3+idx];x5 = t[4+idx];
2596:       x6    = t[5+idx];x7 = t[6+idx];
2597:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2598:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2599:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2600:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2601:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2602:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2603:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2604:       v += 49;
2605:     }
2606:     idx = 7*i;
2607:     t[idx]   = s1;t[1+idx] = s2;
2608:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2609:     t[5+idx] = s6;t[6+idx] = s7;
2610:   }
2611:   /* backward solve the upper triangular */
2612:   for (i=n-1; i>=0; i--){
2613:     v    = aa + 49*diag[i] + 49;
2614:     vi   = aj + diag[i] + 1;
2615:     nz   = ai[i+1] - diag[i] - 1;
2616:     idt  = 7*i;
2617:     s1 = t[idt];  s2 = t[1+idt];
2618:     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2619:     s6 = t[5+idt];s7 = t[6+idt];
2620:     while (nz--) {
2621:       idx   = 7*(*vi++);
2622:       x1    = t[idx];   x2 = t[1+idx];
2623:       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2624:       x6    = t[5+idx]; x7 = t[6+idx];
2625:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2626:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2627:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2628:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2629:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2630:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2631:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2632:       v += 49;
2633:     }
2634:     idc = 7*(*c--);
2635:     v   = aa + 49*diag[i];
2636:     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2637:                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2638:     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2639:                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2640:     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2641:                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2642:     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2643:                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2644:     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2645:                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2646:     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2647:                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2648:     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2649:                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2650:   }

2652:   ISRestoreIndices(isrow,&rout);
2653:   ISRestoreIndices(iscol,&cout);
2654:   VecRestoreArrayRead(bb,&b);
2655:   VecRestoreArray(xx,&x);
2656:   PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
2657:   return(0);
2658: }

2662: PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
2663: {
2664:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2665:   IS                iscol=a->col,isrow=a->row;
2666:   PetscErrorCode    ierr;
2667:   const PetscInt    *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag;
2668:   const PetscInt    n=a->mbs,*rout,*cout,*vi;
2669:   PetscInt          i,nz,idx,idt,idc,m;
2670:   const MatScalar   *aa=a->a,*v;
2671:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2672:   const PetscScalar *b;

2675:   VecGetArrayRead(bb,&b);
2676:   VecGetArray(xx,&x);
2677:   t  = a->solve_work;

2679:   ISGetIndices(isrow,&rout); r = rout;
2680:   ISGetIndices(iscol,&cout); c = cout;

2682:   /* forward solve the lower triangular */
2683:   idx    = 7*r[0];
2684:   t[0] = b[idx];   t[1] = b[1+idx];
2685:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2686:   t[5] = b[5+idx]; t[6] = b[6+idx];

2688:   for (i=1; i<n; i++) {
2689:     v     = aa + 49*ai[i];
2690:     vi    = aj + ai[i];
2691:     nz    = ai[i+1] - ai[i];
2692:     idx   = 7*r[i];
2693:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2694:     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2695:     for(m=0;m<nz;m++){
2696:       idx   = 7*vi[m];
2697:       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2698:       x4    = t[3+idx];x5 = t[4+idx];
2699:       x6    = t[5+idx];x7 = t[6+idx];
2700:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2701:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2702:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2703:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2704:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2705:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2706:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2707:       v += 49;
2708:     }
2709:     idx = 7*i;
2710:     t[idx]   = s1;t[1+idx] = s2;
2711:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2712:     t[5+idx] = s6;t[6+idx] = s7;
2713:   }
2714:   /* backward solve the upper triangular */
2715:   for (i=n-1; i>=0; i--){
2716:     v    = aa + 49*(adiag[i+1]+1);
2717:     vi   = aj + adiag[i+1]+1;
2718:     nz   = adiag[i] - adiag[i+1] - 1;
2719:     idt  = 7*i;
2720:     s1 = t[idt];  s2 = t[1+idt];
2721:     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2722:     s6 = t[5+idt];s7 = t[6+idt];
2723:     for(m=0;m<nz;m++){
2724:       idx   = 7*vi[m];
2725:       x1    = t[idx];   x2 = t[1+idx];
2726:       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2727:       x6    = t[5+idx]; x7 = t[6+idx];
2728:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2729:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2730:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2731:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2732:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2733:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2734:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2735:       v += 49;
2736:     }
2737:     idc = 7*c[i];
2738:     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2739:                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2740:     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2741:                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2742:     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2743:                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2744:     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2745:                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2746:     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2747:                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2748:     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2749:                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2750:     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2751:                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2752:   }

2754:   ISRestoreIndices(isrow,&rout);
2755:   ISRestoreIndices(iscol,&cout);
2756:   VecRestoreArrayRead(bb,&b);
2757:   VecRestoreArray(xx,&x);
2758:   PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
2759:   return(0);
2760: }

2764: PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2765: {
2766:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2767:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2768:   PetscErrorCode    ierr;
2769:   PetscInt          i,nz,idx,idt,jdx;
2770:   const MatScalar   *aa=a->a,*v;
2771:   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2772:   const PetscScalar *b;

2775:   VecGetArrayRead(bb,&b);
2776:   VecGetArray(xx,&x);
2777:   /* forward solve the lower triangular */
2778:   idx    = 0;
2779:   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
2780:   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
2781:   x[6] = b[6+idx];
2782:   for (i=1; i<n; i++) {
2783:     v     =  aa + 49*ai[i];
2784:     vi    =  aj + ai[i];
2785:     nz    =  diag[i] - ai[i];
2786:     idx   =  7*i;
2787:     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2788:     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2789:     s7  =  b[6+idx];
2790:     while (nz--) {
2791:       jdx   = 7*(*vi++);
2792:       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
2793:       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
2794:       x7    = x[6+jdx];
2795:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2796:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2797:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2798:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2799:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2800:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2801:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2802:       v += 49;
2803:      }
2804:     x[idx]   = s1;
2805:     x[1+idx] = s2;
2806:     x[2+idx] = s3;
2807:     x[3+idx] = s4;
2808:     x[4+idx] = s5;
2809:     x[5+idx] = s6;
2810:     x[6+idx] = s7;
2811:   }
2812:   /* backward solve the upper triangular */
2813:   for (i=n-1; i>=0; i--){
2814:     v    = aa + 49*diag[i] + 49;
2815:     vi   = aj + diag[i] + 1;
2816:     nz   = ai[i+1] - diag[i] - 1;
2817:     idt  = 7*i;
2818:     s1 = x[idt];   s2 = x[1+idt];
2819:     s3 = x[2+idt]; s4 = x[3+idt];
2820:     s5 = x[4+idt]; s6 = x[5+idt];
2821:     s7 = x[6+idt];
2822:     while (nz--) {
2823:       idx   = 7*(*vi++);
2824:       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
2825:       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
2826:       x7    = x[6+idx];
2827:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2828:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2829:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2830:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2831:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2832:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2833:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2834:       v += 49;
2835:     }
2836:     v        = aa + 49*diag[i];
2837:     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
2838:                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
2839:     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
2840:                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
2841:     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
2842:                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
2843:     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
2844:                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
2845:     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
2846:                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
2847:     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
2848:                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
2849:     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
2850:                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
2851:   }

2853:   VecRestoreArrayRead(bb,&b);
2854:   VecRestoreArray(xx,&x);
2855:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
2856:   return(0);
2857: }

2861: PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
2862: {
2863:     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2864:     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
2865:     PetscErrorCode    ierr;
2866:     PetscInt          i,k,nz,idx,jdx,idt;
2867:     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
2868:     const MatScalar   *aa=a->a,*v;
2869:     PetscScalar       *x;
2870:     const PetscScalar *b;
2871:     PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;

2874:     VecGetArrayRead(bb,&b);
2875:     VecGetArray(xx,&x);
2876:     /* forward solve the lower triangular */
2877:     idx    = 0;
2878:     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2879:     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
2880:     for (i=1; i<n; i++) {
2881:        v    = aa + bs2*ai[i];
2882:        vi   = aj + ai[i];
2883:        nz   = ai[i+1] - ai[i];
2884:       idx   = bs*i;
2885:        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2886:        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2887:        for(k=0;k<nz;k++) {
2888:           jdx   = bs*vi[k];
2889:           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2890:           x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
2891:           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2892:           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2893:           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2894:           s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2895:           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2896:           s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2897:           s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2898:           v   +=  bs2;
2899:         }

2901:        x[idx]   = s1;
2902:        x[1+idx] = s2;
2903:        x[2+idx] = s3;
2904:        x[3+idx] = s4;
2905:        x[4+idx] = s5;
2906:        x[5+idx] = s6;
2907:        x[6+idx] = s7;
2908:     }
2909: 
2910:    /* backward solve the upper triangular */
2911:   for (i=n-1; i>=0; i--){
2912:     v   = aa + bs2*(adiag[i+1]+1);
2913:      vi  = aj + adiag[i+1]+1;
2914:      nz  = adiag[i] - adiag[i+1]-1;
2915:      idt = bs*i;
2916:      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2917:      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
2918:     for(k=0;k<nz;k++) {
2919:       idx   = bs*vi[k];
2920:        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2921:        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
2922:        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2923:        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2924:        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2925:        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2926:        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2927:        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2928:        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2929:         v   +=  bs2;
2930:     }
2931:     /* x = inv_diagonal*x */
2932:     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
2933:     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
2934:     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
2935:     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
2936:     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
2937:     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
2938:     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
2939:   }

2941:   VecRestoreArrayRead(bb,&b);
2942:   VecRestoreArray(xx,&x);
2943:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
2944:   return(0);
2945: }

2949: PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
2950: {
2951:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2952:   IS                iscol=a->col,isrow=a->row;
2953:   PetscErrorCode    ierr;
2954:   const PetscInt    *r,*c,*rout,*cout;
2955:   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2956:   PetscInt          i,nz,idx,idt,idc;
2957:   const MatScalar   *aa=a->a,*v;
2958:   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2959:   const PetscScalar *b;

2962:   VecGetArrayRead(bb,&b);
2963:   VecGetArray(xx,&x);
2964:   t  = a->solve_work;

2966:   ISGetIndices(isrow,&rout); r = rout;
2967:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2969:   /* forward solve the lower triangular */
2970:   idx    = 6*(*r++);
2971:   t[0] = b[idx];   t[1] = b[1+idx];
2972:   t[2] = b[2+idx]; t[3] = b[3+idx];
2973:   t[4] = b[4+idx]; t[5] = b[5+idx];
2974:   for (i=1; i<n; i++) {
2975:     v     = aa + 36*ai[i];
2976:     vi    = aj + ai[i];
2977:     nz    = diag[i] - ai[i];
2978:     idx   = 6*(*r++);
2979:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2980:     s5  = b[4+idx]; s6 = b[5+idx];
2981:     while (nz--) {
2982:       idx   = 6*(*vi++);
2983:       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
2984:       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
2985:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2986:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2987:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2988:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2989:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2990:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2991:       v += 36;
2992:     }
2993:     idx = 6*i;
2994:     t[idx]   = s1;t[1+idx] = s2;
2995:     t[2+idx] = s3;t[3+idx] = s4;
2996:     t[4+idx] = s5;t[5+idx] = s6;
2997:   }
2998:   /* backward solve the upper triangular */
2999:   for (i=n-1; i>=0; i--){
3000:     v    = aa + 36*diag[i] + 36;
3001:     vi   = aj + diag[i] + 1;
3002:     nz   = ai[i+1] - diag[i] - 1;
3003:     idt  = 6*i;
3004:     s1 = t[idt];  s2 = t[1+idt];
3005:     s3 = t[2+idt];s4 = t[3+idt];
3006:     s5 = t[4+idt];s6 = t[5+idt];
3007:     while (nz--) {
3008:       idx   = 6*(*vi++);
3009:       x1    = t[idx];   x2 = t[1+idx];
3010:       x3    = t[2+idx]; x4 = t[3+idx];
3011:       x5    = t[4+idx]; x6 = t[5+idx];
3012:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3013:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3014:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3015:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3016:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3017:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3018:       v += 36;
3019:     }
3020:     idc = 6*(*c--);
3021:     v   = aa + 36*diag[i];
3022:     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
3023:                                  v[18]*s4+v[24]*s5+v[30]*s6;
3024:     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
3025:                                  v[19]*s4+v[25]*s5+v[31]*s6;
3026:     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
3027:                                  v[20]*s4+v[26]*s5+v[32]*s6;
3028:     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
3029:                                  v[21]*s4+v[27]*s5+v[33]*s6;
3030:     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
3031:                                  v[22]*s4+v[28]*s5+v[34]*s6;
3032:     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
3033:                                  v[23]*s4+v[29]*s5+v[35]*s6;
3034:   }

3036:   ISRestoreIndices(isrow,&rout);
3037:   ISRestoreIndices(iscol,&cout);
3038:   VecRestoreArrayRead(bb,&b);
3039:   VecRestoreArray(xx,&x);
3040:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
3041:   return(0);
3042: }

3046: PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
3047: {
3048:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3049:   IS                iscol=a->col,isrow=a->row;
3050:   PetscErrorCode    ierr;
3051:   const PetscInt    *r,*c,*rout,*cout;
3052:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3053:   PetscInt          i,nz,idx,idt,idc,m;
3054:   const MatScalar   *aa=a->a,*v;
3055:   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
3056:   const PetscScalar *b;

3059:   VecGetArrayRead(bb,&b);
3060:   VecGetArray(xx,&x);
3061:   t  = a->solve_work;

3063:   ISGetIndices(isrow,&rout); r = rout;
3064:   ISGetIndices(iscol,&cout); c = cout;

3066:   /* forward solve the lower triangular */
3067:   idx    = 6*r[0];
3068:   t[0] = b[idx];   t[1] = b[1+idx];
3069:   t[2] = b[2+idx]; t[3] = b[3+idx];
3070:   t[4] = b[4+idx]; t[5] = b[5+idx];
3071:   for (i=1; i<n; i++) {
3072:     v     = aa + 36*ai[i];
3073:     vi    = aj + ai[i];
3074:     nz    = ai[i+1] - ai[i];
3075:     idx   = 6*r[i];
3076:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3077:     s5  = b[4+idx]; s6 = b[5+idx];
3078:     for(m=0;m<nz;m++){
3079:       idx   = 6*vi[m];
3080:       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
3081:       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
3082:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3083:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3084:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3085:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3086:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3087:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3088:       v += 36;
3089:     }
3090:     idx = 6*i;
3091:     t[idx]   = s1;t[1+idx] = s2;
3092:     t[2+idx] = s3;t[3+idx] = s4;
3093:     t[4+idx] = s5;t[5+idx] = s6;
3094:   }
3095:   /* backward solve the upper triangular */
3096:   for (i=n-1; i>=0; i--){
3097:     v    = aa + 36*(adiag[i+1]+1);
3098:     vi   = aj + adiag[i+1]+1;
3099:     nz   = adiag[i] - adiag[i+1] - 1;
3100:     idt  = 6*i;
3101:     s1 = t[idt];  s2 = t[1+idt];
3102:     s3 = t[2+idt];s4 = t[3+idt];
3103:     s5 = t[4+idt];s6 = t[5+idt];
3104:     for(m=0;m<nz;m++){
3105:       idx   = 6*vi[m];
3106:       x1    = t[idx];   x2 = t[1+idx];
3107:       x3    = t[2+idx]; x4 = t[3+idx];
3108:       x5    = t[4+idx]; x6 = t[5+idx];
3109:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3110:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3111:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3112:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3113:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3114:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3115:       v += 36;
3116:     }
3117:     idc = 6*c[i];
3118:     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
3119:                                  v[18]*s4+v[24]*s5+v[30]*s6;
3120:     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
3121:                                  v[19]*s4+v[25]*s5+v[31]*s6;
3122:     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
3123:                                  v[20]*s4+v[26]*s5+v[32]*s6;
3124:     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
3125:                                  v[21]*s4+v[27]*s5+v[33]*s6;
3126:     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
3127:                                  v[22]*s4+v[28]*s5+v[34]*s6;
3128:     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
3129:                                  v[23]*s4+v[29]*s5+v[35]*s6;
3130:   }

3132:   ISRestoreIndices(isrow,&rout);
3133:   ISRestoreIndices(iscol,&cout);
3134:   VecRestoreArrayRead(bb,&b);
3135:   VecRestoreArray(xx,&x);
3136:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
3137:   return(0);
3138: }

3142: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3143: {
3144:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3145:   PetscInt          i,nz,idx,idt,jdx;
3146:   PetscErrorCode    ierr;
3147:   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
3148:   const MatScalar   *aa=a->a,*v;
3149:   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3150:   const PetscScalar *b;

3153:   VecGetArrayRead(bb,&b);
3154:   VecGetArray(xx,&x);
3155:   /* forward solve the lower triangular */
3156:   idx    = 0;
3157:   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
3158:   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
3159:   for (i=1; i<n; i++) {
3160:     v     =  aa + 36*ai[i];
3161:     vi    =  aj + ai[i];
3162:     nz    =  diag[i] - ai[i];
3163:     idx   =  6*i;
3164:     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
3165:     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
3166:     while (nz--) {
3167:       jdx   = 6*(*vi++);
3168:       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
3169:       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
3170:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3171:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3172:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3173:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3174:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3175:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3176:       v += 36;
3177:      }
3178:     x[idx]   = s1;
3179:     x[1+idx] = s2;
3180:     x[2+idx] = s3;
3181:     x[3+idx] = s4;
3182:     x[4+idx] = s5;
3183:     x[5+idx] = s6;
3184:   }
3185:   /* backward solve the upper triangular */
3186:   for (i=n-1; i>=0; i--){
3187:     v    = aa + 36*diag[i] + 36;
3188:     vi   = aj + diag[i] + 1;
3189:     nz   = ai[i+1] - diag[i] - 1;
3190:     idt  = 6*i;
3191:     s1 = x[idt];   s2 = x[1+idt];
3192:     s3 = x[2+idt]; s4 = x[3+idt];
3193:     s5 = x[4+idt]; s6 = x[5+idt];
3194:     while (nz--) {
3195:       idx   = 6*(*vi++);
3196:       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
3197:       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
3198:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3199:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3200:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3201:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3202:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3203:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3204:       v += 36;
3205:     }
3206:     v        = aa + 36*diag[i];
3207:     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3208:     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3209:     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3210:     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3211:     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3212:     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3213:   }

3215:   VecRestoreArrayRead(bb,&b);
3216:   VecRestoreArray(xx,&x);
3217:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
3218:   return(0);
3219: }

3223: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
3224: {
3225:     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3226:     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3227:     PetscErrorCode    ierr;
3228:     PetscInt          i,k,nz,idx,jdx,idt;
3229:     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
3230:     const MatScalar   *aa=a->a,*v;
3231:     PetscScalar       *x;
3232:     const PetscScalar *b;
3233:     PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;

3236:     VecGetArrayRead(bb,&b);
3237:     VecGetArray(xx,&x);
3238:     /* forward solve the lower triangular */
3239:     idx    = 0;
3240:     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3241:     x[4] = b[4+idx];x[5] = b[5+idx];
3242:     for (i=1; i<n; i++) {
3243:        v    = aa + bs2*ai[i];
3244:        vi   = aj + ai[i];
3245:        nz   = ai[i+1] - ai[i];
3246:       idx   = bs*i;
3247:        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3248:        s5   = b[4+idx];s6 = b[5+idx];
3249:        for(k=0;k<nz;k++){
3250:           jdx   = bs*vi[k];
3251:           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3252:           x5    = x[4+jdx]; x6 = x[5+jdx];
3253:           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
3254:           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
3255:           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
3256:           s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
3257:           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
3258:           s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
3259:           v   +=  bs2;
3260:         }

3262:        x[idx]   = s1;
3263:        x[1+idx] = s2;
3264:        x[2+idx] = s3;
3265:        x[3+idx] = s4;
3266:        x[4+idx] = s5;
3267:        x[5+idx] = s6;
3268:     }
3269: 
3270:    /* backward solve the upper triangular */
3271:   for (i=n-1; i>=0; i--){
3272:     v   = aa + bs2*(adiag[i+1]+1);
3273:      vi  = aj + adiag[i+1]+1;
3274:      nz  = adiag[i] - adiag[i+1]-1;
3275:      idt = bs*i;
3276:      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3277:      s5 = x[4+idt];s6 = x[5+idt];
3278:      for(k=0;k<nz;k++){
3279:       idx   = bs*vi[k];
3280:        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3281:        x5    = x[4+idx];x6 = x[5+idx];
3282:        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
3283:        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
3284:        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
3285:        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
3286:        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
3287:        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
3288:         v   +=  bs2;
3289:     }
3290:     /* x = inv_diagonal*x */
3291:    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3292:    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3293:    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3294:    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3295:    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3296:    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3297:   }

3299:   VecRestoreArrayRead(bb,&b);
3300:   VecRestoreArray(xx,&x);
3301:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
3302:   return(0);
3303: }

3307: PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
3308: {
3309:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3310:   IS                iscol=a->col,isrow=a->row;
3311:   PetscErrorCode    ierr;
3312:   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
3313:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3314:   PetscInt          i,nz,idx,idt,idc;
3315:   const MatScalar   *aa=a->a,*v;
3316:   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3317:   const PetscScalar *b;

3320:   VecGetArrayRead(bb,&b);
3321:   VecGetArray(xx,&x);
3322:   t  = a->solve_work;

3324:   ISGetIndices(isrow,&rout); r = rout;
3325:   ISGetIndices(iscol,&cout); c = cout + (n-1);

3327:   /* forward solve the lower triangular */
3328:   idx    = 5*(*r++);
3329:   t[0] = b[idx];   t[1] = b[1+idx];
3330:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3331:   for (i=1; i<n; i++) {
3332:     v     = aa + 25*ai[i];
3333:     vi    = aj + ai[i];
3334:     nz    = diag[i] - ai[i];
3335:     idx   = 5*(*r++);
3336:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3337:     s5  = b[4+idx];
3338:     while (nz--) {
3339:       idx   = 5*(*vi++);
3340:       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3341:       x4    = t[3+idx];x5 = t[4+idx];
3342:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3343:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3344:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3345:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3346:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3347:       v += 25;
3348:     }
3349:     idx = 5*i;
3350:     t[idx]   = s1;t[1+idx] = s2;
3351:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3352:   }
3353:   /* backward solve the upper triangular */
3354:   for (i=n-1; i>=0; i--){
3355:     v    = aa + 25*diag[i] + 25;
3356:     vi   = aj + diag[i] + 1;
3357:     nz   = ai[i+1] - diag[i] - 1;
3358:     idt  = 5*i;
3359:     s1 = t[idt];  s2 = t[1+idt];
3360:     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3361:     while (nz--) {
3362:       idx   = 5*(*vi++);
3363:       x1    = t[idx];   x2 = t[1+idx];
3364:       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3365:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3366:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3367:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3368:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3369:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3370:       v += 25;
3371:     }
3372:     idc = 5*(*c--);
3373:     v   = aa + 25*diag[i];
3374:     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3375:                                  v[15]*s4+v[20]*s5;
3376:     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3377:                                  v[16]*s4+v[21]*s5;
3378:     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3379:                                  v[17]*s4+v[22]*s5;
3380:     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3381:                                  v[18]*s4+v[23]*s5;
3382:     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3383:                                  v[19]*s4+v[24]*s5;
3384:   }

3386:   ISRestoreIndices(isrow,&rout);
3387:   ISRestoreIndices(iscol,&cout);
3388:   VecRestoreArrayRead(bb,&b);
3389:   VecRestoreArray(xx,&x);
3390:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
3391:   return(0);
3392: }

3396: PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
3397: {
3398:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3399:   IS                iscol=a->col,isrow=a->row;
3400:   PetscErrorCode    ierr;
3401:   const PetscInt    *r,*c,*rout,*cout;
3402:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3403:   PetscInt          i,nz,idx,idt,idc,m;
3404:   const MatScalar   *aa=a->a,*v;
3405:   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3406:   const PetscScalar *b;

3409:   VecGetArrayRead(bb,&b);
3410:   VecGetArray(xx,&x);
3411:   t  = a->solve_work;

3413:   ISGetIndices(isrow,&rout); r = rout;
3414:   ISGetIndices(iscol,&cout); c = cout;

3416:   /* forward solve the lower triangular */
3417:   idx    = 5*r[0];
3418:   t[0] = b[idx];   t[1] = b[1+idx];
3419:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3420:   for (i=1; i<n; i++) {
3421:     v     = aa + 25*ai[i];
3422:     vi    = aj + ai[i];
3423:     nz    = ai[i+1] - ai[i];
3424:     idx   = 5*r[i];
3425:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3426:     s5  = b[4+idx];
3427:     for(m=0;m<nz;m++){
3428:       idx   = 5*vi[m];
3429:       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3430:       x4    = t[3+idx];x5 = t[4+idx];
3431:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3432:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3433:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3434:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3435:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3436:       v += 25;
3437:     }
3438:     idx = 5*i;
3439:     t[idx]   = s1;t[1+idx] = s2;
3440:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3441:   }
3442:   /* backward solve the upper triangular */
3443:   for (i=n-1; i>=0; i--){
3444:     v    = aa + 25*(adiag[i+1]+1);
3445:     vi   = aj + adiag[i+1]+1;
3446:     nz   = adiag[i] - adiag[i+1] - 1;
3447:     idt  = 5*i;
3448:     s1 = t[idt];  s2 = t[1+idt];
3449:     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3450:     for(m=0;m<nz;m++){
3451:       idx   = 5*vi[m];
3452:       x1    = t[idx];   x2 = t[1+idx];
3453:       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3454:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3455:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3456:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3457:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3458:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3459:       v += 25;
3460:     }
3461:     idc = 5*c[i];
3462:     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3463:                                  v[15]*s4+v[20]*s5;
3464:     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3465:                                  v[16]*s4+v[21]*s5;
3466:     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3467:                                  v[17]*s4+v[22]*s5;
3468:     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3469:                                  v[18]*s4+v[23]*s5;
3470:     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3471:                                  v[19]*s4+v[24]*s5;
3472:   }

3474:   ISRestoreIndices(isrow,&rout);
3475:   ISRestoreIndices(iscol,&cout);
3476:   VecRestoreArrayRead(bb,&b);
3477:   VecRestoreArray(xx,&x);
3478:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
3479:   return(0);
3480: }

3484: PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3485: {
3486:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3487:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3488:   PetscInt          i,nz,idx,idt,jdx;
3489:   PetscErrorCode    ierr;
3490:   const MatScalar   *aa=a->a,*v;
3491:   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3492:   const PetscScalar *b;

3495:   VecGetArrayRead(bb,&b);
3496:   VecGetArray(xx,&x);
3497:   /* forward solve the lower triangular */
3498:   idx    = 0;
3499:   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3500:   for (i=1; i<n; i++) {
3501:     v     =  aa + 25*ai[i];
3502:     vi    =  aj + ai[i];
3503:     nz    =  diag[i] - ai[i];
3504:     idx   =  5*i;
3505:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3506:     while (nz--) {
3507:       jdx   = 5*(*vi++);
3508:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3509:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3510:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3511:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3512:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3513:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3514:       v    += 25;
3515:     }
3516:     x[idx]   = s1;
3517:     x[1+idx] = s2;
3518:     x[2+idx] = s3;
3519:     x[3+idx] = s4;
3520:     x[4+idx] = s5;
3521:   }
3522:   /* backward solve the upper triangular */
3523:   for (i=n-1; i>=0; i--){
3524:     v    = aa + 25*diag[i] + 25;
3525:     vi   = aj + diag[i] + 1;
3526:     nz   = ai[i+1] - diag[i] - 1;
3527:     idt  = 5*i;
3528:     s1 = x[idt];  s2 = x[1+idt];
3529:     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3530:     while (nz--) {
3531:       idx   = 5*(*vi++);
3532:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3533:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3534:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3535:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3536:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3537:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3538:       v    += 25;
3539:     }
3540:     v        = aa + 25*diag[i];
3541:     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3542:     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3543:     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3544:     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3545:     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
3546:   }

3548:   VecRestoreArrayRead(bb,&b);
3549:   VecRestoreArray(xx,&x);
3550:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
3551:   return(0);
3552: }

3556: PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
3557: {
3558:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3559:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3560:   PetscInt          i,k,nz,idx,idt,jdx;
3561:   PetscErrorCode    ierr;
3562:   const MatScalar   *aa=a->a,*v;
3563:   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3564:   const PetscScalar *b;

3567:   VecGetArrayRead(bb,&b);
3568:   VecGetArray(xx,&x);
3569:   /* forward solve the lower triangular */
3570:   idx    = 0;
3571:   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3572:   for (i=1; i<n; i++) {
3573:     v   = aa + 25*ai[i];
3574:     vi  = aj + ai[i];
3575:     nz  = ai[i+1] - ai[i];
3576:     idx = 5*i;
3577:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3578:     for(k=0;k<nz;k++) {
3579:       jdx   = 5*vi[k];
3580:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3581:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3582:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3583:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3584:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3585:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3586:       v    += 25;
3587:     }
3588:     x[idx]   = s1;
3589:     x[1+idx] = s2;
3590:     x[2+idx] = s3;
3591:     x[3+idx] = s4;
3592:     x[4+idx] = s5;
3593:   }

3595:   /* backward solve the upper triangular */
3596:   for (i=n-1; i>=0; i--){
3597:     v   = aa + 25*(adiag[i+1]+1);
3598:     vi  = aj + adiag[i+1]+1;
3599:     nz  = adiag[i] - adiag[i+1]-1;
3600:     idt = 5*i;
3601:     s1 = x[idt];  s2 = x[1+idt];
3602:     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3603:     for(k=0;k<nz;k++){
3604:       idx   = 5*vi[k];
3605:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3606:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3607:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3608:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3609:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3610:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3611:       v    += 25;
3612:     }
3613:     /* x = inv_diagonal*x */
3614:     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3615:     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3616:     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3617:     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3618:     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
3619:   }

3621:   VecRestoreArrayRead(bb,&b);
3622:   VecRestoreArray(xx,&x);
3623:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
3624:   return(0);
3625: }

3629: PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
3630: {
3631:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3632:   IS                iscol=a->col,isrow=a->row;
3633:   PetscErrorCode    ierr;
3634:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3635:   PetscInt          i,nz,idx,idt,idc;
3636:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3637:   const MatScalar   *aa=a->a,*v;
3638:   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3639:   const PetscScalar *b;

3642:   VecGetArrayRead(bb,&b);
3643:   VecGetArray(xx,&x);
3644:   t  = a->solve_work;

3646:   ISGetIndices(isrow,&rout); r = rout;
3647:   ISGetIndices(iscol,&cout); c = cout + (n-1);

3649:   /* forward solve the lower triangular */
3650:   idx    = 4*(*r++);
3651:   t[0] = b[idx];   t[1] = b[1+idx];
3652:   t[2] = b[2+idx]; t[3] = b[3+idx];
3653:   for (i=1; i<n; i++) {
3654:     v     = aa + 16*ai[i];
3655:     vi    = aj + ai[i];
3656:     nz    = diag[i] - ai[i];
3657:     idx   = 4*(*r++);
3658:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3659:     while (nz--) {
3660:       idx   = 4*(*vi++);
3661:       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3662:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3663:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3664:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3665:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3666:       v    += 16;
3667:     }
3668:     idx        = 4*i;
3669:     t[idx]   = s1;t[1+idx] = s2;
3670:     t[2+idx] = s3;t[3+idx] = s4;
3671:   }
3672:   /* backward solve the upper triangular */
3673:   for (i=n-1; i>=0; i--){
3674:     v    = aa + 16*diag[i] + 16;
3675:     vi   = aj + diag[i] + 1;
3676:     nz   = ai[i+1] - diag[i] - 1;
3677:     idt  = 4*i;
3678:     s1 = t[idt];  s2 = t[1+idt];
3679:     s3 = t[2+idt];s4 = t[3+idt];
3680:     while (nz--) {
3681:       idx   = 4*(*vi++);
3682:       x1    = t[idx];   x2 = t[1+idx];
3683:       x3    = t[2+idx]; x4 = t[3+idx];
3684:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3685:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3686:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3687:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3688:       v += 16;
3689:     }
3690:     idc      = 4*(*c--);
3691:     v        = aa + 16*diag[i];
3692:     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3693:     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3694:     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3695:     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3696:   }

3698:   ISRestoreIndices(isrow,&rout);
3699:   ISRestoreIndices(iscol,&cout);
3700:   VecRestoreArrayRead(bb,&b);
3701:   VecRestoreArray(xx,&x);
3702:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
3703:   return(0);
3704: }

3708: PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
3709: {
3710:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3711:   IS                iscol=a->col,isrow=a->row;
3712:   PetscErrorCode    ierr;
3713:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3714:   PetscInt          i,nz,idx,idt,idc,m;
3715:   const PetscInt    *r,*c,*rout,*cout;
3716:   const MatScalar   *aa=a->a,*v;
3717:   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3718:   const PetscScalar *b;

3721:   VecGetArrayRead(bb,&b);
3722:   VecGetArray(xx,&x);
3723:   t  = a->solve_work;

3725:   ISGetIndices(isrow,&rout); r = rout;
3726:   ISGetIndices(iscol,&cout); c = cout;

3728:   /* forward solve the lower triangular */
3729:   idx    = 4*r[0];
3730:   t[0] = b[idx];   t[1] = b[1+idx];
3731:   t[2] = b[2+idx]; t[3] = b[3+idx];
3732:   for (i=1; i<n; i++) {
3733:     v     = aa + 16*ai[i];
3734:     vi    = aj + ai[i];
3735:     nz    = ai[i+1] - ai[i];
3736:     idx   = 4*r[i];
3737:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3738:     for(m=0;m<nz;m++){
3739:       idx   = 4*vi[m];
3740:       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3741:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3742:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3743:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3744:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3745:       v    += 16;
3746:     }
3747:     idx        = 4*i;
3748:     t[idx]   = s1;t[1+idx] = s2;
3749:     t[2+idx] = s3;t[3+idx] = s4;
3750:   }
3751:   /* backward solve the upper triangular */
3752:   for (i=n-1; i>=0; i--){
3753:     v    = aa + 16*(adiag[i+1]+1);
3754:     vi   = aj + adiag[i+1]+1;
3755:     nz   = adiag[i] - adiag[i+1] - 1;
3756:     idt  = 4*i;
3757:     s1 = t[idt];  s2 = t[1+idt];
3758:     s3 = t[2+idt];s4 = t[3+idt];
3759:     for(m=0;m<nz;m++){
3760:       idx   = 4*vi[m];
3761:       x1    = t[idx];   x2 = t[1+idx];
3762:       x3    = t[2+idx]; x4 = t[3+idx];
3763:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3764:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3765:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3766:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3767:       v += 16;
3768:     }
3769:     idc      = 4*c[i];
3770:     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3771:     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3772:     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3773:     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3774:   }

3776:   ISRestoreIndices(isrow,&rout);
3777:   ISRestoreIndices(iscol,&cout);
3778:   VecRestoreArrayRead(bb,&b);
3779:   VecRestoreArray(xx,&x);
3780:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
3781:   return(0);
3782: }

3786: PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
3787: {
3788:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3789:   IS                iscol=a->col,isrow=a->row;
3790:   PetscErrorCode    ierr;
3791:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3792:   PetscInt          i,nz,idx,idt,idc;
3793:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3794:   const MatScalar   *aa=a->a,*v;
3795:   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
3796:   PetscScalar       *x;
3797:   const PetscScalar *b;

3800:   VecGetArrayRead(bb,&b);
3801:   VecGetArray(xx,&x);
3802:   t  = (MatScalar *)a->solve_work;

3804:   ISGetIndices(isrow,&rout); r = rout;
3805:   ISGetIndices(iscol,&cout); c = cout + (n-1);

3807:   /* forward solve the lower triangular */
3808:   idx    = 4*(*r++);
3809:   t[0] = (MatScalar)b[idx];
3810:   t[1] = (MatScalar)b[1+idx];
3811:   t[2] = (MatScalar)b[2+idx];
3812:   t[3] = (MatScalar)b[3+idx];
3813:   for (i=1; i<n; i++) {
3814:     v     = aa + 16*ai[i];
3815:     vi    = aj + ai[i];
3816:     nz    = diag[i] - ai[i];
3817:     idx   = 4*(*r++);
3818:     s1 = (MatScalar)b[idx];
3819:     s2 = (MatScalar)b[1+idx];
3820:     s3 = (MatScalar)b[2+idx];
3821:     s4 = (MatScalar)b[3+idx];
3822:     while (nz--) {
3823:       idx   = 4*(*vi++);
3824:       x1  = t[idx];
3825:       x2  = t[1+idx];
3826:       x3  = t[2+idx];
3827:       x4  = t[3+idx];
3828:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3829:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3830:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3831:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3832:       v    += 16;
3833:     }
3834:     idx        = 4*i;
3835:     t[idx]   = s1;
3836:     t[1+idx] = s2;
3837:     t[2+idx] = s3;
3838:     t[3+idx] = s4;
3839:   }
3840:   /* backward solve the upper triangular */
3841:   for (i=n-1; i>=0; i--){
3842:     v    = aa + 16*diag[i] + 16;
3843:     vi   = aj + diag[i] + 1;
3844:     nz   = ai[i+1] - diag[i] - 1;
3845:     idt  = 4*i;
3846:     s1 = t[idt];
3847:     s2 = t[1+idt];
3848:     s3 = t[2+idt];
3849:     s4 = t[3+idt];
3850:     while (nz--) {
3851:       idx   = 4*(*vi++);
3852:       x1  = t[idx];
3853:       x2  = t[1+idx];
3854:       x3  = t[2+idx];
3855:       x4  = t[3+idx];
3856:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3857:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3858:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3859:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3860:       v += 16;
3861:     }
3862:     idc      = 4*(*c--);
3863:     v        = aa + 16*diag[i];
3864:     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3865:     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3866:     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3867:     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3868:     x[idc]   = (PetscScalar)t[idt];
3869:     x[1+idc] = (PetscScalar)t[1+idt];
3870:     x[2+idc] = (PetscScalar)t[2+idt];
3871:     x[3+idc] = (PetscScalar)t[3+idt];
3872:  }

3874:   ISRestoreIndices(isrow,&rout);
3875:   ISRestoreIndices(iscol,&cout);
3876:   VecRestoreArrayRead(bb,&b);
3877:   VecRestoreArray(xx,&x);
3878:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
3879:   return(0);
3880: }

3882: #if defined (PETSC_HAVE_SSE)

3884: #include PETSC_HAVE_SSE

3888: PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
3889: {
3890:   /* 
3891:      Note: This code uses demotion of double
3892:      to float when performing the mixed-mode computation.
3893:      This may not be numerically reasonable for all applications.
3894:   */
3895:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3896:   IS             iscol=a->col,isrow=a->row;
3898:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
3899:   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3900:   MatScalar      *aa=a->a,*v;
3901:   PetscScalar    *x,*b,*t;

3903:   /* Make space in temp stack for 16 Byte Aligned arrays */
3904:   float           ssealignedspace[11],*tmps,*tmpx;
3905:   unsigned long   offset;
3906: 
3908:   SSE_SCOPE_BEGIN;

3910:     offset = (unsigned long)ssealignedspace % 16;
3911:     if (offset) offset = (16 - offset)/4;
3912:     tmps = &ssealignedspace[offset];
3913:     tmpx = &ssealignedspace[offset+4];
3914:     PREFETCH_NTA(aa+16*ai[1]);

3916:     VecGetArray(bb,&b);
3917:     VecGetArray(xx,&x);
3918:     t  = a->solve_work;

3920:     ISGetIndices(isrow,&rout); r = rout;
3921:     ISGetIndices(iscol,&cout); c = cout + (n-1);

3923:     /* forward solve the lower triangular */
3924:     idx  = 4*(*r++);
3925:     t[0] = b[idx];   t[1] = b[1+idx];
3926:     t[2] = b[2+idx]; t[3] = b[3+idx];
3927:     v    =  aa + 16*ai[1];

3929:     for (i=1; i<n;) {
3930:       PREFETCH_NTA(&v[8]);
3931:       vi   =  aj      + ai[i];
3932:       nz   =  diag[i] - ai[i];
3933:       idx  =  4*(*r++);

3935:       /* Demote sum from double to float */
3936:       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
3937:       LOAD_PS(tmps,XMM7);

3939:       while (nz--) {
3940:         PREFETCH_NTA(&v[16]);
3941:         idx = 4*(*vi++);
3942: 
3943:         /* Demote solution (so far) from double to float */
3944:         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);

3946:         /* 4x4 Matrix-Vector product with negative accumulation: */
3947:         SSE_INLINE_BEGIN_2(tmpx,v)
3948:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

3950:           /* First Column */
3951:           SSE_COPY_PS(XMM0,XMM6)
3952:           SSE_SHUFFLE(XMM0,XMM0,0x00)
3953:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3954:           SSE_SUB_PS(XMM7,XMM0)
3955: 
3956:           /* Second Column */
3957:           SSE_COPY_PS(XMM1,XMM6)
3958:           SSE_SHUFFLE(XMM1,XMM1,0x55)
3959:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3960:           SSE_SUB_PS(XMM7,XMM1)
3961: 
3962:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3963: 
3964:           /* Third Column */
3965:           SSE_COPY_PS(XMM2,XMM6)
3966:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3967:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3968:           SSE_SUB_PS(XMM7,XMM2)

3970:           /* Fourth Column */
3971:           SSE_COPY_PS(XMM3,XMM6)
3972:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3973:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3974:           SSE_SUB_PS(XMM7,XMM3)
3975:         SSE_INLINE_END_2
3976: 
3977:         v  += 16;
3978:       }
3979:       idx = 4*i;
3980:       v   = aa + 16*ai[++i];
3981:       PREFETCH_NTA(v);
3982:       STORE_PS(tmps,XMM7);

3984:       /* Promote result from float to double */
3985:       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
3986:     }
3987:     /* backward solve the upper triangular */
3988:     idt  = 4*(n-1);
3989:     ai16 = 16*diag[n-1];
3990:     v    = aa + ai16 + 16;
3991:     for (i=n-1; i>=0;){
3992:       PREFETCH_NTA(&v[8]);
3993:       vi = aj + diag[i] + 1;
3994:       nz = ai[i+1] - diag[i] - 1;
3995: 
3996:       /* Demote accumulator from double to float */
3997:       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
3998:       LOAD_PS(tmps,XMM7);

4000:       while (nz--) {
4001:         PREFETCH_NTA(&v[16]);
4002:         idx = 4*(*vi++);

4004:         /* Demote solution (so far) from double to float */
4005:         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);

4007:         /* 4x4 Matrix-Vector Product with negative accumulation: */
4008:         SSE_INLINE_BEGIN_2(tmpx,v)
4009:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

4011:           /* First Column */
4012:           SSE_COPY_PS(XMM0,XMM6)
4013:           SSE_SHUFFLE(XMM0,XMM0,0x00)
4014:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4015:           SSE_SUB_PS(XMM7,XMM0)

4017:           /* Second Column */
4018:           SSE_COPY_PS(XMM1,XMM6)
4019:           SSE_SHUFFLE(XMM1,XMM1,0x55)
4020:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4021:           SSE_SUB_PS(XMM7,XMM1)

4023:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4024: 
4025:           /* Third Column */
4026:           SSE_COPY_PS(XMM2,XMM6)
4027:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4028:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4029:           SSE_SUB_PS(XMM7,XMM2)

4031:           /* Fourth Column */
4032:           SSE_COPY_PS(XMM3,XMM6)
4033:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4034:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4035:           SSE_SUB_PS(XMM7,XMM3)
4036:         SSE_INLINE_END_2
4037:         v  += 16;
4038:       }
4039:       v    = aa + ai16;
4040:       ai16 = 16*diag[--i];
4041:       PREFETCH_NTA(aa+ai16+16);
4042:       /* 
4043:          Scale the result by the diagonal 4x4 block, 
4044:          which was inverted as part of the factorization
4045:       */
4046:       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
4047:         /* First Column */
4048:         SSE_COPY_PS(XMM0,XMM7)
4049:         SSE_SHUFFLE(XMM0,XMM0,0x00)
4050:         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

4052:         /* Second Column */
4053:         SSE_COPY_PS(XMM1,XMM7)
4054:         SSE_SHUFFLE(XMM1,XMM1,0x55)
4055:         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4056:         SSE_ADD_PS(XMM0,XMM1)

4058:         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4059: 
4060:         /* Third Column */
4061:         SSE_COPY_PS(XMM2,XMM7)
4062:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4063:         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4064:         SSE_ADD_PS(XMM0,XMM2)

4066:         /* Fourth Column */
4067:         SSE_COPY_PS(XMM3,XMM7)
4068:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4069:         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4070:         SSE_ADD_PS(XMM0,XMM3)
4071: 
4072:         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4073:       SSE_INLINE_END_3

4075:       /* Promote solution from float to double */
4076:       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);

4078:       /* Apply reordering to t and stream into x.    */
4079:       /* This way, x doesn't pollute the cache.      */
4080:       /* Be careful with size: 2 doubles = 4 floats! */
4081:       idc  = 4*(*c--);
4082:       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
4083:         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
4084:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
4085:         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
4086:         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
4087:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
4088:         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
4089:       SSE_INLINE_END_2
4090:       v    = aa + ai16 + 16;
4091:       idt -= 4;
4092:     }

4094:     ISRestoreIndices(isrow,&rout);
4095:     ISRestoreIndices(iscol,&cout);
4096:     VecRestoreArray(bb,&b);
4097:     VecRestoreArray(xx,&x);
4098:     PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
4099:   SSE_SCOPE_END;
4100:   return(0);
4101: }

4103: #endif


4106: /*
4107:       Special case where the matrix was ILU(0) factored in the natural
4108:    ordering. This eliminates the need for the column and row permutation.
4109: */
4112: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4113: {
4114:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4115:   PetscInt          n=a->mbs;
4116:   const PetscInt    *ai=a->i,*aj=a->j;
4117:   PetscErrorCode    ierr;
4118:   const PetscInt    *diag = a->diag;
4119:   const MatScalar   *aa=a->a;
4120:   PetscScalar       *x;
4121:   const PetscScalar *b;

4124:   VecGetArrayRead(bb,&b);
4125:   VecGetArray(xx,&x);

4127: #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
4128:   {
4129:     static PetscScalar w[2000]; /* very BAD need to fix */
4130:     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
4131:   }
4132: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
4133:   {
4134:     static PetscScalar w[2000]; /* very BAD need to fix */
4135:     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
4136:   }
4137: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
4138:   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
4139: #else
4140:   {
4141:     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
4142:     const MatScalar *v;
4143:     PetscInt        jdx,idt,idx,nz,i,ai16;
4144:     const PetscInt  *vi;

4146:   /* forward solve the lower triangular */
4147:   idx    = 0;
4148:   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
4149:   for (i=1; i<n; i++) {
4150:     v     =  aa      + 16*ai[i];
4151:     vi    =  aj      + ai[i];
4152:     nz    =  diag[i] - ai[i];
4153:     idx   +=  4;
4154:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4155:     while (nz--) {
4156:       jdx   = 4*(*vi++);
4157:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
4158:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4159:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4160:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4161:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4162:       v    += 16;
4163:     }
4164:     x[idx]   = s1;
4165:     x[1+idx] = s2;
4166:     x[2+idx] = s3;
4167:     x[3+idx] = s4;
4168:   }
4169:   /* backward solve the upper triangular */
4170:   idt = 4*(n-1);
4171:   for (i=n-1; i>=0; i--){
4172:     ai16 = 16*diag[i];
4173:     v    = aa + ai16 + 16;
4174:     vi   = aj + diag[i] + 1;
4175:     nz   = ai[i+1] - diag[i] - 1;
4176:     s1 = x[idt];  s2 = x[1+idt];
4177:     s3 = x[2+idt];s4 = x[3+idt];
4178:     while (nz--) {
4179:       idx   = 4*(*vi++);
4180:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
4181:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
4182:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
4183:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
4184:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
4185:       v    += 16;
4186:     }
4187:     v        = aa + ai16;
4188:     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
4189:     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
4190:     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4191:     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4192:     idt -= 4;
4193:   }
4194:   }
4195: #endif

4197:   VecRestoreArrayRead(bb,&b);
4198:   VecRestoreArray(xx,&x);
4199:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
4200:   return(0);
4201: }

4205: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
4206: {
4207:     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4208:     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4209:     PetscInt          i,k,nz,idx,jdx,idt;
4210:     PetscErrorCode    ierr;
4211:     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
4212:     const MatScalar   *aa=a->a,*v;
4213:     PetscScalar       *x;
4214:     const PetscScalar *b;
4215:     PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4;

4218:     VecGetArrayRead(bb,&b);
4219:     VecGetArray(xx,&x);
4220:     /* forward solve the lower triangular */
4221:     idx    = 0;
4222:     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
4223:     for (i=1; i<n; i++) {
4224:        v    = aa + bs2*ai[i];
4225:        vi   = aj + ai[i];
4226:        nz   = ai[i+1] - ai[i];
4227:       idx   = bs*i;
4228:        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4229:       for(k=0;k<nz;k++) {
4230:           jdx   = bs*vi[k];
4231:           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
4232:           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4233:           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4234:           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4235:           s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4236: 
4237:           v   +=  bs2;
4238:         }

4240:        x[idx]   = s1;
4241:        x[1+idx] = s2;
4242:        x[2+idx] = s3;
4243:        x[3+idx] = s4;
4244:     }
4245: 
4246:    /* backward solve the upper triangular */
4247:   for (i=n-1; i>=0; i--){
4248:     v   = aa + bs2*(adiag[i+1]+1);
4249:      vi  = aj + adiag[i+1]+1;
4250:      nz  = adiag[i] - adiag[i+1]-1;
4251:      idt = bs*i;
4252:      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
4253: 
4254:     for(k=0;k<nz;k++){
4255:       idx   = bs*vi[k];
4256:        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
4257:        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4258:        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4259:        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4260:        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;

4262:         v   +=  bs2;
4263:     }
4264:     /* x = inv_diagonal*x */
4265:    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
4266:    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
4267:    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4268:    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;

4270:   }

4272:   VecRestoreArrayRead(bb,&b);
4273:   VecRestoreArray(xx,&x);
4274:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
4275:   return(0);
4276: }

4280: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
4281: {
4282:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4283:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;
4284:   PetscErrorCode    ierr;
4285:   const MatScalar   *aa=a->a;
4286:   const PetscScalar *b;
4287:   PetscScalar       *x;

4290:   VecGetArrayRead(bb,&b);
4291:   VecGetArray(xx,&x);

4293:   {
4294:     MatScalar        s1,s2,s3,s4,x1,x2,x3,x4;
4295:     const MatScalar  *v;
4296:     MatScalar        *t=(MatScalar *)x;
4297:     PetscInt         jdx,idt,idx,nz,i,ai16;
4298:     const PetscInt   *vi;

4300:     /* forward solve the lower triangular */
4301:     idx  = 0;
4302:     t[0] = (MatScalar)b[0];
4303:     t[1] = (MatScalar)b[1];
4304:     t[2] = (MatScalar)b[2];
4305:     t[3] = (MatScalar)b[3];
4306:     for (i=1; i<n; i++) {
4307:       v     =  aa      + 16*ai[i];
4308:       vi    =  aj      + ai[i];
4309:       nz    =  diag[i] - ai[i];
4310:       idx   +=  4;
4311:       s1 = (MatScalar)b[idx];
4312:       s2 = (MatScalar)b[1+idx];
4313:       s3 = (MatScalar)b[2+idx];
4314:       s4 = (MatScalar)b[3+idx];
4315:       while (nz--) {
4316:         jdx = 4*(*vi++);
4317:         x1  = t[jdx];
4318:         x2  = t[1+jdx];
4319:         x3  = t[2+jdx];
4320:         x4  = t[3+jdx];
4321:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4322:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4323:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4324:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4325:         v    += 16;
4326:       }
4327:       t[idx]   = s1;
4328:       t[1+idx] = s2;
4329:       t[2+idx] = s3;
4330:       t[3+idx] = s4;
4331:     }
4332:     /* backward solve the upper triangular */
4333:     idt = 4*(n-1);
4334:     for (i=n-1; i>=0; i--){
4335:       ai16 = 16*diag[i];
4336:       v    = aa + ai16 + 16;
4337:       vi   = aj + diag[i] + 1;
4338:       nz   = ai[i+1] - diag[i] - 1;
4339:       s1   = t[idt];
4340:       s2   = t[1+idt];
4341:       s3   = t[2+idt];
4342:       s4   = t[3+idt];
4343:       while (nz--) {
4344:         idx = 4*(*vi++);
4345:         x1  = (MatScalar)x[idx];
4346:         x2  = (MatScalar)x[1+idx];
4347:         x3  = (MatScalar)x[2+idx];
4348:         x4  = (MatScalar)x[3+idx];
4349:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4350:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4351:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4352:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4353:         v    += 16;
4354:       }
4355:       v        = aa + ai16;
4356:       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
4357:       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
4358:       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
4359:       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
4360:       idt -= 4;
4361:     }
4362:   }

4364:   VecRestoreArrayRead(bb,&b);
4365:   VecRestoreArray(xx,&x);
4366:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
4367:   return(0);
4368: }

4370: #if defined (PETSC_HAVE_SSE)

4372: #include PETSC_HAVE_SSE
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj - 4x4 block solve with
   the factor stored in A, no row or column permutation, written with the
   SSE macro layer in single precision.

   A  - factored matrix; a->i, a->diag and a->a are read directly and a->j is
        read through an unsigned short pointer
   bb - right-hand side
   xx - solution

   Both sweeps work on t, a MatScalar view of the array of xx; the last loop
   widens the result to PetscScalar in the same storage.

   NOTE(review): presumably a->j must already hold 16-bit column indices for
   the unsigned short view to be meaningful -- confirm against the code that
   installs this routine.
   NOTE(review): the index arrays are taken as plain int pointers, which
   looks like it assumes PetscInt is int -- confirm.
   NOTE(review): the values returned by VecGetArray(), VecRestoreArray() and
   PetscLogFlops() are discarded here, and bb is obtained with VecGetArray()
   although this routine only appears to read it.
*/
4375: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
4376: {
4377:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4378:   unsigned short *aj=(unsigned short *)a->j;
4380:   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4381:   MatScalar      *aa=a->a;
4382:   PetscScalar    *x,*b;

4385:   SSE_SCOPE_BEGIN;
4386:   /* 
4387:      Note: This code currently uses demotion of double
4388:      to float when performing the mixed-mode computation.
4389:      This may not be numerically reasonable for all applications.
4390:   */
4391:   PREFETCH_NTA(aa+16*ai[1]);

4393:   VecGetArray(bb,&b);
4394:   VecGetArray(xx,&x);
4395:   {
4396:     /* x will first be computed in single precision then promoted inplace to double */
4397:     MatScalar      *v,*t=(MatScalar *)x;
4398:     int            nz,i,idt,ai16;
4399:     unsigned int   jdx,idx;
4400:     unsigned short *vi;
4401:     /* Forward solve the lower triangular factor. */

4403:     /* First block is the identity. */
4404:     idx  = 0;
4405:     CONVERT_DOUBLE4_FLOAT4(t,b);
4406:     v    =  aa + 16*((unsigned int)ai[1]);

    /* The loop header has no increment: i is advanced by the ++i at the
       bottom of the body, where v is moved to the start of the next row. */
4408:     for (i=1; i<n;) {
4409:       PREFETCH_NTA(&v[8]);
4410:       vi   =  aj      + ai[i];
4411:       nz   =  diag[i] - ai[i];
4412:       idx +=  4;

4414:       /* Demote RHS from double to float. */
4415:       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4416:       LOAD_PS(&t[idx],XMM7);

      /* XMM7 carries the running 4-vector of this block row; every pass
         subtracts one 4x4 block (at v) applied to the 4 values at t[jdx]. */
4418:       while (nz--) {
4419:         PREFETCH_NTA(&v[16]);
4420:         jdx = 4*((unsigned int)(*vi++));
4421: 
4422:         /* 4x4 Matrix-Vector product with negative accumulation: */
4423:         SSE_INLINE_BEGIN_2(&t[jdx],v)
4424:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

4426:           /* First Column */
4427:           SSE_COPY_PS(XMM0,XMM6)
4428:           SSE_SHUFFLE(XMM0,XMM0,0x00)
4429:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4430:           SSE_SUB_PS(XMM7,XMM0)

4432:           /* Second Column */
4433:           SSE_COPY_PS(XMM1,XMM6)
4434:           SSE_SHUFFLE(XMM1,XMM1,0x55)
4435:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4436:           SSE_SUB_PS(XMM7,XMM1)

4438:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4439: 
4440:           /* Third Column */
4441:           SSE_COPY_PS(XMM2,XMM6)
4442:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4443:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4444:           SSE_SUB_PS(XMM7,XMM2)

4446:           /* Fourth Column */
4447:           SSE_COPY_PS(XMM3,XMM6)
4448:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4449:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4450:           SSE_SUB_PS(XMM7,XMM3)
4451:         SSE_INLINE_END_2
4452: 
4453:         v  += 16;
4454:       }
4455:       v    =  aa + 16*ai[++i];
4456:       PREFETCH_NTA(v);
4457:       STORE_PS(&t[idx],XMM7);
4458:     }

4460:     /* Backward solve the upper triangular factor.*/

4462:     idt  = 4*(n-1);
4463:     ai16 = 16*diag[n-1];
4464:     v    = aa + ai16 + 16;
    /* As above, i is stepped inside the body (the --i in diag[--i]); ai16
       always holds the offset of the current row's diagonal block until
       that statement replaces it with the next row's. */
4465:     for (i=n-1; i>=0;){
4466:       PREFETCH_NTA(&v[8]);
4467:       vi = aj + diag[i] + 1;
4468:       nz = ai[i+1] - diag[i] - 1;
4469: 
4470:       LOAD_PS(&t[idt],XMM7);

4472:       while (nz--) {
4473:         PREFETCH_NTA(&v[16]);
4474:         idx = 4*((unsigned int)(*vi++));

4476:         /* 4x4 Matrix-Vector Product with negative accumulation: */
4477:         SSE_INLINE_BEGIN_2(&t[idx],v)
4478:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

4480:           /* First Column */
4481:           SSE_COPY_PS(XMM0,XMM6)
4482:           SSE_SHUFFLE(XMM0,XMM0,0x00)
4483:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4484:           SSE_SUB_PS(XMM7,XMM0)

4486:           /* Second Column */
4487:           SSE_COPY_PS(XMM1,XMM6)
4488:           SSE_SHUFFLE(XMM1,XMM1,0x55)
4489:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4490:           SSE_SUB_PS(XMM7,XMM1)

4492:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4493: 
4494:           /* Third Column */
4495:           SSE_COPY_PS(XMM2,XMM6)
4496:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4497:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4498:           SSE_SUB_PS(XMM7,XMM2)

4500:           /* Fourth Column */
4501:           SSE_COPY_PS(XMM3,XMM6)
4502:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4503:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4504:           SSE_SUB_PS(XMM7,XMM3)
4505:         SSE_INLINE_END_2
4506:         v  += 16;
4507:       }
4508:       v    = aa + ai16;
      /* NOTE(review): when i is 0 this evaluates diag[-1].  The value then
         only feeds prefetch addresses and a v that is not dereferenced
         again, but the read itself is outside the array as indexed here --
         confirm. */
4509:       ai16 = 16*diag[--i];
4510:       PREFETCH_NTA(aa+ai16+16);
4511:       /* 
4512:          Scale the result by the diagonal 4x4 block, 
4513:          which was inverted as part of the factorization
4514:       */
4515:       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4516:         /* First Column */
4517:         SSE_COPY_PS(XMM0,XMM7)
4518:         SSE_SHUFFLE(XMM0,XMM0,0x00)
4519:         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

4521:         /* Second Column */
4522:         SSE_COPY_PS(XMM1,XMM7)
4523:         SSE_SHUFFLE(XMM1,XMM1,0x55)
4524:         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4525:         SSE_ADD_PS(XMM0,XMM1)

4527:         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4528: 
4529:         /* Third Column */
4530:         SSE_COPY_PS(XMM2,XMM7)
4531:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4532:         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4533:         SSE_ADD_PS(XMM0,XMM2)

4535:         /* Fourth Column */
4536:         SSE_COPY_PS(XMM3,XMM7)
4537:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4538:         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4539:         SSE_ADD_PS(XMM0,XMM3)

4541:         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4542:       SSE_INLINE_END_3

4544:       v    = aa + ai16 + 16;
4545:       idt -= 4;
4546:     }

4548:     /* Convert t from single precision back to double precision (inplace)*/
    /* t and x share storage, so the widening runs from the last block to the
       first and from component 3 down to 0.  Assuming MatScalar is float and
       PetscScalar is double (as the comments in this routine say), each
       store then lands only on float slots that were already consumed. */
4549:     idt = 4*(n-1);
4550:     for (i=n-1;i>=0;i--) {
4551:       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4552:       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4553:       PetscScalar *xtemp=&x[idt];
4554:       MatScalar   *ttemp=&t[idt];
4555:       xtemp[3] = (PetscScalar)ttemp[3];
4556:       xtemp[2] = (PetscScalar)ttemp[2];
4557:       xtemp[1] = (PetscScalar)ttemp[1];
4558:       xtemp[0] = (PetscScalar)ttemp[0];
4559:       idt -= 4;
4560:     }

4562:   } /* End of artificial scope. */
4563:   VecRestoreArray(bb,&b);
4564:   VecRestoreArray(xx,&x);
4565:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
4566:   SSE_SCOPE_END;
4567:   return(0);
4568: }

/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion - 4x4 block solve with the
   factor stored in A, no row or column permutation, written with the SSE
   macro layer in single precision.  Column indices are read as int.

   A  - factored matrix; a->i, a->j, a->diag and a->a are read directly
   bb - right-hand side
   xx - solution

   Both sweeps work on t, a MatScalar view of the array of xx; the last loop
   widens the result to PetscScalar in the same storage.

   NOTE(review): the index arrays are taken as plain int pointers, which
   looks like it assumes PetscInt is int -- confirm.
   NOTE(review): the values returned by VecGetArray(), VecRestoreArray() and
   PetscLogFlops() are discarded here, and bb is obtained with VecGetArray()
   although this routine only appears to read it.
*/
4572: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
4573: {
4574:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4575:   int            *aj=a->j;
4577:   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4578:   MatScalar      *aa=a->a;
4579:   PetscScalar    *x,*b;

4582:   SSE_SCOPE_BEGIN;
4583:   /* 
4584:      Note: This code currently uses demotion of double
4585:      to float when performing the mixed-mode computation.
4586:      This may not be numerically reasonable for all applications.
4587:   */
4588:   PREFETCH_NTA(aa+16*ai[1]);

4590:   VecGetArray(bb,&b);
4591:   VecGetArray(xx,&x);
4592:   {
4593:     /* x will first be computed in single precision then promoted inplace to double */
4594:     MatScalar *v,*t=(MatScalar *)x;
4595:     int       nz,i,idt,ai16;
4596:     int       jdx,idx;
4597:     int       *vi;
4598:     /* Forward solve the lower triangular factor. */

4600:     /* First block is the identity. */
4601:     idx  = 0;
4602:     CONVERT_DOUBLE4_FLOAT4(t,b);
4603:     v    =  aa + 16*ai[1];

    /* The loop header has no increment: i is advanced by the ++i at the
       bottom of the body, where v is moved to the start of the next row. */
4605:     for (i=1; i<n;) {
4606:       PREFETCH_NTA(&v[8]);
4607:       vi   =  aj      + ai[i];
4608:       nz   =  diag[i] - ai[i];
4609:       idx +=  4;

4611:       /* Demote RHS from double to float. */
4612:       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4613:       LOAD_PS(&t[idx],XMM7);

      /* XMM7 carries the running 4-vector of this block row; every pass
         subtracts one 4x4 block (at v) applied to the 4 values at t[jdx]. */
4615:       while (nz--) {
4616:         PREFETCH_NTA(&v[16]);
4617:         jdx = 4*(*vi++);
4618: /*          jdx = *vi++; */
4619: 
4620:         /* 4x4 Matrix-Vector product with negative accumulation: */
4621:         SSE_INLINE_BEGIN_2(&t[jdx],v)
4622:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

4624:           /* First Column */
4625:           SSE_COPY_PS(XMM0,XMM6)
4626:           SSE_SHUFFLE(XMM0,XMM0,0x00)
4627:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4628:           SSE_SUB_PS(XMM7,XMM0)

4630:           /* Second Column */
4631:           SSE_COPY_PS(XMM1,XMM6)
4632:           SSE_SHUFFLE(XMM1,XMM1,0x55)
4633:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4634:           SSE_SUB_PS(XMM7,XMM1)

4636:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4637: 
4638:           /* Third Column */
4639:           SSE_COPY_PS(XMM2,XMM6)
4640:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4641:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4642:           SSE_SUB_PS(XMM7,XMM2)

4644:           /* Fourth Column */
4645:           SSE_COPY_PS(XMM3,XMM6)
4646:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4647:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4648:           SSE_SUB_PS(XMM7,XMM3)
4649:         SSE_INLINE_END_2
4650: 
4651:         v  += 16;
4652:       }
4653:       v    =  aa + 16*ai[++i];
4654:       PREFETCH_NTA(v);
4655:       STORE_PS(&t[idx],XMM7);
4656:     }

4658:     /* Backward solve the upper triangular factor.*/

4660:     idt  = 4*(n-1);
4661:     ai16 = 16*diag[n-1];
4662:     v    = aa + ai16 + 16;
    /* As above, i is stepped inside the body (the --i in diag[--i]); ai16
       always holds the offset of the current row's diagonal block until
       that statement replaces it with the next row's. */
4663:     for (i=n-1; i>=0;){
4664:       PREFETCH_NTA(&v[8]);
4665:       vi = aj + diag[i] + 1;
4666:       nz = ai[i+1] - diag[i] - 1;
4667: 
4668:       LOAD_PS(&t[idt],XMM7);

4670:       while (nz--) {
4671:         PREFETCH_NTA(&v[16]);
4672:         idx = 4*(*vi++);
4673: /*          idx = *vi++; */

4675:         /* 4x4 Matrix-Vector Product with negative accumulation: */
4676:         SSE_INLINE_BEGIN_2(&t[idx],v)
4677:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

4679:           /* First Column */
4680:           SSE_COPY_PS(XMM0,XMM6)
4681:           SSE_SHUFFLE(XMM0,XMM0,0x00)
4682:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4683:           SSE_SUB_PS(XMM7,XMM0)

4685:           /* Second Column */
4686:           SSE_COPY_PS(XMM1,XMM6)
4687:           SSE_SHUFFLE(XMM1,XMM1,0x55)
4688:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4689:           SSE_SUB_PS(XMM7,XMM1)

4691:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4692: 
4693:           /* Third Column */
4694:           SSE_COPY_PS(XMM2,XMM6)
4695:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4696:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4697:           SSE_SUB_PS(XMM7,XMM2)

4699:           /* Fourth Column */
4700:           SSE_COPY_PS(XMM3,XMM6)
4701:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4702:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4703:           SSE_SUB_PS(XMM7,XMM3)
4704:         SSE_INLINE_END_2
4705:         v  += 16;
4706:       }
4707:       v    = aa + ai16;
      /* NOTE(review): when i is 0 this evaluates diag[-1].  The value then
         only feeds prefetch addresses and a v that is not dereferenced
         again, but the read itself is outside the array as indexed here --
         confirm. */
4708:       ai16 = 16*diag[--i];
4709:       PREFETCH_NTA(aa+ai16+16);
4710:       /* 
4711:          Scale the result by the diagonal 4x4 block, 
4712:          which was inverted as part of the factorization
4713:       */
4714:       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4715:         /* First Column */
4716:         SSE_COPY_PS(XMM0,XMM7)
4717:         SSE_SHUFFLE(XMM0,XMM0,0x00)
4718:         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

4720:         /* Second Column */
4721:         SSE_COPY_PS(XMM1,XMM7)
4722:         SSE_SHUFFLE(XMM1,XMM1,0x55)
4723:         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4724:         SSE_ADD_PS(XMM0,XMM1)

4726:         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4727: 
4728:         /* Third Column */
4729:         SSE_COPY_PS(XMM2,XMM7)
4730:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4731:         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4732:         SSE_ADD_PS(XMM0,XMM2)

4734:         /* Fourth Column */
4735:         SSE_COPY_PS(XMM3,XMM7)
4736:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4737:         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4738:         SSE_ADD_PS(XMM0,XMM3)

4740:         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4741:       SSE_INLINE_END_3

4743:       v    = aa + ai16 + 16;
4744:       idt -= 4;
4745:     }

4747:     /* Convert t from single precision back to double precision (inplace)*/
    /* t and x share storage, so the widening runs from the last block to the
       first and from component 3 down to 0.  Assuming MatScalar is float and
       PetscScalar is double (as the comments in this routine say), each
       store then lands only on float slots that were already consumed. */
4748:     idt = 4*(n-1);
4749:     for (i=n-1;i>=0;i--) {
4750:       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4751:       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4752:       PetscScalar *xtemp=&x[idt];
4753:       MatScalar   *ttemp=&t[idt];
4754:       xtemp[3] = (PetscScalar)ttemp[3];
4755:       xtemp[2] = (PetscScalar)ttemp[2];
4756:       xtemp[1] = (PetscScalar)ttemp[1];
4757:       xtemp[0] = (PetscScalar)ttemp[0];
4758:       idt -= 4;
4759:     }

4761:   } /* End of artificial scope. */
4762:   VecRestoreArray(bb,&b);
4763:   VecRestoreArray(xx,&x);
4764:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
4765:   SSE_SCOPE_END;
4766:   return(0);
4767: }

4769: #endif

4773: PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
4774: {
4775:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4776:   IS                iscol=a->col,isrow=a->row;
4777:   PetscErrorCode    ierr;
4778:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
4779:   PetscInt          i,nz,idx,idt,idc;
4780:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4781:   const MatScalar   *aa=a->a,*v;
4782:   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4783:   const PetscScalar *b;

4786:   VecGetArrayRead(bb,&b);
4787:   VecGetArray(xx,&x);
4788:   t  = a->solve_work;

4790:   ISGetIndices(isrow,&rout); r = rout;
4791:   ISGetIndices(iscol,&cout); c = cout + (n-1);

4793:   /* forward solve the lower triangular */
4794:   idx    = 3*(*r++);
4795:   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4796:   for (i=1; i<n; i++) {
4797:     v     = aa + 9*ai[i];
4798:     vi    = aj + ai[i];
4799:     nz    = diag[i] - ai[i];
4800:     idx   = 3*(*r++);
4801:     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4802:     while (nz--) {
4803:       idx   = 3*(*vi++);
4804:       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4805:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4806:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4807:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4808:       v += 9;
4809:     }
4810:     idx = 3*i;
4811:     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4812:   }
4813:   /* backward solve the upper triangular */
4814:   for (i=n-1; i>=0; i--){
4815:     v    = aa + 9*diag[i] + 9;
4816:     vi   = aj + diag[i] + 1;
4817:     nz   = ai[i+1] - diag[i] - 1;
4818:     idt  = 3*i;
4819:     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4820:     while (nz--) {
4821:       idx   = 3*(*vi++);
4822:       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4823:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4824:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4825:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4826:       v += 9;
4827:     }
4828:     idc = 3*(*c--);
4829:     v   = aa + 9*diag[i];
4830:     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4831:     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4832:     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4833:   }
4834:   ISRestoreIndices(isrow,&rout);
4835:   ISRestoreIndices(iscol,&cout);
4836:   VecRestoreArrayRead(bb,&b);
4837:   VecRestoreArray(xx,&x);
4838:   PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
4839:   return(0);
4840: }

4844: PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
4845: {
4846:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4847:   IS                iscol=a->col,isrow=a->row;
4848:   PetscErrorCode    ierr;
4849:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4850:   PetscInt          i,nz,idx,idt,idc,m;
4851:   const PetscInt    *r,*c,*rout,*cout;
4852:   const MatScalar   *aa=a->a,*v;
4853:   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4854:   const PetscScalar *b;

4857:   VecGetArrayRead(bb,&b);
4858:   VecGetArray(xx,&x);
4859:   t  = a->solve_work;

4861:   ISGetIndices(isrow,&rout); r = rout;
4862:   ISGetIndices(iscol,&cout); c = cout;

4864:   /* forward solve the lower triangular */
4865:   idx    = 3*r[0];
4866:   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4867:   for (i=1; i<n; i++) {
4868:     v     = aa + 9*ai[i];
4869:     vi    = aj + ai[i];
4870:     nz    = ai[i+1] - ai[i];
4871:     idx   = 3*r[i];
4872:     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4873:     for(m=0;m<nz;m++){
4874:       idx   = 3*vi[m];
4875:       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4876:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4877:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4878:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4879:       v += 9;
4880:     }
4881:     idx = 3*i;
4882:     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4883:   }
4884:   /* backward solve the upper triangular */
4885:   for (i=n-1; i>=0; i--){
4886:     v    = aa + 9*(adiag[i+1]+1);
4887:     vi   = aj + adiag[i+1]+1;
4888:     nz   = adiag[i] - adiag[i+1] - 1;
4889:     idt  = 3*i;
4890:     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4891:     for(m=0;m<nz;m++){
4892:       idx   = 3*vi[m];
4893:       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4894:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4895:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4896:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4897:       v += 9;
4898:     }
4899:     idc = 3*c[i];
4900:     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4901:     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4902:     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4903:   }
4904:   ISRestoreIndices(isrow,&rout);
4905:   ISRestoreIndices(iscol,&cout);
4906:   VecRestoreArrayRead(bb,&b);
4907:   VecRestoreArray(xx,&x);
4908:   PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
4909:   return(0);
4910: }

4912: /*
4913:       Special case where the matrix was ILU(0) factored in the natural
4914:    ordering. This eliminates the need for the column and row permutation.
4915: */
4918: PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4919: {
4920:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4921:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j;
4922:   PetscErrorCode    ierr;
4923:   const PetscInt    *diag = a->diag,*vi;
4924:   const MatScalar   *aa=a->a,*v;
4925:   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4926:   const PetscScalar *b;
4927:   PetscInt          jdx,idt,idx,nz,i;

4930:   VecGetArrayRead(bb,&b);
4931:   VecGetArray(xx,&x);

4933:   /* forward solve the lower triangular */
4934:   idx    = 0;
4935:   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
4936:   for (i=1; i<n; i++) {
4937:     v     =  aa      + 9*ai[i];
4938:     vi    =  aj      + ai[i];
4939:     nz    =  diag[i] - ai[i];
4940:     idx   +=  3;
4941:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
4942:     while (nz--) {
4943:       jdx   = 3*(*vi++);
4944:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4945:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4946:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4947:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4948:       v    += 9;
4949:     }
4950:     x[idx]   = s1;
4951:     x[1+idx] = s2;
4952:     x[2+idx] = s3;
4953:   }
4954:   /* backward solve the upper triangular */
4955:   for (i=n-1; i>=0; i--){
4956:     v    = aa + 9*diag[i] + 9;
4957:     vi   = aj + diag[i] + 1;
4958:     nz   = ai[i+1] - diag[i] - 1;
4959:     idt  = 3*i;
4960:     s1 = x[idt];  s2 = x[1+idt];
4961:     s3 = x[2+idt];
4962:     while (nz--) {
4963:       idx   = 3*(*vi++);
4964:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
4965:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4966:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4967:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4968:       v    += 9;
4969:     }
4970:     v        = aa +  9*diag[i];
4971:     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4972:     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4973:     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4974:   }

4976:   VecRestoreArrayRead(bb,&b);
4977:   VecRestoreArray(xx,&x);
4978:   PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
4979:   return(0);
4980: }

4984: PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
4985: {
4986:     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4987:     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4988:     PetscErrorCode    ierr;
4989:     PetscInt          i,k,nz,idx,jdx,idt;
4990:     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
4991:     const MatScalar   *aa=a->a,*v;
4992:     PetscScalar       *x;
4993:     const PetscScalar *b;
4994:     PetscScalar        s1,s2,s3,x1,x2,x3;

4997:     VecGetArrayRead(bb,&b);
4998:     VecGetArray(xx,&x);
4999:     /* forward solve the lower triangular */
5000:     idx    = 0;
5001:     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
5002:     for (i=1; i<n; i++) {
5003:        v    = aa + bs2*ai[i];
5004:        vi   = aj + ai[i];
5005:        nz   = ai[i+1] - ai[i];
5006:       idx   = bs*i;
5007:        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
5008:       for(k=0;k<nz;k++){
5009:          jdx   = bs*vi[k];
5010:           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
5011:           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
5012:           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
5013:           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
5014: 
5015:           v   +=  bs2;
5016:         }

5018:        x[idx]   = s1;
5019:        x[1+idx] = s2;
5020:        x[2+idx] = s3;
5021:     }
5022: 
5023:    /* backward solve the upper triangular */
5024:   for (i=n-1; i>=0; i--){
5025:     v   = aa + bs2*(adiag[i+1]+1);
5026:      vi  = aj + adiag[i+1]+1;
5027:      nz  = adiag[i] - adiag[i+1]-1;
5028:      idt = bs*i;
5029:      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
5030: 
5031:      for(k=0;k<nz;k++){
5032:        idx   = bs*vi[k];
5033:        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
5034:        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
5035:        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
5036:        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;

5038:         v   +=  bs2;
5039:     }
5040:     /* x = inv_diagonal*x */
5041:    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
5042:    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
5043:    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;

5045:   }

5047:   VecRestoreArrayRead(bb,&b);
5048:   VecRestoreArray(xx,&x);
5049:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
5050:   return(0);
5051: }

5055: PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
5056: {
5057:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
5058:   IS                iscol=a->col,isrow=a->row;
5059:   PetscErrorCode    ierr;
5060:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5061:   PetscInt          i,nz,idx,idt,idc;
5062:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
5063:   const MatScalar   *aa=a->a,*v;
5064:   PetscScalar       *x,s1,s2,x1,x2,*t;
5065:   const PetscScalar *b;

5068:   VecGetArrayRead(bb,&b);
5069:   VecGetArray(xx,&x);
5070:   t  = a->solve_work;

5072:   ISGetIndices(isrow,&rout); r = rout;
5073:   ISGetIndices(iscol,&cout); c = cout + (n-1);

5075:   /* forward solve the lower triangular */
5076:   idx    = 2*(*r++);
5077:   t[0] = b[idx]; t[1] = b[1+idx];
5078:   for (i=1; i<n; i++) {
5079:     v     = aa + 4*ai[i];
5080:     vi    = aj + ai[i];
5081:     nz    = diag[i] - ai[i];
5082:     idx   = 2*(*r++);
5083:     s1  = b[idx]; s2 = b[1+idx];
5084:     while (nz--) {
5085:       idx   = 2*(*vi++);
5086:       x1    = t[idx]; x2 = t[1+idx];
5087:       s1 -= v[0]*x1 + v[2]*x2;
5088:       s2 -= v[1]*x1 + v[3]*x2;
5089:       v += 4;
5090:     }
5091:     idx = 2*i;
5092:     t[idx] = s1; t[1+idx] = s2;
5093:   }
5094:   /* backward solve the upper triangular */
5095:   for (i=n-1; i>=0; i--){
5096:     v    = aa + 4*diag[i] + 4;
5097:     vi   = aj + diag[i] + 1;
5098:     nz   = ai[i+1] - diag[i] - 1;
5099:     idt  = 2*i;
5100:     s1 = t[idt]; s2 = t[1+idt];
5101:     while (nz--) {
5102:       idx   = 2*(*vi++);
5103:       x1    = t[idx]; x2 = t[1+idx];
5104:       s1 -= v[0]*x1 + v[2]*x2;
5105:       s2 -= v[1]*x1 + v[3]*x2;
5106:       v += 4;
5107:     }
5108:     idc = 2*(*c--);
5109:     v   = aa + 4*diag[i];
5110:     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
5111:     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
5112:   }
5113:   ISRestoreIndices(isrow,&rout);
5114:   ISRestoreIndices(iscol,&cout);
5115:   VecRestoreArrayRead(bb,&b);
5116:   VecRestoreArray(xx,&x);
5117:   PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
5118:   return(0);
5119: }

5123: PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
5124: {
5125:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
5126:   IS                iscol=a->col,isrow=a->row;
5127:   PetscErrorCode    ierr;
5128:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5129:   PetscInt          i,nz,idx,jdx,idt,idc,m;
5130:   const PetscInt    *r,*c,*rout,*cout;
5131:   const MatScalar   *aa=a->a,*v;
5132:   PetscScalar       *x,s1,s2,x1,x2,*t;
5133:   const PetscScalar *b;

5136:   VecGetArrayRead(bb,&b);
5137:   VecGetArray(xx,&x);
5138:   t  = a->solve_work;

5140:   ISGetIndices(isrow,&rout); r = rout;
5141:   ISGetIndices(iscol,&cout); c = cout;

5143:   /* forward solve the lower triangular */
5144:   idx    = 2*r[0];
5145:   t[0] = b[idx]; t[1] = b[1+idx];
5146:   for (i=1; i<n; i++) {
5147:     v     = aa + 4*ai[i];
5148:     vi    = aj + ai[i];
5149:     nz    = ai[i+1] - ai[i];
5150:     idx   = 2*r[i];
5151:     s1  = b[idx]; s2 = b[1+idx];
5152:     for(m=0;m<nz;m++){
5153:       jdx   = 2*vi[m];
5154:       x1    = t[jdx]; x2 = t[1+jdx];
5155:       s1 -= v[0]*x1 + v[2]*x2;
5156:       s2 -= v[1]*x1 + v[3]*x2;
5157:       v += 4;
5158:     }
5159:     idx = 2*i;
5160:     t[idx] = s1; t[1+idx] = s2;
5161:   }
5162:   /* backward solve the upper triangular */
5163:   for (i=n-1; i>=0; i--){
5164:     v    = aa + 4*(adiag[i+1]+1);
5165:     vi   = aj + adiag[i+1]+1;
5166:     nz   = adiag[i] - adiag[i+1] - 1;
5167:     idt  = 2*i;
5168:     s1 = t[idt]; s2 = t[1+idt];
5169:     for(m=0;m<nz;m++){
5170:       idx   = 2*vi[m];
5171:       x1    = t[idx]; x2 = t[1+idx];
5172:       s1 -= v[0]*x1 + v[2]*x2;
5173:       s2 -= v[1]*x1 + v[3]*x2;
5174:       v += 4;
5175:     }
5176:     idc = 2*c[i];
5177:     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
5178:     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
5179:   }
5180:   ISRestoreIndices(isrow,&rout);
5181:   ISRestoreIndices(iscol,&cout);
5182:   VecRestoreArrayRead(bb,&b);
5183:   VecRestoreArray(xx,&x);
5184:   PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
5185:   return(0);
5186: }

5188: /*
5189:       Special case where the matrix was ILU(0) factored in the natural
5190:    ordering. This eliminates the need for the column and row permutation.
5191: */
5194: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
5195: {
5196:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5197:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5198:   PetscErrorCode    ierr;
5199:   const MatScalar   *aa=a->a,*v;
5200:   PetscScalar       *x,s1,s2,x1,x2;
5201:   const PetscScalar *b;
5202:   PetscInt          jdx,idt,idx,nz,i;

5205:   VecGetArrayRead(bb,&b);
5206:   VecGetArray(xx,&x);

5208:   /* forward solve the lower triangular */
5209:   idx    = 0;
5210:   x[0]   = b[0]; x[1] = b[1];
5211:   for (i=1; i<n; i++) {
5212:     v     =  aa      + 4*ai[i];
5213:     vi    =  aj      + ai[i];
5214:     nz    =  diag[i] - ai[i];
5215:     idx   +=  2;
5216:     s1  =  b[idx];s2 = b[1+idx];
5217:     while (nz--) {
5218:       jdx   = 2*(*vi++);
5219:       x1    = x[jdx];x2 = x[1+jdx];
5220:       s1 -= v[0]*x1 + v[2]*x2;
5221:       s2 -= v[1]*x1 + v[3]*x2;
5222:       v    += 4;
5223:     }
5224:     x[idx]   = s1;
5225:     x[1+idx] = s2;
5226:   }
5227:   /* backward solve the upper triangular */
5228:   for (i=n-1; i>=0; i--){
5229:     v    = aa + 4*diag[i] + 4;
5230:     vi   = aj + diag[i] + 1;
5231:     nz   = ai[i+1] - diag[i] - 1;
5232:     idt  = 2*i;
5233:     s1 = x[idt];  s2 = x[1+idt];
5234:     while (nz--) {
5235:       idx   = 2*(*vi++);
5236:       x1    = x[idx];   x2 = x[1+idx];
5237:       s1 -= v[0]*x1 + v[2]*x2;
5238:       s2 -= v[1]*x1 + v[3]*x2;
5239:       v    += 4;
5240:     }
5241:     v        = aa +  4*diag[i];
5242:     x[idt]   = v[0]*s1 + v[2]*s2;
5243:     x[1+idt] = v[1]*s1 + v[3]*s2;
5244:   }

5246:   VecRestoreArrayRead(bb,&b);
5247:   VecRestoreArray(xx,&x);
5248:   PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
5249:   return(0);
5250: }

5254: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
5255: {
5256:     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5257:     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5258:     PetscInt          i,k,nz,idx,idt,jdx;
5259:     PetscErrorCode    ierr;
5260:     const MatScalar   *aa=a->a,*v;
5261:     PetscScalar       *x,s1,s2,x1,x2;
5262:     const PetscScalar *b;
5263: 
5265:     VecGetArrayRead(bb,&b);
5266:     VecGetArray(xx,&x);
5267:     /* forward solve the lower triangular */
5268:     idx    = 0;
5269:     x[0] = b[idx]; x[1] = b[1+idx];
5270:     for (i=1; i<n; i++) {
5271:         v   = aa + 4*ai[i];
5272:        vi   = aj + ai[i];
5273:        nz   = ai[i+1] - ai[i];
5274:        idx  = 2*i;
5275:        s1   = b[idx];s2 = b[1+idx];
5276:        PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
5277:        PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
5278:       for(k=0;k<nz;k++){
5279:          jdx   = 2*vi[k];
5280:           x1    = x[jdx];x2 = x[1+jdx];
5281:           s1   -= v[0]*x1 + v[2]*x2;
5282:           s2   -= v[1]*x1 + v[3]*x2;
5283:            v   +=  4;
5284:         }
5285:        x[idx]   = s1;
5286:        x[1+idx] = s2;
5287:     }
5288: 
5289:    /* backward solve the upper triangular */
5290:   for (i=n-1; i>=0; i--){
5291:      v   = aa + 4*(adiag[i+1]+1);
5292:      vi  = aj + adiag[i+1]+1;
5293:      nz  = adiag[i] - adiag[i+1]-1;
5294:      idt = 2*i;
5295:      s1 = x[idt];  s2 = x[1+idt];
5296:      PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
5297:      PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
5298:      for(k=0;k<nz;k++){
5299:       idx   = 2*vi[k];
5300:        x1    = x[idx];   x2 = x[1+idx];
5301:        s1 -= v[0]*x1 + v[2]*x2;
5302:        s2 -= v[1]*x1 + v[3]*x2;
5303:          v    += 4;
5304:     }
5305:     /* x = inv_diagonal*x */
5306:    x[idt]   = v[0]*s1 + v[2]*s2;
5307:    x[1+idt] = v[1]*s1 + v[3]*s2;
5308:   }

5310:   VecRestoreArrayRead(bb,&b);
5311:   VecRestoreArray(xx,&x);
5312:   PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
5313:   return(0);
5314: }

5318: PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
5319: {
5320:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
5321:   IS                iscol=a->col,isrow=a->row;
5322:   PetscErrorCode    ierr;
5323:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5324:   PetscInt          i,nz;
5325:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
5326:   const MatScalar   *aa=a->a,*v;
5327:   PetscScalar       *x,s1,*t;
5328:   const PetscScalar *b;

5331:   if (!n) return(0);

5333:   VecGetArrayRead(bb,&b);
5334:   VecGetArray(xx,&x);
5335:   t  = a->solve_work;

5337:   ISGetIndices(isrow,&rout); r = rout;
5338:   ISGetIndices(iscol,&cout); c = cout + (n-1);

5340:   /* forward solve the lower triangular */
5341:   t[0] = b[*r++];
5342:   for (i=1; i<n; i++) {
5343:     v     = aa + ai[i];
5344:     vi    = aj + ai[i];
5345:     nz    = diag[i] - ai[i];
5346:     s1  = b[*r++];
5347:     while (nz--) {
5348:       s1 -= (*v++)*t[*vi++];
5349:     }
5350:     t[i] = s1;
5351:   }
5352:   /* backward solve the upper triangular */
5353:   for (i=n-1; i>=0; i--){
5354:     v    = aa + diag[i] + 1;
5355:     vi   = aj + diag[i] + 1;
5356:     nz   = ai[i+1] - diag[i] - 1;
5357:     s1 = t[i];
5358:     while (nz--) {
5359:       s1 -= (*v++)*t[*vi++];
5360:     }
5361:     x[*c--] = t[i] = aa[diag[i]]*s1;
5362:   }

5364:   ISRestoreIndices(isrow,&rout);
5365:   ISRestoreIndices(iscol,&cout);
5366:   VecRestoreArrayRead(bb,&b);
5367:   VecRestoreArray(xx,&x);
5368:   PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);
5369:   return(0);
5370: }

5374: PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
5375: {
5376:   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
5377:   IS                iscol = a->col,isrow = a->row;
5378:   PetscErrorCode    ierr;
5379:   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz;
5380:   const PetscInt    *rout,*cout,*r,*c;
5381:   PetscScalar       *x,*tmp,sum;
5382:   const PetscScalar *b;
5383:   const MatScalar   *aa = a->a,*v;

5386:   if (!n) return(0);

5388:   VecGetArrayRead(bb,&b);
5389:   VecGetArray(xx,&x);
5390:   tmp  = a->solve_work;

5392:   ISGetIndices(isrow,&rout); r = rout;
5393:   ISGetIndices(iscol,&cout); c = cout;

5395:   /* forward solve the lower triangular */
5396:   tmp[0] = b[r[0]];
5397:   v      = aa;
5398:   vi     = aj;
5399:   for (i=1; i<n; i++) {
5400:     nz  = ai[i+1] - ai[i];
5401:     sum = b[r[i]];
5402:     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
5403:     tmp[i] = sum;
5404:     v += nz; vi += nz;
5405:   }

5407:   /* backward solve the upper triangular */
5408:   for (i=n-1; i>=0; i--){
5409:     v   = aa + adiag[i+1]+1;
5410:     vi  = aj + adiag[i+1]+1;
5411:     nz  = adiag[i]-adiag[i+1]-1;
5412:     sum = tmp[i];
5413:     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
5414:     x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */
5415:   }

5417:   ISRestoreIndices(isrow,&rout);
5418:   ISRestoreIndices(iscol,&cout);
5419:   VecRestoreArrayRead(bb,&b);
5420:   VecRestoreArray(xx,&x);
5421:   PetscLogFlops(2*a->nz - A->cmap->n);
5422:   return(0);
5423: }

5425: /*
5426:       Special case where the matrix was ILU(0) factored in the natural
5427:    ordering. This eliminates the need for the column and row permutation.
5428: */
5431: PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
5432: {
5433:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5434:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5435:   PetscErrorCode    ierr;
5436:   const MatScalar   *aa=a->a,*v;
5437:   PetscScalar       *x;
5438:   const PetscScalar *b;
5439:   PetscScalar       s1,x1;
5440:   PetscInt          jdx,idt,idx,nz,i;

5443:   VecGetArrayRead(bb,&b);
5444:   VecGetArray(xx,&x);

5446:   /* forward solve the lower triangular */
5447:   idx    = 0;
5448:   x[0]   = b[0];
5449:   for (i=1; i<n; i++) {
5450:     v     =  aa      + ai[i];
5451:     vi    =  aj      + ai[i];
5452:     nz    =  diag[i] - ai[i];
5453:     idx   +=  1;
5454:     s1  =  b[idx];
5455:     while (nz--) {
5456:       jdx   = *vi++;
5457:       x1    = x[jdx];
5458:       s1 -= v[0]*x1;
5459:       v    += 1;
5460:     }
5461:     x[idx]   = s1;
5462:   }
5463:   /* backward solve the upper triangular */
5464:   for (i=n-1; i>=0; i--){
5465:     v    = aa + diag[i] + 1;
5466:     vi   = aj + diag[i] + 1;
5467:     nz   = ai[i+1] - diag[i] - 1;
5468:     idt  = i;
5469:     s1 = x[idt];
5470:     while (nz--) {
5471:       idx   = *vi++;
5472:       x1    = x[idx];
5473:       s1 -= v[0]*x1;
5474:       v    += 1;
5475:     }
5476:     v        = aa +  diag[i];
5477:     x[idt]   = v[0]*s1;
5478:   }
5479:   VecRestoreArrayRead(bb,&b);
5480:   VecRestoreArray(xx,&x);
5481:   PetscLogFlops(2.0*(a->nz) - A->cmap->n);
5482:   return(0);
5483: }


5488: PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
5489: {
5490:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
5491:   PetscErrorCode    ierr;
5492:   const PetscInt    n = a->mbs,*ai = a->i,*aj = a->j,*adiag = a->diag,*vi;
5493:   PetscScalar       *x,sum;
5494:   const PetscScalar *b;
5495:   const MatScalar   *aa = a->a,*v;
5496:   PetscInt          i,nz;

5499:   if (!n) return(0);

5501:   VecGetArrayRead(bb,&b);
5502:   VecGetArray(xx,&x);

5504:   /* forward solve the lower triangular */
5505:   x[0] = b[0];
5506:   v    = aa;
5507:   vi   = aj;
5508:   for (i=1; i<n; i++) {
5509:     nz  = ai[i+1] - ai[i];
5510:     sum = b[i];
5511:     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
5512:     v  += nz;
5513:     vi += nz;
5514:     x[i] = sum;
5515:   }
5516: 
5517:   /* backward solve the upper triangular */
5518:   for (i=n-1; i>=0; i--){
5519:     v   = aa + adiag[i+1] + 1;
5520:     vi  = aj + adiag[i+1] + 1;
5521:     nz = adiag[i] - adiag[i+1]-1;
5522:     sum = x[i];
5523:     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
5524:     x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */
5525:   }
5526: 
5527:   PetscLogFlops(2.0*a->nz - A->cmap->n);
5528:   VecRestoreArrayRead(bb,&b);
5529:   VecRestoreArray(xx,&x);
5530:   return(0);
5531: }

5533: /* ----------------------------------------------------------------*/
5534: extern PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscBool );

5538: /*
5539:    This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes
5540: */
5541: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
5542: {
5543:   Mat             C=B;
5544:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
5545:   PetscErrorCode  ierr;
5546:   PetscInt        i,j,k,ipvt[15];
5547:   const PetscInt  n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj;
5548:   PetscInt        nz,nzL,row;
5549:   MatScalar       *rtmp,*pc,*mwork,*pv,*vv,work[225];
5550:   const MatScalar *v,*aa=a->a;
5551:   PetscInt        bs2 = a->bs2,bs=A->rmap->bs,flg;
5552:   PetscInt        sol_ver;


5556:   PetscOptionsGetInt(((PetscObject)A)->prefix,"-sol_ver",&sol_ver,PETSC_NULL);

5558:   /* generate work space needed by the factorization */
5559:   PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);
5560:   PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));

5562:   for (i=0; i<n; i++){
5563:     /* zero rtmp */
5564:     /* L part */
5565:     nz    = bi[i+1] - bi[i];
5566:     bjtmp = bj + bi[i];
5567:     for  (j=0; j<nz; j++){
5568:       PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
5569:     }

5571:     /* U part */
5572:     nz = bdiag[i] - bdiag[i+1];
5573:     bjtmp = bj + bdiag[i+1]+1;
5574:     for  (j=0; j<nz; j++){
5575:       PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
5576:     }
5577: 
5578:     /* load in initial (unfactored row) */
5579:     nz    = ai[i+1] - ai[i];
5580:     ajtmp = aj + ai[i];
5581:     v     = aa + bs2*ai[i];
5582:     for (j=0; j<nz; j++) {
5583:       PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));
5584:     }

5586:     /* elimination */
5587:     bjtmp = bj + bi[i];
5588:     nzL   = bi[i+1] - bi[i];
5589:     for(k=0;k < nzL;k++) {
5590:       row = bjtmp[k];
5591:       pc = rtmp + bs2*row;
5592:       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5593:       if (flg) {
5594:         pv = b->a + bs2*bdiag[row];
5595:         PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork);
5596:         /*PetscKernel_A_gets_A_times_B_15(pc,pv,mwork);*/
5597:         pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5598:         pv = b->a + bs2*(bdiag[row+1]+1);
5599:         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5600:         for (j=0; j<nz; j++) {
5601:           vv   = rtmp + bs2*pj[j];
5602:           PetscKernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv);
5603:           /* PetscKernel_A_gets_A_minus_B_times_C_15(vv,pc,pv); */
5604:           pv  += bs2;
5605:         }
5606:         PetscLogFlops(2*bs2*bs*(nz+1)-bs2); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5607:       }
5608:     }

5610:     /* finished row so stick it into b->a */
5611:     /* L part */
5612:     pv   = b->a + bs2*bi[i] ;
5613:     pj   = b->j + bi[i] ;
5614:     nz   = bi[i+1] - bi[i];
5615:     for (j=0; j<nz; j++) {
5616:       PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
5617:     }

5619:     /* Mark diagonal and invert diagonal for simplier triangular solves */
5620:     pv   = b->a + bs2*bdiag[i];
5621:     pj   = b->j + bdiag[i];
5622:     PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));
5623:     /* PetscKernel_A_gets_inverse_A(bs,pv,pivots,work); */
5624:     PetscKernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftamount);
5625: 
5626:     /* U part */
5627:     pv = b->a + bs2*(bdiag[i+1]+1);
5628:     pj = b->j + bdiag[i+1]+1;
5629:     nz = bdiag[i] - bdiag[i+1] - 1;
5630:     for (j=0; j<nz; j++){
5631:       PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
5632:     }
5633:   }

5635:   PetscFree2(rtmp,mwork);
5636:   C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1;
5637:   C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering;
5638:   C->assembled = PETSC_TRUE;
5639:   PetscLogFlops(1.333333333333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
5640:   return(0);
5641: }

5645: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info)
5646: {
5647:   Mat            C=B;
5648:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
5649:   IS             isrow = b->row,isicol = b->icol;
5651:   const PetscInt *r,*ic;
5652:   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
5653:   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
5654:   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
5655:   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
5656:   MatScalar      *v_work;
5657:   PetscBool      col_identity,row_identity,both_identity;

5660:   ISGetIndices(isrow,&r);
5661:   ISGetIndices(isicol,&ic);
5662: 
5663:   PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);
5664:   PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));

5666:   /* generate work space needed by dense LU factorization */
5667:   PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);

5669:   for (i=0; i<n; i++){
5670:     /* zero rtmp */
5671:     /* L part */
5672:     nz    = bi[i+1] - bi[i];
5673:     bjtmp = bj + bi[i];
5674:     for  (j=0; j<nz; j++){
5675:       PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
5676:     }

5678:     /* U part */
5679:     nz = bdiag[i] - bdiag[i+1];
5680:     bjtmp = bj + bdiag[i+1]+1;
5681:     for  (j=0; j<nz; j++){
5682:       PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
5683:     }
5684: 
5685:     /* load in initial (unfactored row) */
5686:     nz    = ai[r[i]+1] - ai[r[i]];
5687:     ajtmp = aj + ai[r[i]];
5688:     v     = aa + bs2*ai[r[i]];
5689:     for (j=0; j<nz; j++) {
5690:       PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));
5691:     }

5693:     /* elimination */
5694:     bjtmp = bj + bi[i];
5695:     nzL   = bi[i+1] - bi[i];
5696:     for(k=0;k < nzL;k++) {
5697:       row = bjtmp[k];
5698:       pc = rtmp + bs2*row;
5699:       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5700:       if (flg) {
5701:         pv         = b->a + bs2*bdiag[row];
5702:         PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
5703:         pj         = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5704:         pv         = b->a + bs2*(bdiag[row+1]+1);
5705:         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5706:         for (j=0; j<nz; j++) {
5707:           PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
5708:         }
5709:         PetscLogFlops(2*bs2*bs*(nz+1)-bs2); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5710:       }
5711:     }

5713:     /* finished row so stick it into b->a */
5714:     /* L part */
5715:     pv   = b->a + bs2*bi[i] ;
5716:     pj   = b->j + bi[i] ;
5717:     nz   = bi[i+1] - bi[i];
5718:     for (j=0; j<nz; j++) {
5719:       PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
5720:     }

5722:     /* Mark diagonal and invert diagonal for simplier triangular solves */
5723:     pv  = b->a + bs2*bdiag[i];
5724:     pj  = b->j + bdiag[i];
5725:     /* if (*pj != i)SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
5726:     PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));
5727:     PetscKernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);
5728: 
5729:     /* U part */
5730:     pv = b->a + bs2*(bdiag[i+1]+1);
5731:     pj = b->j + bdiag[i+1]+1;
5732:     nz = bdiag[i] - bdiag[i+1] - 1;
5733:     for (j=0; j<nz; j++){
5734:       PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
5735:     }
5736:   }

5738:   PetscFree(rtmp);
5739:   PetscFree3(v_work,mwork,v_pivots);
5740:   ISRestoreIndices(isicol,&ic);
5741:   ISRestoreIndices(isrow,&r);

5743:   ISIdentity(isrow,&row_identity);
5744:   ISIdentity(isicol,&col_identity);
5745:   both_identity = (PetscBool) (row_identity && col_identity);
5746:   if (both_identity){
5747:     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
5748:   } else {
5749:     C->ops->solve = MatSolve_SeqBAIJ_N;
5750:   }
5751:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N;
5752: 
5753:   C->assembled = PETSC_TRUE;
5754:   PetscLogFlops(1.333333333333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
5755:   return(0);
5756: }

5758: /* 
5759:    ilu(0) with natural ordering under new data structure.
5760:    See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description
5761:    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace().
5762: */

5766: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5767: {
5768: 
5769:   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5770:   PetscErrorCode     ierr;
5771:   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
5772:   PetscInt           i,j,nz,*bi,*bj,*bdiag,bi_temp;

5775:   MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);
5776:   b    = (Mat_SeqBAIJ*)(fact)->data;
5777: 
5778:   /* allocate matrix arrays for new data structure */
5779:   PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);
5780:   PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));
5781:   b->singlemalloc    = PETSC_TRUE;
5782:   b->free_a          = PETSC_TRUE;
5783:   b->free_ij         = PETSC_TRUE;
5784:   fact->preallocated = PETSC_TRUE;
5785:   fact->assembled    = PETSC_TRUE;
5786:   if (!b->diag){
5787:     PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);
5788:     PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));
5789:   }
5790:   bdiag = b->diag;
5791: 
5792:   if (n > 0) {
5793:     PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));
5794:   }
5795: 
5796:   /* set bi and bj with new data structure */
5797:   bi = b->i;
5798:   bj = b->j;

5800:   /* L part */
5801:   bi[0] = 0;
5802:   for (i=0; i<n; i++){
5803:     nz = adiag[i] - ai[i];
5804:     bi[i+1] = bi[i] + nz;
5805:     aj = a->j + ai[i];
5806:     for (j=0; j<nz; j++){
5807:       *bj = aj[j]; bj++;
5808:     }
5809:   }
5810: 
5811:   /* U part */
5812:   bi_temp = bi[n];
5813:   bdiag[n] = bi[n]-1;
5814:   for (i=n-1; i>=0; i--){
5815:     nz = ai[i+1] - adiag[i] - 1;
5816:     bi_temp = bi_temp + nz + 1;
5817:     aj = a->j + adiag[i] + 1;
5818:     for (j=0; j<nz; j++){
5819:       *bj = aj[j]; bj++;
5820:     }
5821:     /* diag[i] */
5822:     *bj = i; bj++;
5823:     bdiag[i] = bi_temp - 1;
5824:   }
5825:   return(0);
5826: }

5830: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5831: {
5832:   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5833:   IS                 isicol;
5834:   PetscErrorCode     ierr;
5835:   const PetscInt     *r,*ic;
5836:   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
5837:   PetscInt           *bi,*cols,nnz,*cols_lvl;
5838:   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
5839:   PetscInt           i,levels,diagonal_fill;
5840:   PetscBool          col_identity,row_identity,both_identity;
5841:   PetscReal          f;
5842:   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
5843:   PetscBT            lnkbt;
5844:   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
5845:   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
5846:   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
5847:   PetscBool          missing;
5848:   PetscInt           bs=A->rmap->bs,bs2=a->bs2;

5851:   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
5852:   if (bs>1){  /* check shifttype */
5853:     if (info->shifttype == MAT_SHIFT_NONZERO || info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE)
5854:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix");
5855:   }

5857:   MatMissingDiagonal(A,&missing,&d);
5858:   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);

5860:   f             = info->fill;
5861:   levels        = (PetscInt)info->levels;
5862:   diagonal_fill = (PetscInt)info->diagonal_fill;
5863:   ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);

5865:   ISIdentity(isrow,&row_identity);
5866:   ISIdentity(iscol,&col_identity);
5867:   both_identity = (PetscBool) (row_identity && col_identity);
5868: 
5869:   if (!levels && both_identity) {
5870:     /* special case: ilu(0) with natural ordering */
5871:     MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);
5872:     MatSeqBAIJSetNumericFactorization(fact,both_identity);

5874:     fact->factortype               = MAT_FACTOR_ILU;
5875:     (fact)->info.factor_mallocs    = 0;
5876:     (fact)->info.fill_ratio_given  = info->fill;
5877:     (fact)->info.fill_ratio_needed = 1.0;
5878:     b                = (Mat_SeqBAIJ*)(fact)->data;
5879:     b->row           = isrow;
5880:     b->col           = iscol;
5881:     b->icol          = isicol;
5882:     PetscObjectReference((PetscObject)isrow);
5883:     PetscObjectReference((PetscObject)iscol);
5884:     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5885:     PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);
5886:     return(0);
5887:   }
5888: 
5889:   ISGetIndices(isrow,&r);
5890:   ISGetIndices(isicol,&ic);
5891: 
5892:   /* get new row pointers */
5893:   PetscMalloc((n+1)*sizeof(PetscInt),&bi);
5894:   bi[0] = 0;
5895:   /* bdiag is location of diagonal in factor */
5896:   PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);
5897:   bdiag[0]  = 0;

5899:   PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);

5901:   /* create a linked list for storing column indices of the active row */
5902:   nlnk = n + 1;
5903:   PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);
5904: 
5905:   /* initial FreeSpace size is f*(ai[n]+1) */
5906:   PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);
5907:   current_space = free_space;
5908:   PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);
5909:   current_space_lvl = free_space_lvl;
5910: 
5911:   for (i=0; i<n; i++) {
5912:     nzi = 0;
5913:     /* copy current row into linked list */
5914:     nnz  = ai[r[i]+1] - ai[r[i]];
5915:     if (!nnz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
5916:     cols = aj + ai[r[i]];
5917:     lnk[i] = -1; /* marker to indicate if diagonal exists */
5918:     PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);
5919:     nzi += nlnk;

5921:     /* make sure diagonal entry is included */
5922:     if (diagonal_fill && lnk[i] == -1) {
5923:       fm = n;
5924:       while (lnk[fm] < i) fm = lnk[fm];
5925:       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
5926:       lnk[fm]    = i;
5927:       lnk_lvl[i] = 0;
5928:       nzi++; dcount++;
5929:     }

5931:     /* add pivot rows into the active row */
5932:     nzbd = 0;
5933:     prow = lnk[n];
5934:     while (prow < i) {
5935:       nnz      = bdiag[prow];
5936:       cols     = bj_ptr[prow] + nnz + 1;
5937:       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
5938:       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
5939:       PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);
5940:       nzi += nlnk;
5941:       prow = lnk[prow];
5942:       nzbd++;
5943:     }
5944:     bdiag[i] = nzbd;
5945:     bi[i+1]  = bi[i] + nzi;

5947:     /* if free space is not available, make more free space */
5948:     if (current_space->local_remaining<nzi) {
5949:       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
5950:       PetscFreeSpaceGet(nnz,&current_space);
5951:       PetscFreeSpaceGet(nnz,&current_space_lvl);
5952:       reallocs++;
5953:     }

5955:     /* copy data into free_space and free_space_lvl, then initialize lnk */
5956:     PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);
5957:     bj_ptr[i]    = current_space->array;
5958:     bjlvl_ptr[i] = current_space_lvl->array;

5960:     /* make sure the active row i has diagonal entry */
5961:     if (*(bj_ptr[i]+bdiag[i]) != i) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);

5963:     current_space->array           += nzi;
5964:     current_space->local_used      += nzi;
5965:     current_space->local_remaining -= nzi;
5966:     current_space_lvl->array           += nzi;
5967:     current_space_lvl->local_used      += nzi;
5968:     current_space_lvl->local_remaining -= nzi;
5969:   }
5970: 
5971:   ISRestoreIndices(isrow,&r);
5972:   ISRestoreIndices(isicol,&ic);
5973: 
5974:   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
5975:   PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);
5976:   PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);
5977: 
5978:   PetscIncompleteLLDestroy(lnk,lnkbt);
5979:   PetscFreeSpaceDestroy(free_space_lvl);
5980:   PetscFree2(bj_ptr,bjlvl_ptr);

5982: #if defined(PETSC_USE_INFO)
5983:   {
5984:     PetscReal af = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
5985:     PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);
5986:     PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);
5987:     PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);
5988:     PetscInfo(A,"for best performance.\n");
5989:     if (diagonal_fill) {
5990:       PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);
5991:     }
5992:   }
5993: #endif

5995:   /* put together the new matrix */
5996:   MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);
5997:   PetscLogObjectParent(fact,isicol);
5998:   b = (Mat_SeqBAIJ*)(fact)->data;
5999:   b->free_a       = PETSC_TRUE;
6000:   b->free_ij      = PETSC_TRUE;
6001:   b->singlemalloc = PETSC_FALSE;
6002:   PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);
6003:   b->j          = bj;
6004:   b->i          = bi;
6005:   b->diag       = bdiag;
6006:   b->free_diag  = PETSC_TRUE;
6007:   b->ilen       = 0;
6008:   b->imax       = 0;
6009:   b->row        = isrow;
6010:   b->col        = iscol;
6011:   PetscObjectReference((PetscObject)isrow);
6012:   PetscObjectReference((PetscObject)iscol);
6013:   b->icol       = isicol;
6014:   PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);
6015:   /* In b structure:  Free imax, ilen, old a, old j.  
6016:      Allocate bdiag, solve_work, new a, new j */
6017:   PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));
6018:   b->maxnz = b->nz = bdiag[0]+1;
6019:   fact->info.factor_mallocs    = reallocs;
6020:   fact->info.fill_ratio_given  = f;
6021:   fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
6022:   MatSeqBAIJSetNumericFactorization(fact,both_identity);
6023:   return(0);
6024: }

6026: /*
6027:      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
6028:    except that the data structure of Mat_SeqAIJ is slightly different.
6029:    Not a good example of code reuse.
6030: */
/*
      MatILUFactorSymbolic_SeqBAIJ_inplace - builds the block nonzero pattern of a
      level-of-fill ILU factor of A and installs it in fact.  Only the pattern
      (row pointers, column indices, diagonal positions) is computed; storage for
      the values is allocated but nothing is written to it in this routine.

      Parameters:
        fact  - matrix that receives the pattern; its Mat_SeqBAIJ data is filled in
        A     - matrix to factor; a->i and a->j are read
        isrow - row permutation; r[prow] selects the row of A used for row prow
        iscol - column permutation; the IS produced from it by ISInvertPermutation
                (isicol) supplies ic[], which maps the column indices of A
        info  - fill, levels, diagonal_fill and pivotinblocks are read

      Returns 0 on the paths that reach a return statement.  Errors are reported
      through SETERRQn for a missing diagonal entry in A, for an empty row, and
      for a factored row that has no diagonal entry.

      NOTE(review): in this listing the PETSc calls appear without any check of
      their return codes and no `ierr` is declared - confirm against the
      upstream source before relying on the error handling shown here.
*/
6033: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
6034: {
6035:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
6036:   IS             isicol;
6038:   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
6039:   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
6040:   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
6041:   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
6042:   PetscBool      col_identity,row_identity,both_identity,flg;
6043:   PetscReal      f;

        /* stop early when A reports a missing diagonal entry (dd is the offending row) */
6046:   MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);
6047:   if (flg) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
6048: 
        /* f scales the initial guess for the size of the column-index storage (see jmax below) */
6049:   f             = info->fill;
6050:   levels        = (PetscInt)info->levels;
6051:   diagonal_fill = (PetscInt)info->diagonal_fill;
        /* isicol is created here and later stored in b->icol on both paths */
6052:   ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);

6054:   ISIdentity(isrow,&row_identity);
6055:   ISIdentity(iscol,&col_identity);
6056:   both_identity = (PetscBool) (row_identity && col_identity);

6058:   if (!levels && both_identity) {  /* special case copy the nonzero structure */
6059:     MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);
6060:     MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);

          /* NOTE(review): factortype is assigned only on this path in this routine;
             confirm that the general path gets it set elsewhere */
6062:     fact->factortype = MAT_FACTOR_ILU;
6063:     b            = (Mat_SeqBAIJ*)fact->data;
          /* b keeps pointers to the caller's index sets, so a reference is taken on each */
6064:     b->row       = isrow;
6065:     b->col       = iscol;
6066:     PetscObjectReference((PetscObject)isrow);
6067:     PetscObjectReference((PetscObject)iscol);
6068:     b->icol      = isicol;
6069:     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
          /* solve_work holds (n+1)*bs scalars */
6070:     PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);
6071:     return(0);
6072:   }

6074:   /* general case perform the symbolic factorization */
6075:     ISGetIndices(isrow,&r);
6076:     ISGetIndices(isicol,&ic);

6078:     /* get new row pointers */
6079:     PetscMalloc((n+1)*sizeof(PetscInt),&ainew);
6080:     ainew[0] = 0;
6081:     /* don't know how many column pointers are needed so estimate */
6082:     jmax = (PetscInt)(f*ai[n] + 1);
6083:     PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);
6084:     /* ajfill is level of fill for each fill entry */
6085:     PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);
6086:     /* fill is a linked list of nonzeros in active row */
          /* the list is threaded through an array: fill[n] is the head, fill[k] is the
             entry that follows k, and the value n marks the end of the list */
6087:     PetscMalloc((n+1)*sizeof(PetscInt),&fill);
6088:     /* im is level for each filled value */
6089:     PetscMalloc((n+1)*sizeof(PetscInt),&im);
6090:     /* dloc is location of diagonal in factor */
          /* inside the row loop dloc[row] is an offset within the row (the number of
             entries before the diagonal); it is made absolute after the loop */
6091:     PetscMalloc((n+1)*sizeof(PetscInt),&dloc);
6092:     dloc[0]  = 0;
6093:     for (prow=0; prow<n; prow++) {

6095:       /* copy prow into linked list */
6096:       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
6097:       if (!nz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
6098:       xi         = aj + ai[r[prow]];
            /* start from an empty list */
6099:       fill[n]    = n;
6100:       fill[prow] = -1; /* marker for diagonal entry */
            /* insert every column of row r[prow], mapped through ic[], at its sorted
               position (each search starts at the head); entries of A get level 0 */
6101:       while (nz--) {
6102:         fm  = n;
6103:         idx = ic[*xi++];
6104:         do {
6105:           m  = fm;
6106:           fm = fill[m];
6107:         } while (fm < idx);
6108:         fill[m]   = idx;
6109:         fill[idx] = fm;
6110:         im[idx]   = 0;
6111:       }

6113:       /* make sure diagonal entry is included */
            /* fill[prow] still holds the -1 marker only if no entry landed on the diagonal */
6114:       if (diagonal_fill && fill[prow] == -1) {
6115:         fm = n;
6116:         while (fill[fm] < prow) fm = fill[fm];
6117:         fill[prow] = fill[fm];  /* insert diagonal into linked list */
6118:         fill[fm]   = prow;
6119:         im[prow]   = 0;
6120:         nzf++;
6121:         dcount++;
6122:       }

            /* visit, in increasing order, each list entry `row` that lies left of the
               diagonal and merge into the list the part of factored row `row` that lies
               after its diagonal; nzi counts the entries left of the diagonal */
6124:       nzi = 0;
6125:       row = fill[n];
6126:       while (row < prow) {
              /* fill created through `row` is one level deeper than the (prow,row) entry */
6127:         incrlev = im[row] + 1;
6128:         nz      = dloc[row];
              /* xi/flev point just past the diagonal of row `row`; nnz entries remain */
6129:         xi      = ajnew  + ainew[row] + nz + 1;
6130:         flev    = ajfill + ainew[row] + nz + 1;
6131:         nnz     = ainew[row+1] - ainew[row] - nz - 1;
              /* stored rows were written from the sorted list, so their columns are
                 ascending and the search position fm only moves forward from `row` */
6132:         fm      = row;
6133:         while (nnz-- > 0) {
6134:           idx = *xi++;
                /* drop candidates whose level would exceed the requested number of levels */
6135:           if (*flev + incrlev > levels) {
6136:             flev++;
6137:             continue;
6138:           }
6139:           do {
6140:             m  = fm;
6141:             fm = fill[m];
6142:           } while (fm < idx);
6143:           if (fm != idx) {
                  /* new fill entry: link idx between m and fm and record its level */
6144:             im[idx]   = *flev + incrlev;
6145:             fill[m]   = idx;
6146:             fill[idx] = fm;
6147:             fm        = idx;
6148:             nzf++;
6149:           } else {
                  /* already present: keep the smaller of the two levels */
6150:             if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
6151:           }
6152:           flev++;
6153:         }
6154:         row = fill[row];
6155:         nzi++;
6156:       }
6157:       /* copy new filled row into permanent storage */
6158:       ainew[prow+1] = ainew[prow] + nzf;
6159:       if (ainew[prow+1] > jmax) {

6161:         /* estimate how much additional space we will need */
6162:         /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
6163:         /* just double the memory each time */
6164:         PetscInt maxadd = jmax;
6165:         /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
              /* when doubling would not even hold this row, size the increment from
                 the current row length and the number of rows still to process */
6166:         if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
6167:         jmax += maxadd;

6169:         /* allocate a longer ajnew and ajfill */
              /* only the first ainew[prow] entries have been written, so only those are copied */
6170:         PetscMalloc(jmax*sizeof(PetscInt),&xitmp);
6171:         PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));
6172:         PetscFree(ajnew);
6173:         ajnew = xitmp;
6174:         PetscMalloc(jmax*sizeof(PetscInt),&xitmp);
6175:         PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));
6176:         PetscFree(ajfill);
6177:         ajfill = xitmp;
6178:         reallocate++; /* count how many reallocations are needed */
6179:       }
            /* walk the list from its head and append the nzf columns and their levels */
6180:       xitmp       = ajnew + ainew[prow];
6181:       flev        = ajfill + ainew[prow];
6182:       dloc[prow]  = nzi;
6183:       fm          = fill[n];
6184:       while (nzf--) {
6185:         *xitmp++ = fm;
6186:         *flev++ = im[fm];
6187:         fm      = fill[fm];
6188:       }
6189:       /* make sure row has diagonal entry */
            /* dloc[prow] entries precede the diagonal, so the diagonal has to sit at that offset */
            /* NOTE(review): if every entry of the row lies left of the diagonal, the index
               below equals ainew[prow+1], one slot past the row just written (and possibly
               equal to jmax) - confirm whether a bounds guard is needed */
6190:       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
6191:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
6192:     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
6193:       }
6194:     }
          /* the levels and the linked-list work arrays are not needed past this point */
6195:     PetscFree(ajfill);
6196:     ISRestoreIndices(isrow,&r);
6197:     ISRestoreIndices(isicol,&ic);
6198:     PetscFree(fill);
6199:     PetscFree(im);

6201: #if defined(PETSC_USE_INFO)
6202:     {
            /* af is the ratio of entries in the factor pattern to entries in A */
6203:       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
6204:       PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);
6205:       PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
6206:       PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);
6207:       PetscInfo(A,"for best performance.\n");
6208:       if (diagonal_fill) {
6209:         PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);
6210:       }
6211:     }
6212: #endif

6214:     /* put together the new matrix */
6215:     MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);
6216:     PetscLogObjectParent(fact,isicol);
6217:     b    = (Mat_SeqBAIJ*)fact->data;
6218:     b->free_a       = PETSC_TRUE;
6219:     b->free_ij      = PETSC_TRUE;
6220:     b->singlemalloc = PETSC_FALSE;
          /* room for bs2 values per block of the pattern; no values are written in this routine */
6221:     PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);
6222:     b->j          = ajnew;
6223:     b->i          = ainew;
          /* turn the per-row diagonal offsets into absolute positions in b->j */
6224:     for (i=0; i<n; i++) dloc[i] += ainew[i];
6225:     b->diag       = dloc;
6226:     b->free_diag  = PETSC_TRUE;
6227:     b->ilen       = 0;
6228:     b->imax       = 0;
          /* b keeps pointers to the caller's index sets, so a reference is taken on each */
6229:     b->row        = isrow;
6230:     b->col        = iscol;
6231:     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
6232:     PetscObjectReference((PetscObject)isrow);
6233:     PetscObjectReference((PetscObject)iscol);
6234:     b->icol       = isicol;
6235:     PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);
6236:     /* In b structure:  Free imax, ilen, old a, old j.  
6237:        Allocate dloc, solve_work, new a, new j */
6238:     PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));
6239:     b->maxnz          = b->nz = ainew[n];

6241:     fact->info.factor_mallocs    = reallocate;
6242:     fact->info.fill_ratio_given  = f;
          /* the row loop above ends with prow == n, so ai[prow] is ai[n] here */
6243:     fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);

        /* presumably selects the numeric factorization routine for fact - confirm in the callee */
6245:   MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);
6246:   return(0);
6247: }

/*
      MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE - clears the setunfactored
      entry in the operations table of A.

      Parameters:
        A - matrix whose A->ops->setunfactored is set to PETSC_NULL

      Always returns 0.  Nothing else in A is touched: the loop that would undo
      the column scaling (dividing each column index by 4) is disabled in this
      routine, so the column indices are left as they are.
*/
6251: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
6252: {
        /* A push/pop mechanism for the ops entry would be the proper way to do
           this, but no such mechanism exists yet. */
6261:   A->ops->setunfactored = PETSC_NULL;
6262:   return 0;
6263: }

/*
      MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj - widens the column
      indices of A from unsigned short back to PetscInt, in place, and clears
      the setunfactored entry in the operations table of A.

      Parameters:
        A - matrix whose data is a Mat_SeqBAIJ; the first nz entries of its j
            array are rewritten

      On entry the j array is read through an unsigned short pointer that
      aliases it; on exit j[k] holds the same value as a PetscInt for every
      k in 0 .. nz-1.  Always returns 0.
*/
6267: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
6268: {
6269:   Mat_SeqBAIJ          *baij   = (Mat_SeqBAIJ *)A->data;
6270:   PetscInt             *wide   = baij->j,k;
6271:   const unsigned short *narrow = (unsigned short *)wide;
        /* The original author left open whether this widening is really necessary. */
        /* Count down from the last index: assuming PetscInt is wider than unsigned
           short, wide slot k never overlaps a narrow entry with a smaller index,
           so nothing is overwritten before it has been read. */
6274:   for (k = baij->nz; k--; ) {
          /* zero-extend to unsigned int first, then convert to signed */
6275:     wide[k] = (int)((unsigned int)narrow[k]);
6276:   }
6277:   A->ops->setunfactored = PETSC_NULL;
6278:   return 0;
6279: }