Actual source code: sbaijfact2.c

petsc-3.6.1 2015-08-06
Report Typos and Errors
  2: /*
  3:     Factorization code for SBAIJ format.
  4: */

  6: #include <../src/mat/impls/sbaij/seq/sbaij.h>
  7: #include <../src/mat/impls/baij/seq/baij.h>
  8: #include <petsc/private/kernels/blockinvert.h>

 12: PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
 13: {
 14:   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
 15:   IS                isrow=a->row;
 16:   PetscInt          mbs  =a->mbs,*ai=a->i,*aj=a->j;
 17:   PetscErrorCode    ierr;
 18:   const PetscInt    *r;
 19:   PetscInt          nz,*vj,k,idx,k1;
 20:   PetscInt          bs =A->rmap->bs,bs2 = a->bs2;
 21:   const MatScalar   *aa=a->a,*v,*diag;
 22:   PetscScalar       *x,*xk,*xj,*xk_tmp,*t;
 23:   const PetscScalar *b;

 26:   VecGetArrayRead(bb,&b);
 27:   VecGetArray(xx,&x);
 28:   t    = a->solve_work;
 29:   ISGetIndices(isrow,&r);
 30:   PetscMalloc1(bs,&xk_tmp);

 32:   /* solve U^T * D * y = b by forward substitution */
 33:   xk = t;
 34:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
 35:     idx = bs*r[k];
 36:     for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
 37:   }
 38:   for (k=0; k<mbs; k++) {
 39:     v    = aa + bs2*ai[k];
 40:     xk   = t + k*bs;    /* Dk*xk = k-th block of x */
 41:     PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
 42:     nz   = ai[k+1] - ai[k];
 43:     vj   = aj + ai[k];
 44:     xj   = t + (*vj)*bs; /* *vj-th block of x, *vj>k */
 45:     while (nz--) {
 46:       /* x(:) += U(k,:)^T*(Dk*xk) */
 47:       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
 48:       vj++; xj = t + (*vj)*bs;
 49:       v       += bs2;
 50:     }
 51:     /* xk = inv(Dk)*(Dk*xk) */
 52:     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
 53:     PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
 54:   }

 56:   /* solve U*x = y by back substitution */
 57:   for (k=mbs-1; k>=0; k--) {
 58:     v  = aa + bs2*ai[k];
 59:     xk = t + k*bs;        /* xk */
 60:     nz = ai[k+1] - ai[k];
 61:     vj = aj + ai[k];
 62:     xj = t + (*vj)*bs;
 63:     while (nz--) {
 64:       /* xk += U(k,:)*x(:) */
 65:       PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
 66:       vj++;
 67:       v += bs2; xj = t + (*vj)*bs;
 68:     }
 69:     idx = bs*r[k];
 70:     for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
 71:   }

 73:   PetscFree(xk_tmp);
 74:   ISRestoreIndices(isrow,&r);
 75:   VecRestoreArrayRead(bb,&b);
 76:   VecRestoreArray(xx,&x);
 77:   PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);
 78:   return(0);
 79: }

 83: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
 84: {
 86:   SETERRQ(PETSC_COMM_SELF,1,"not implemented yet");
 87:   return(0);
 88: }

 92: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
 93: {
 95:   SETERRQ(PETSC_COMM_SELF,1,"not implemented yet");
 96:   return(0);
 97: }

101: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
102: {
103:   PetscErrorCode  ierr;
104:   PetscInt        nz,k;
105:   const PetscInt  *vj,bs2 = bs*bs;
106:   const MatScalar *v,*diag;
107:   PetscScalar     *xk,*xj,*xk_tmp;

110:   PetscMalloc1(bs,&xk_tmp);
111:   for (k=0; k<mbs; k++) {
112:     v    = aa + bs2*ai[k];
113:     xk   = x + k*bs;    /* Dk*xk = k-th block of x */
114:     PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
115:     nz   = ai[k+1] - ai[k];
116:     vj   = aj + ai[k];
117:     xj   = x + (*vj)*bs; /* *vj-th block of x, *vj>k */
118:     while (nz--) {
119:       /* x(:) += U(k,:)^T*(Dk*xk) */
120:       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
121:       vj++; xj = x + (*vj)*bs;
122:       v       += bs2;
123:     }
124:     /* xk = inv(Dk)*(Dk*xk) */
125:     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
126:     PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
127:   }
128:   PetscFree(xk_tmp);
129:   return(0);
130: }

134: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
135: {
136:   PetscInt        nz,k;
137:   const PetscInt  *vj,bs2 = bs*bs;
138:   const MatScalar *v;
139:   PetscScalar     *xk,*xj;

142:   for (k=mbs-1; k>=0; k--) {
143:     v  = aa + bs2*ai[k];
144:     xk = x + k*bs;        /* xk */
145:     nz = ai[k+1] - ai[k];
146:     vj = aj + ai[k];
147:     xj = x + (*vj)*bs;
148:     while (nz--) {
149:       /* xk += U(k,:)*x(:) */
150:       PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
151:       vj++;
152:       v += bs2; xj = x + (*vj)*bs;
153:     }
154:   }
155:   return(0);
156: }

160: PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
161: {
162:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
163:   PetscErrorCode    ierr;
164:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
165:   PetscInt          bs =A->rmap->bs;
166:   const MatScalar   *aa=a->a;
167:   PetscScalar       *x;
168:   const PetscScalar *b;

171:   VecGetArrayRead(bb,&b);
172:   VecGetArray(xx,&x);

174:   /* solve U^T * D * y = b by forward substitution */
175:   PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar)); /* x <- b */
176:   MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);

178:   /* solve U*x = y by back substitution */
179:   MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);

181:   VecRestoreArrayRead(bb,&b);
182:   VecRestoreArray(xx,&x);
183:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
184:   return(0);
185: }

189: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
190: {
191:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
192:   PetscErrorCode    ierr;
193:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
194:   PetscInt          bs =A->rmap->bs;
195:   const MatScalar   *aa=a->a;
196:   const PetscScalar *b;
197:   PetscScalar       *x;

200:   VecGetArrayRead(bb,&b);
201:   VecGetArray(xx,&x);
202:   PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar)); /* x <- b */
203:   MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
204:   VecRestoreArrayRead(bb,&b);
205:   VecRestoreArray(xx,&x);
206:   PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
207:   return(0);
208: }

212: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
213: {
214:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
215:   PetscErrorCode    ierr;
216:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
217:   PetscInt          bs =A->rmap->bs;
218:   const MatScalar   *aa=a->a;
219:   const PetscScalar *b;
220:   PetscScalar       *x;

223:   VecGetArrayRead(bb,&b);
224:   VecGetArray(xx,&x);
225:   PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));
226:   MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
227:   VecRestoreArrayRead(bb,&b);
228:   VecRestoreArray(xx,&x);
229:   PetscLogFlops(2.0*bs2*(a->nz-mbs));
230:   return(0);
231: }

235: PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
236: {
237:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
238:   IS                isrow=a->row;
239:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2,bs=A->rmap->bs,*r,*vj;
240:   PetscErrorCode    ierr;
241:   PetscInt          nz,k,idx;
242:   const MatScalar   *aa=a->a,*v,*d;
243:   PetscScalar       *x,x0,x1,x2,x3,x4,x5,x6,*t,*tp;
244:   const PetscScalar *b;

247:   VecGetArrayRead(bb,&b);
248:   VecGetArray(xx,&x);
249:   t    = a->solve_work;
250:   ISGetIndices(isrow,&r);

252:   /* solve U^T * D * y = b by forward substitution */
253:   tp = t;
254:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
255:     idx   = 7*r[k];
256:     tp[0] = b[idx];
257:     tp[1] = b[idx+1];
258:     tp[2] = b[idx+2];
259:     tp[3] = b[idx+3];
260:     tp[4] = b[idx+4];
261:     tp[5] = b[idx+5];
262:     tp[6] = b[idx+6];
263:     tp   += 7;
264:   }

266:   for (k=0; k<mbs; k++) {
267:     v  = aa + 49*ai[k];
268:     vj = aj + ai[k];
269:     tp = t + k*7;
270:     x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
271:     nz = ai[k+1] - ai[k];
272:     tp = t + (*vj)*7;
273:     while (nz--) {
274:       tp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
275:       tp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
276:       tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
277:       tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
278:       tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
279:       tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
280:       tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
281:       vj++;
282:       tp = t + (*vj)*7;
283:       v += 49;
284:     }

286:     /* xk = inv(Dk)*(Dk*xk) */
287:     d     = aa+k*49;       /* ptr to inv(Dk) */
288:     tp    = t + k*7;
289:     tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
290:     tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
291:     tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
292:     tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
293:     tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
294:     tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
295:     tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
296:   }

298:   /* solve U*x = y by back substitution */
299:   for (k=mbs-1; k>=0; k--) {
300:     v  = aa + 49*ai[k];
301:     vj = aj + ai[k];
302:     tp = t + k*7;
303:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  x6=tp[6]; /* xk */
304:     nz = ai[k+1] - ai[k];

306:     tp = t + (*vj)*7;
307:     while (nz--) {
308:       /* xk += U(k,:)*x(:) */
309:       x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
310:       x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
311:       x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
312:       x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
313:       x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
314:       x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
315:       x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
316:       vj++;
317:       tp = t + (*vj)*7;
318:       v += 49;
319:     }
320:     tp       = t + k*7;
321:     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
322:     idx      = 7*r[k];
323:     x[idx]   = x0;
324:     x[idx+1] = x1;
325:     x[idx+2] = x2;
326:     x[idx+3] = x3;
327:     x[idx+4] = x4;
328:     x[idx+5] = x5;
329:     x[idx+6] = x6;
330:   }

332:   ISRestoreIndices(isrow,&r);
333:   VecRestoreArrayRead(bb,&b);
334:   VecRestoreArray(xx,&x);
335:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
336:   return(0);
337: }

341: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
342: {
343:   const MatScalar *v,*d;
344:   PetscScalar     *xp,x0,x1,x2,x3,x4,x5,x6;
345:   PetscInt        nz,k;
346:   const PetscInt  *vj;

349:   for (k=0; k<mbs; k++) {
350:     v  = aa + 49*ai[k];
351:     xp = x + k*7;
352:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
353:     nz = ai[k+1] - ai[k];
354:     vj = aj + ai[k];
355:     xp = x + (*vj)*7;
356:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
357:     PetscPrefetchBlock(v+49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
358:     while (nz--) {
359:       /* x(:) += U(k,:)^T*(Dk*xk) */
360:       xp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
361:       xp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
362:       xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
363:       xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
364:       xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
365:       xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
366:       xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
367:       vj++;
368:       xp = x + (*vj)*7;
369:       v += 49;
370:     }
371:     /* xk = inv(Dk)*(Dk*xk) */
372:     d     = aa+k*49;       /* ptr to inv(Dk) */
373:     xp    = x + k*7;
374:     xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
375:     xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
376:     xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
377:     xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
378:     xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
379:     xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
380:     xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
381:   }
382:   return(0);
383: }

387: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
388: {
389:   const MatScalar *v;
390:   PetscScalar     *xp,x0,x1,x2,x3,x4,x5,x6;
391:   PetscInt        nz,k;
392:   const PetscInt  *vj;

395:   for (k=mbs-1; k>=0; k--) {
396:     v  = aa + 49*ai[k];
397:     xp = x + k*7;
398:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
399:     nz = ai[k+1] - ai[k];
400:     vj = aj + ai[k];
401:     xp = x + (*vj)*7;
402:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
403:     PetscPrefetchBlock(v-49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
404:     while (nz--) {
405:       /* xk += U(k,:)*x(:) */
406:       x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
407:       x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
408:       x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
409:       x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
410:       x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
411:       x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
412:       x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
413:       vj++;
414:       v += 49; xp = x + (*vj)*7;
415:     }
416:     xp   = x + k*7;
417:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
418:   }
419:   return(0);
420: }

424: PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
425: {
426:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
427:   PetscErrorCode    ierr;
428:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
429:   const MatScalar   *aa=a->a;
430:   PetscScalar       *x;
431:   const PetscScalar *b;

434:   VecGetArrayRead(bb,&b);
435:   VecGetArray(xx,&x);

437:   /* solve U^T * D * y = b by forward substitution */
438:   PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar)); /* x <- b */
439:   MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);

441:   /* solve U*x = y by back substitution */
442:   MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);

444:   VecRestoreArrayRead(bb,&b);
445:   VecRestoreArray(xx,&x);
446:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
447:   return(0);
448: }

452: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
453: {
454:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
455:   PetscErrorCode    ierr;
456:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
457:   const MatScalar   *aa=a->a;
458:   PetscScalar       *x;
459:   const PetscScalar *b;

462:   VecGetArrayRead(bb,&b);
463:   VecGetArray(xx,&x);
464:   PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));
465:   MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
466:   VecRestoreArrayRead(bb,&b);
467:   VecRestoreArray(xx,&x);
468:   PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
469:   return(0);
470: }

474: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
475: {
476:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
477:   PetscErrorCode    ierr;
478:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
479:   const MatScalar   *aa=a->a;
480:   PetscScalar       *x;
481:   const PetscScalar *b;

484:   VecGetArrayRead(bb,&b);
485:   VecGetArray(xx,&x);
486:   PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));
487:   MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
488:   VecRestoreArrayRead(bb,&b);
489:   VecRestoreArray(xx,&x);
490:   PetscLogFlops(2.0*bs2*(a->nz-mbs));
491:   return(0);
492: }

496: PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
497: {
498:   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
499:   IS                isrow=a->row;
500:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2,*r,*vj;
501:   PetscErrorCode    ierr;
502:   PetscInt          nz,k,idx;
503:   const MatScalar   *aa=a->a,*v,*d;
504:   PetscScalar       *x,x0,x1,x2,x3,x4,x5,*t,*tp;
505:   const PetscScalar *b;

508:   VecGetArrayRead(bb,&b);
509:   VecGetArray(xx,&x);
510:   t    = a->solve_work;
511:   ISGetIndices(isrow,&r);

513:   /* solve U^T * D * y = b by forward substitution */
514:   tp = t;
515:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
516:     idx   = 6*r[k];
517:     tp[0] = b[idx];
518:     tp[1] = b[idx+1];
519:     tp[2] = b[idx+2];
520:     tp[3] = b[idx+3];
521:     tp[4] = b[idx+4];
522:     tp[5] = b[idx+5];
523:     tp   += 6;
524:   }

526:   for (k=0; k<mbs; k++) {
527:     v  = aa + 36*ai[k];
528:     vj = aj + ai[k];
529:     tp = t + k*6;
530:     x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
531:     nz = ai[k+1] - ai[k];
532:     tp = t + (*vj)*6;
533:     while (nz--) {
534:       tp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
535:       tp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
536:       tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
537:       tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
538:       tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
539:       tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
540:       vj++;
541:       tp = t + (*vj)*6;
542:       v += 36;
543:     }

545:     /* xk = inv(Dk)*(Dk*xk) */
546:     d     = aa+k*36;       /* ptr to inv(Dk) */
547:     tp    = t + k*6;
548:     tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
549:     tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
550:     tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
551:     tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
552:     tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
553:     tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
554:   }

556:   /* solve U*x = y by back substitution */
557:   for (k=mbs-1; k>=0; k--) {
558:     v  = aa + 36*ai[k];
559:     vj = aj + ai[k];
560:     tp = t + k*6;
561:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */
562:     nz = ai[k+1] - ai[k];

564:     tp = t + (*vj)*6;
565:     while (nz--) {
566:       /* xk += U(k,:)*x(:) */
567:       x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
568:       x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
569:       x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
570:       x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
571:       x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
572:       x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
573:       vj++;
574:       tp = t + (*vj)*6;
575:       v += 36;
576:     }
577:     tp       = t + k*6;
578:     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
579:     idx      = 6*r[k];
580:     x[idx]   = x0;
581:     x[idx+1] = x1;
582:     x[idx+2] = x2;
583:     x[idx+3] = x3;
584:     x[idx+4] = x4;
585:     x[idx+5] = x5;
586:   }

588:   ISRestoreIndices(isrow,&r);
589:   VecRestoreArrayRead(bb,&b);
590:   VecRestoreArray(xx,&x);
591:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
592:   return(0);
593: }

597: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
598: {
599:   const MatScalar *v,*d;
600:   PetscScalar     *xp,x0,x1,x2,x3,x4,x5;
601:   PetscInt        nz,k;
602:   const PetscInt  *vj;

605:   for (k=0; k<mbs; k++) {
606:     v  = aa + 36*ai[k];
607:     xp = x + k*6;
608:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
609:     nz = ai[k+1] - ai[k];
610:     vj = aj + ai[k];
611:     xp = x + (*vj)*6;
612:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
613:     PetscPrefetchBlock(v+36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
614:     while (nz--) {
615:       /* x(:) += U(k,:)^T*(Dk*xk) */
616:       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
617:       xp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
618:       xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
619:       xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
620:       xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
621:       xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
622:       vj++;
623:       xp = x + (*vj)*6;
624:       v += 36;
625:     }
626:     /* xk = inv(Dk)*(Dk*xk) */
627:     d     = aa+k*36;       /* ptr to inv(Dk) */
628:     xp    = x + k*6;
629:     xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
630:     xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
631:     xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
632:     xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
633:     xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
634:     xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
635:   }
636:   return(0);
637: }
640: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
641: {
642:   const MatScalar   *v;
643:   PetscScalar       *xp,x0,x1,x2,x3,x4,x5;
644:   PetscInt          nz,k;
645:   const PetscInt    *vj;

648:   for (k=mbs-1; k>=0; k--) {
649:     v  = aa + 36*ai[k];
650:     xp = x + k*6;
651:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
652:     nz = ai[k+1] - ai[k];
653:     vj = aj + ai[k];
654:     xp = x + (*vj)*6;
655:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
656:     PetscPrefetchBlock(v-36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
657:     while (nz--) {
658:       /* xk += U(k,:)*x(:) */
659:       x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
660:       x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
661:       x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
662:       x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
663:       x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
664:       x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
665:       vj++;
666:       v += 36; xp = x + (*vj)*6;
667:     }
668:     xp   = x + k*6;
669:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
670:   }
671:   return(0);
672: }


677: PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
678: {
679:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
680:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
681:   const MatScalar   *aa=a->a;
682:   PetscScalar       *x;
683:   const PetscScalar *b;
684:   PetscErrorCode    ierr;

687:   VecGetArrayRead(bb,&b);
688:   VecGetArray(xx,&x);

690:   /* solve U^T * D * y = b by forward substitution */
691:   PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
692:   MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);

694:   /* solve U*x = y by back substitution */
695:   MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);

697:   VecRestoreArrayRead(bb,&b);
698:   VecRestoreArray(xx,&x);
699:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
700:   return(0);
701: }

705: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
706: {
707:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
708:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
709:   const MatScalar   *aa=a->a;
710:   PetscScalar       *x;
711:   const PetscScalar *b;
712:   PetscErrorCode    ierr;

715:   VecGetArrayRead(bb,&b);
716:   VecGetArray(xx,&x);
717:   PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
718:   MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
719:   VecRestoreArrayRead(bb,&b);
720:   VecRestoreArray(xx,&x);
721:   PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
722:   return(0);
723: }

727: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
728: {
729:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
730:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
731:   const MatScalar   *aa=a->a;
732:   PetscScalar       *x;
733:   const PetscScalar *b;
734:   PetscErrorCode    ierr;

737:   VecGetArrayRead(bb,&b);
738:   VecGetArray(xx,&x);
739:   PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
740:   MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
741:   VecRestoreArrayRead(bb,&b);
742:   VecRestoreArray(xx,&x);
743:   PetscLogFlops(2.0*bs2*(a->nz - mbs));
744:   return(0);
745: }

749: PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
750: {
751:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
752:   IS                isrow=a->row;
753:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2;
754:   PetscErrorCode    ierr;
755:   const PetscInt    *r,*vj;
756:   PetscInt          nz,k,idx;
757:   const MatScalar   *aa=a->a,*v,*diag;
758:   PetscScalar       *x,x0,x1,x2,x3,x4,*t,*tp;
759:   const PetscScalar *b;

762:   VecGetArrayRead(bb,&b);
763:   VecGetArray(xx,&x);
764:   t    = a->solve_work;
765:   ISGetIndices(isrow,&r);

767:   /* solve U^T * D * y = b by forward substitution */
768:   tp = t;
769:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
770:     idx   = 5*r[k];
771:     tp[0] = b[idx];
772:     tp[1] = b[idx+1];
773:     tp[2] = b[idx+2];
774:     tp[3] = b[idx+3];
775:     tp[4] = b[idx+4];
776:     tp   += 5;
777:   }

779:   for (k=0; k<mbs; k++) {
780:     v  = aa + 25*ai[k];
781:     vj = aj + ai[k];
782:     tp = t + k*5;
783:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
784:     nz = ai[k+1] - ai[k];

786:     tp = t + (*vj)*5;
787:     while (nz--) {
788:       tp[0] +=  v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
789:       tp[1] +=  v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
790:       tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
791:       tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
792:       tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
793:       vj++;
794:       tp = t + (*vj)*5;
795:       v += 25;
796:     }

798:     /* xk = inv(Dk)*(Dk*xk) */
799:     diag  = aa+k*25;          /* ptr to inv(Dk) */
800:     tp    = t + k*5;
801:     tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
802:     tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
803:     tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
804:     tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
805:     tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
806:   }

808:   /* solve U*x = y by back substitution */
809:   for (k=mbs-1; k>=0; k--) {
810:     v  = aa + 25*ai[k];
811:     vj = aj + ai[k];
812:     tp = t + k*5;
813:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; /* xk */
814:     nz = ai[k+1] - ai[k];

816:     tp = t + (*vj)*5;
817:     while (nz--) {
818:       /* xk += U(k,:)*x(:) */
819:       x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
820:       x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
821:       x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
822:       x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
823:       x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
824:       vj++;
825:       tp = t + (*vj)*5;
826:       v += 25;
827:     }
828:     tp       = t + k*5;
829:     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
830:     idx      = 5*r[k];
831:     x[idx]   = x0;
832:     x[idx+1] = x1;
833:     x[idx+2] = x2;
834:     x[idx+3] = x3;
835:     x[idx+4] = x4;
836:   }

838:   ISRestoreIndices(isrow,&r);
839:   VecRestoreArrayRead(bb,&b);
840:   VecRestoreArray(xx,&x);
841:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
842:   return(0);
843: }

847: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
848: {
849:   const MatScalar *v,*diag;
850:   PetscScalar     *xp,x0,x1,x2,x3,x4;
851:   PetscInt        nz,k;
852:   const PetscInt  *vj;

855:   for (k=0; k<mbs; k++) {
856:     v  = aa + 25*ai[k];
857:     xp = x + k*5;
858:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
859:     nz = ai[k+1] - ai[k];
860:     vj = aj + ai[k];
861:     xp = x + (*vj)*5;
862:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
863:     PetscPrefetchBlock(v+25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
864:     while (nz--) {
865:       /* x(:) += U(k,:)^T*(Dk*xk) */
866:       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4;
867:       xp[1] +=  v[5]*x0 +  v[6]*x1 +  v[7]*x2 + v[8]*x3 + v[9]*x4;
868:       xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
869:       xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
870:       xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
871:       vj++;
872:       xp = x + (*vj)*5;
873:       v += 25;
874:     }
875:     /* xk = inv(Dk)*(Dk*xk) */
876:     diag  = aa+k*25;         /* ptr to inv(Dk) */
877:     xp    = x + k*5;
878:     xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
879:     xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
880:     xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
881:     xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
882:     xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
883:   }
884:   return(0);
885: }

889: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
890: {
891:   const MatScalar *v;
892:   PetscScalar     *xp,x0,x1,x2,x3,x4;
893:   PetscInt        nz,k;
894:   const PetscInt  *vj;

897:   for (k=mbs-1; k>=0; k--) {
898:     v  = aa + 25*ai[k];
899:     xp = x + k*5;
900:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; /* xk */
901:     nz = ai[k+1] - ai[k];
902:     vj = aj + ai[k];
903:     xp = x + (*vj)*5;
904:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
905:     PetscPrefetchBlock(v-25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
906:     while (nz--) {
907:       /* xk += U(k,:)*x(:) */
908:       x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
909:       x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
910:       x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
911:       x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
912:       x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
913:       vj++;
914:       v += 25; xp = x + (*vj)*5;
915:     }
916:     xp   = x + k*5;
917:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
918:   }
919:   return(0);
920: }

924: PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
925: {
926:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
927:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2;
928:   const MatScalar   *aa=a->a;
929:   PetscScalar       *x;
930:   const PetscScalar *b;
931:   PetscErrorCode    ierr;

934:   VecGetArrayRead(bb,&b);
935:   VecGetArray(xx,&x);

937:   /* solve U^T * D * y = b by forward substitution */
938:   PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
939:   MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);

941:   /* solve U*x = y by back substitution */
942:   MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);

944:   VecRestoreArrayRead(bb,&b);
945:   VecRestoreArray(xx,&x);
946:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
947:   return(0);
948: }

952: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
953: {
954:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
955:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
956:   const MatScalar   *aa=a->a;
957:   PetscScalar       *x;
958:   const PetscScalar *b;
959:   PetscErrorCode    ierr;

962:   VecGetArrayRead(bb,&b);
963:   VecGetArray(xx,&x);
964:   PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
965:   MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
966:   VecRestoreArrayRead(bb,&b);
967:   VecRestoreArray(xx,&x);
968:   PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
969:   return(0);
970: }

974: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
975: {
976:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
977:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
978:   const MatScalar   *aa=a->a;
979:   PetscScalar       *x;
980:   const PetscScalar *b;
981:   PetscErrorCode    ierr;

984:   VecGetArrayRead(bb,&b);
985:   VecGetArray(xx,&x);
986:   PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));
987:   MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
988:   VecRestoreArrayRead(bb,&b);
989:   VecRestoreArray(xx,&x);
990:   PetscLogFlops(2.0*bs2*(a->nz-mbs));
991:   return(0);
992: }

996: PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
997: {
998:   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
999:   IS                isrow=a->row;
1000:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1001:   PetscErrorCode    ierr;
1002:   const PetscInt    *r,*vj;
1003:   PetscInt          nz,k,idx;
1004:   const MatScalar   *aa=a->a,*v,*diag;
1005:   PetscScalar       *x,x0,x1,x2,x3,*t,*tp;
1006:   const PetscScalar *b;

1009:   VecGetArrayRead(bb,&b);
1010:   VecGetArray(xx,&x);
1011:   t    = a->solve_work;
1012:   ISGetIndices(isrow,&r);

1014:   /* solve U^T * D * y = b by forward substitution */
1015:   tp = t;
1016:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
1017:     idx   = 4*r[k];
1018:     tp[0] = b[idx];
1019:     tp[1] = b[idx+1];
1020:     tp[2] = b[idx+2];
1021:     tp[3] = b[idx+3];
1022:     tp   += 4;
1023:   }

1025:   for (k=0; k<mbs; k++) {
1026:     v  = aa + 16*ai[k];
1027:     vj = aj + ai[k];
1028:     tp = t + k*4;
1029:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
1030:     nz = ai[k+1] - ai[k];

1032:     tp = t + (*vj)*4;
1033:     while (nz--) {
1034:       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1035:       tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1036:       tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1037:       tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1038:       vj++;
1039:       tp = t + (*vj)*4;
1040:       v += 16;
1041:     }

1043:     /* xk = inv(Dk)*(Dk*xk) */
1044:     diag  = aa+k*16;          /* ptr to inv(Dk) */
1045:     tp    = t + k*4;
1046:     tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1047:     tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1048:     tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1049:     tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1050:   }

1052:   /* solve U*x = y by back substitution */
1053:   for (k=mbs-1; k>=0; k--) {
1054:     v  = aa + 16*ai[k];
1055:     vj = aj + ai[k];
1056:     tp = t + k*4;
1057:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
1058:     nz = ai[k+1] - ai[k];

1060:     tp = t + (*vj)*4;
1061:     while (nz--) {
1062:       /* xk += U(k,:)*x(:) */
1063:       x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
1064:       x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
1065:       x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
1066:       x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
1067:       vj++;
1068:       tp = t + (*vj)*4;
1069:       v += 16;
1070:     }
1071:     tp       = t + k*4;
1072:     tp[0]    =x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
1073:     idx      = 4*r[k];
1074:     x[idx]   = x0;
1075:     x[idx+1] = x1;
1076:     x[idx+2] = x2;
1077:     x[idx+3] = x3;
1078:   }

1080:   ISRestoreIndices(isrow,&r);
1081:   VecRestoreArrayRead(bb,&b);
1082:   VecRestoreArray(xx,&x);
1083:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1084:   return(0);
1085: }

1089: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1090: {
1091:   const MatScalar *v,*diag;
1092:   PetscScalar     *xp,x0,x1,x2,x3;
1093:   PetscInt        nz,k;
1094:   const PetscInt  *vj;

1097:   for (k=0; k<mbs; k++) {
1098:     v  = aa + 16*ai[k];
1099:     xp = x + k*4;
1100:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
1101:     nz = ai[k+1] - ai[k];
1102:     vj = aj + ai[k];
1103:     xp = x + (*vj)*4;
1104:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
1105:     PetscPrefetchBlock(v+16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1106:     while (nz--) {
1107:       /* x(:) += U(k,:)^T*(Dk*xk) */
1108:       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1109:       xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1110:       xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1111:       xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1112:       vj++;
1113:       xp = x + (*vj)*4;
1114:       v += 16;
1115:     }
1116:     /* xk = inv(Dk)*(Dk*xk) */
1117:     diag  = aa+k*16;         /* ptr to inv(Dk) */
1118:     xp    = x + k*4;
1119:     xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1120:     xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1121:     xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1122:     xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1123:   }
1124:   return(0);
1125: }

1129: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1130: {
1131:   const MatScalar *v;
1132:   PetscScalar     *xp,x0,x1,x2,x3;
1133:   PetscInt        nz,k;
1134:   const PetscInt  *vj;

1137:   for (k=mbs-1; k>=0; k--) {
1138:     v  = aa + 16*ai[k];
1139:     xp = x + k*4;
1140:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
1141:     nz = ai[k+1] - ai[k];
1142:     vj = aj + ai[k];
1143:     xp = x + (*vj)*4;
1144:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
1145:     PetscPrefetchBlock(v-16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1146:     while (nz--) {
1147:       /* xk += U(k,:)*x(:) */
1148:       x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
1149:       x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
1150:       x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
1151:       x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
1152:       vj++;
1153:       v += 16; xp = x + (*vj)*4;
1154:     }
1155:     xp    = x + k*4;
1156:     xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
1157:   }
1158:   return(0);
1159: }

1163: PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1164: {
1165:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1166:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1167:   const MatScalar   *aa=a->a;
1168:   PetscScalar       *x;
1169:   const PetscScalar *b;
1170:   PetscErrorCode    ierr;

1173:   VecGetArrayRead(bb,&b);
1174:   VecGetArray(xx,&x);

1176:   /* solve U^T * D * y = b by forward substitution */
1177:   PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
1178:   MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);

1180:   /* solve U*x = y by back substitution */
1181:   MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1182:   VecRestoreArrayRead(bb,&b);
1183:   VecRestoreArray(xx,&x);
1184:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1185:   return(0);
1186: }

1190: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1191: {
1192:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1193:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1194:   const MatScalar   *aa=a->a;
1195:   PetscScalar       *x;
1196:   const PetscScalar *b;
1197:   PetscErrorCode    ierr;

1200:   VecGetArrayRead(bb,&b);
1201:   VecGetArray(xx,&x);
1202:   PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
1203:   MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1204:   VecRestoreArrayRead(bb,&b);
1205:   VecRestoreArray(xx,&x);
1206:   PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
1207:   return(0);
1208: }

1212: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1213: {
1214:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1215:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1216:   const MatScalar   *aa=a->a;
1217:   PetscScalar       *x;
1218:   const PetscScalar *b;
1219:   PetscErrorCode    ierr;

1222:   VecGetArrayRead(bb,&b);
1223:   VecGetArray(xx,&x);
1224:   PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));
1225:   MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1226:   VecRestoreArrayRead(bb,&b);
1227:   VecRestoreArray(xx,&x);
1228:   PetscLogFlops(2.0*bs2*(a->nz-mbs));
1229:   return(0);
1230: }

1234: PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1235: {
1236:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1237:   IS                isrow=a->row;
1238:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1239:   PetscErrorCode    ierr;
1240:   const PetscInt    *r;
1241:   PetscInt          nz,k,idx;
1242:   const PetscInt    *vj;
1243:   const MatScalar   *aa=a->a,*v,*diag;
1244:   PetscScalar       *x,x0,x1,x2,*t,*tp;
1245:   const PetscScalar *b;

1248:   VecGetArrayRead(bb,&b);
1249:   VecGetArray(xx,&x);
1250:   t    = a->solve_work;
1251:   ISGetIndices(isrow,&r);

1253:   /* solve U^T * D * y = b by forward substitution */
1254:   tp = t;
1255:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
1256:     idx   = 3*r[k];
1257:     tp[0] = b[idx];
1258:     tp[1] = b[idx+1];
1259:     tp[2] = b[idx+2];
1260:     tp   += 3;
1261:   }

1263:   for (k=0; k<mbs; k++) {
1264:     v  = aa + 9*ai[k];
1265:     vj = aj + ai[k];
1266:     tp = t + k*3;
1267:     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
1268:     nz = ai[k+1] - ai[k];

1270:     tp = t + (*vj)*3;
1271:     while (nz--) {
1272:       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1273:       tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1274:       tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1275:       vj++;
1276:       tp = t + (*vj)*3;
1277:       v += 9;
1278:     }

1280:     /* xk = inv(Dk)*(Dk*xk) */
1281:     diag  = aa+k*9;          /* ptr to inv(Dk) */
1282:     tp    = t + k*3;
1283:     tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1284:     tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1285:     tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1286:   }

1288:   /* solve U*x = y by back substitution */
1289:   for (k=mbs-1; k>=0; k--) {
1290:     v  = aa + 9*ai[k];
1291:     vj = aj + ai[k];
1292:     tp = t + k*3;
1293:     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];  /* xk */
1294:     nz = ai[k+1] - ai[k];

1296:     tp = t + (*vj)*3;
1297:     while (nz--) {
1298:       /* xk += U(k,:)*x(:) */
1299:       x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
1300:       x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
1301:       x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
1302:       vj++;
1303:       tp = t + (*vj)*3;
1304:       v += 9;
1305:     }
1306:     tp       = t + k*3;
1307:     tp[0]    = x0; tp[1] = x1; tp[2] = x2;
1308:     idx      = 3*r[k];
1309:     x[idx]   = x0;
1310:     x[idx+1] = x1;
1311:     x[idx+2] = x2;
1312:   }

1314:   ISRestoreIndices(isrow,&r);
1315:   VecRestoreArrayRead(bb,&b);
1316:   VecRestoreArray(xx,&x);
1317:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1318:   return(0);
1319: }

1323: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1324: {
1325:   const MatScalar *v,*diag;
1326:   PetscScalar     *xp,x0,x1,x2;
1327:   PetscInt        nz,k;
1328:   const PetscInt  *vj;

1331:   for (k=0; k<mbs; k++) {
1332:     v  = aa + 9*ai[k];
1333:     xp = x + k*3;
1334:     x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1335:     nz = ai[k+1] - ai[k];
1336:     vj = aj + ai[k];
1337:     xp = x + (*vj)*3;
1338:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1339:     PetscPrefetchBlock(v+9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1340:     while (nz--) {
1341:       /* x(:) += U(k,:)^T*(Dk*xk) */
1342:       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1343:       xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1344:       xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1345:       vj++;
1346:       xp = x + (*vj)*3;
1347:       v += 9;
1348:     }
1349:     /* xk = inv(Dk)*(Dk*xk) */
1350:     diag  = aa+k*9;         /* ptr to inv(Dk) */
1351:     xp    = x + k*3;
1352:     xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1353:     xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1354:     xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1355:   }
1356:   return(0);
1357: }

1361: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1362: {
1363:   const MatScalar *v;
1364:   PetscScalar     *xp,x0,x1,x2;
1365:   PetscInt        nz,k;
1366:   const PetscInt  *vj;

1369:   for (k=mbs-1; k>=0; k--) {
1370:     v  = aa + 9*ai[k];
1371:     xp = x + k*3;
1372:     x0 = xp[0]; x1 = xp[1]; x2 = xp[2];  /* xk */
1373:     nz = ai[k+1] - ai[k];
1374:     vj = aj + ai[k];
1375:     xp = x + (*vj)*3;
1376:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1377:     PetscPrefetchBlock(v-9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1378:     while (nz--) {
1379:       /* xk += U(k,:)*x(:) */
1380:       x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1381:       x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1382:       x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1383:       vj++;
1384:       v += 9; xp = x + (*vj)*3;
1385:     }
1386:     xp    = x + k*3;
1387:     xp[0] = x0; xp[1] = x1; xp[2] = x2;
1388:   }
1389:   return(0);
1390: }

1394: PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1395: {
1396:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1397:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1398:   const MatScalar   *aa=a->a;
1399:   PetscScalar       *x;
1400:   const PetscScalar *b;
1401:   PetscErrorCode    ierr;

1404:   VecGetArrayRead(bb,&b);
1405:   VecGetArray(xx,&x);

1407:   /* solve U^T * D * y = b by forward substitution */
1408:   PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1409:   MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);

1411:   /* solve U*x = y by back substitution */
1412:   MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);

1414:   VecRestoreArrayRead(bb,&b);
1415:   VecRestoreArray(xx,&x);
1416:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1417:   return(0);
1418: }

1422: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1423: {
1424:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1425:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1426:   const MatScalar   *aa=a->a;
1427:   PetscScalar       *x;
1428:   const PetscScalar *b;
1429:   PetscErrorCode    ierr;

1432:   VecGetArrayRead(bb,&b);
1433:   VecGetArray(xx,&x);
1434:   PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1435:   MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1436:   VecRestoreArrayRead(bb,&b);
1437:   VecRestoreArray(xx,&x);
1438:   PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
1439:   return(0);
1440: }

1444: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1445: {
1446:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1447:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1448:   const MatScalar   *aa=a->a;
1449:   PetscScalar       *x;
1450:   const PetscScalar *b;
1451:   PetscErrorCode    ierr;

1454:   VecGetArrayRead(bb,&b);
1455:   VecGetArray(xx,&x);
1456:   PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1457:   MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1458:   VecRestoreArrayRead(bb,&b);
1459:   VecRestoreArray(xx,&x);
1460:   PetscLogFlops(2.0*bs2*(a->nz-mbs));
1461:   return(0);
1462: }

1466: PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1467: {
1468:   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
1469:   IS                isrow=a->row;
1470:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1471:   PetscErrorCode    ierr;
1472:   const PetscInt    *r,*vj;
1473:   PetscInt          nz,k,k2,idx;
1474:   const MatScalar   *aa=a->a,*v,*diag;
1475:   PetscScalar       *x,x0,x1,*t;
1476:   const PetscScalar *b;

1479:   VecGetArrayRead(bb,&b);
1480:   VecGetArray(xx,&x);
1481:   t    = a->solve_work;
1482:   ISGetIndices(isrow,&r);

1484:   /* solve U^T * D * y = perm(b) by forward substitution */
1485:   for (k=0; k<mbs; k++) {  /* t <- perm(b) */
1486:     idx      = 2*r[k];
1487:     t[k*2]   = b[idx];
1488:     t[k*2+1] = b[idx+1];
1489:   }
1490:   for (k=0; k<mbs; k++) {
1491:     v  = aa + 4*ai[k];
1492:     vj = aj + ai[k];
1493:     k2 = k*2;
1494:     x0 = t[k2]; x1 = t[k2+1];
1495:     nz = ai[k+1] - ai[k];
1496:     while (nz--) {
1497:       t[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1498:       t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1499:       vj++; v      += 4;
1500:     }
1501:     diag    = aa+k*4; /* ptr to inv(Dk) */
1502:     t[k2]   = diag[0]*x0 + diag[2]*x1;
1503:     t[k2+1] = diag[1]*x0 + diag[3]*x1;
1504:   }

1506:   /* solve U*x = y by back substitution */
1507:   for (k=mbs-1; k>=0; k--) {
1508:     v  = aa + 4*ai[k];
1509:     vj = aj + ai[k];
1510:     k2 = k*2;
1511:     x0 = t[k2]; x1 = t[k2+1];
1512:     nz = ai[k+1] - ai[k];
1513:     while (nz--) {
1514:       x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1515:       x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1516:       vj++;
1517:       v += 4;
1518:     }
1519:     t[k2]    = x0;
1520:     t[k2+1]  = x1;
1521:     idx      = 2*r[k];
1522:     x[idx]   = x0;
1523:     x[idx+1] = x1;
1524:   }

1526:   ISRestoreIndices(isrow,&r);
1527:   VecRestoreArrayRead(bb,&b);
1528:   VecRestoreArray(xx,&x);
1529:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1530:   return(0);
1531: }

1535: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1536: {
1537:   const MatScalar *v,*diag;
1538:   PetscScalar     x0,x1;
1539:   PetscInt        nz,k,k2;
1540:   const PetscInt  *vj;
1541: 
1543:   for (k=0; k<mbs; k++) {
1544:     v  = aa + 4*ai[k];
1545:     vj = aj + ai[k];
1546:     k2 = k*2;
1547:     x0 = x[k2]; x1 = x[k2+1];  /* Dk*xk = k-th block of x */
1548:     nz = ai[k+1] - ai[k];
1549:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1550:     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1551:     while (nz--) {
1552:       /* x(:) += U(k,:)^T*(Dk*xk) */
1553:       x[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1554:       x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1555:       vj++; v      += 4;
1556:     }
1557:     /* xk = inv(Dk)*(Dk*xk) */
1558:     diag    = aa+k*4;       /* ptr to inv(Dk) */
1559:     x[k2]   = diag[0]*x0 + diag[2]*x1;
1560:     x[k2+1] = diag[1]*x0 + diag[3]*x1;
1561:   }
1562:   return(0);
1563: }

1567: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1568: {
1569:   const MatScalar *v;
1570:   PetscScalar     x0,x1;
1571:   PetscInt        nz,k,k2;
1572:   const PetscInt  *vj;

1575:   for (k=mbs-1; k>=0; k--) {
1576:     v  = aa + 4*ai[k];
1577:     vj = aj + ai[k];
1578:     k2 = k*2;
1579:     x0 = x[k2]; x1 = x[k2+1];  /* xk */
1580:     nz = ai[k+1] - ai[k];
1581:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1582:     PetscPrefetchBlock(v-4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1583:     while (nz--) {
1584:       /* xk += U(k,:)*x(:) */
1585:       x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1586:       x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1587:       vj++;
1588:       v += 4;
1589:     }
1590:     x[k2]   = x0;
1591:     x[k2+1] = x1;
1592:   }
1593:   return(0);
1594: }

1598: PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1599: {
1600:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1601:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1602:   const MatScalar   *aa=a->a;
1603:   PetscScalar       *x;
1604:   const PetscScalar *b;
1605:   PetscErrorCode    ierr;

1608:   VecGetArrayRead(bb,&b);
1609:   VecGetArray(xx,&x);

1611:   /* solve U^T * D * y = b by forward substitution */
1612:   PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1613:   MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);

1615:   /* solve U*x = y by back substitution */
1616:   MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);

1618:   VecRestoreArrayRead(bb,&b);
1619:   VecRestoreArray(xx,&x);
1620:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1621:   return(0);
1622: }

1626: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1627: {
1628:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1629:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1630:   const MatScalar   *aa=a->a;
1631:   PetscScalar       *x;
1632:   const PetscScalar *b;
1633:   PetscErrorCode    ierr;

1636:   VecGetArrayRead(bb,&b);
1637:   VecGetArray(xx,&x);
1638:   PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1639:   MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1640:   VecRestoreArrayRead(bb,&b);
1641:   VecRestoreArray(xx,&x);
1642:   PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
1643:   return(0);
1644: }

1648: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1649: {
1650:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1651:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1652:   const MatScalar   *aa=a->a;
1653:   PetscScalar       *x;
1654:   const PetscScalar *b;
1655:   PetscErrorCode    ierr;

1658:   VecGetArrayRead(bb,&b);
1659:   VecGetArray(xx,&x);
1660:   PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1661:   MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1662:   VecRestoreArrayRead(bb,&b);
1663:   VecRestoreArray(xx,&x);
1664:   PetscLogFlops(2.0*bs2*(a->nz - mbs));
1665:   return(0);
1666: }

1670: PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1671: {
1672:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1673:   IS                isrow=a->row;
1674:   PetscErrorCode    ierr;
1675:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1676:   const MatScalar   *aa=a->a,*v;
1677:   const PetscScalar *b;
1678:   PetscScalar       *x,xk,*t;
1679:   PetscInt          nz,k,j;

1682:   VecGetArrayRead(bb,&b);
1683:   VecGetArray(xx,&x);
1684:   t    = a->solve_work;
1685:   ISGetIndices(isrow,&rp);

1687:   /* solve U^T*D*y = perm(b) by forward substitution */
1688:   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1689:   for (k=0; k<mbs; k++) {
1690:     v  = aa + ai[k];
1691:     vj = aj + ai[k];
1692:     xk = t[k];
1693:     nz = ai[k+1] - ai[k] - 1;
1694:     for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk;
1695:     t[k] = xk*v[nz];   /* v[nz] = 1/D(k) */
1696:   }

1698:   /* solve U*perm(x) = y by back substitution */
1699:   for (k=mbs-1; k>=0; k--) {
1700:     v  = aa + adiag[k] - 1;
1701:     vj = aj + adiag[k] - 1;
1702:     nz = ai[k+1] - ai[k] - 1;
1703:     for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]];
1704:     x[rp[k]] = t[k];
1705:   }

1707:   ISRestoreIndices(isrow,&rp);
1708:   VecRestoreArrayRead(bb,&b);
1709:   VecRestoreArray(xx,&x);
1710:   PetscLogFlops(4.0*a->nz - 3.0*mbs);
1711:   return(0);
1712: }

1716: PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1717: {
1718:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1719:   IS                isrow=a->row;
1720:   PetscErrorCode    ierr;
1721:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1722:   const MatScalar   *aa=a->a,*v;
1723:   PetscScalar       *x,xk,*t;
1724:   const PetscScalar *b;
1725:   PetscInt          nz,k;

1728:   VecGetArrayRead(bb,&b);
1729:   VecGetArray(xx,&x);
1730:   t    = a->solve_work;
1731:   ISGetIndices(isrow,&rp);

1733:   /* solve U^T*D*y = perm(b) by forward substitution */
1734:   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1735:   for (k=0; k<mbs; k++) {
1736:     v  = aa + ai[k] + 1;
1737:     vj = aj + ai[k] + 1;
1738:     xk = t[k];
1739:     nz = ai[k+1] - ai[k] - 1;
1740:     while (nz--) t[*vj++] += (*v++) * xk;
1741:     t[k] = xk*aa[ai[k]];  /* aa[k] = 1/D(k) */
1742:   }

1744:   /* solve U*perm(x) = y by back substitution */
1745:   for (k=mbs-1; k>=0; k--) {
1746:     v  = aa + ai[k] + 1;
1747:     vj = aj + ai[k] + 1;
1748:     nz = ai[k+1] - ai[k] - 1;
1749:     while (nz--) t[k] += (*v++) * t[*vj++];
1750:     x[rp[k]] = t[k];
1751:   }

1753:   ISRestoreIndices(isrow,&rp);
1754:   VecRestoreArrayRead(bb,&b);
1755:   VecRestoreArray(xx,&x);
1756:   PetscLogFlops(4.0*a->nz - 3*mbs);
1757:   return(0);
1758: }

1762: PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1763: {
1764:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1765:   IS                isrow=a->row;
1766:   PetscErrorCode    ierr;
1767:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1768:   const MatScalar   *aa=a->a,*v;
1769:   PetscReal         diagk;
1770:   PetscScalar       *x,xk;
1771:   const PetscScalar *b;
1772:   PetscInt          nz,k;

1775:   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1776:   VecGetArrayRead(bb,&b);
1777:   VecGetArray(xx,&x);
1778:   ISGetIndices(isrow,&rp);

1780:   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1781:   for (k=0; k<mbs; k++) {
1782:     v  = aa + ai[k];
1783:     vj = aj + ai[k];
1784:     xk = x[k];
1785:     nz = ai[k+1] - ai[k] - 1;
1786:     while (nz--) x[*vj++] += (*v++) * xk;

1788:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1789:     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1790:     x[k] = xk*PetscSqrtReal(diagk);
1791:   }
1792:   ISRestoreIndices(isrow,&rp);
1793:   VecRestoreArrayRead(bb,&b);
1794:   VecRestoreArray(xx,&x);
1795:   PetscLogFlops(2.0*a->nz - mbs);
1796:   return(0);
1797: }

1801: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1802: {
1803:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1804:   IS                isrow=a->row;
1805:   PetscErrorCode    ierr;
1806:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1807:   const MatScalar   *aa=a->a,*v;
1808:   PetscReal         diagk;
1809:   PetscScalar       *x,xk;
1810:   const PetscScalar *b;
1811:   PetscInt          nz,k;

1814:   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1815:   VecGetArrayRead(bb,&b);
1816:   VecGetArray(xx,&x);
1817:   ISGetIndices(isrow,&rp);

1819:   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1820:   for (k=0; k<mbs; k++) {
1821:     v  = aa + ai[k] + 1;
1822:     vj = aj + ai[k] + 1;
1823:     xk = x[k];
1824:     nz = ai[k+1] - ai[k] - 1;
1825:     while (nz--) x[*vj++] += (*v++) * xk;

1827:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1828:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1829:     x[k] = xk*PetscSqrtReal(diagk);
1830:   }
1831:   ISRestoreIndices(isrow,&rp);
1832:   VecRestoreArrayRead(bb,&b);
1833:   VecRestoreArray(xx,&x);
1834:   PetscLogFlops(2.0*a->nz - mbs);
1835:   return(0);
1836: }

1840: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1841: {
1842:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1843:   IS                isrow=a->row;
1844:   PetscErrorCode    ierr;
1845:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1846:   const MatScalar   *aa=a->a,*v;
1847:   PetscReal         diagk;
1848:   PetscScalar       *x,*t;
1849:   const PetscScalar *b;
1850:   PetscInt          nz,k;

1853:   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1854:   VecGetArrayRead(bb,&b);
1855:   VecGetArray(xx,&x);
1856:   t    = a->solve_work;
1857:   ISGetIndices(isrow,&rp);

1859:   for (k=mbs-1; k>=0; k--) {
1860:     v     = aa + ai[k];
1861:     vj    = aj + ai[k];
1862:     diagk = PetscRealPart(aa[adiag[k]]);
1863:     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1864:     t[k] = b[k] * PetscSqrtReal(diagk);
1865:     nz   = ai[k+1] - ai[k] - 1;
1866:     while (nz--) t[k] += (*v++) * t[*vj++];
1867:     x[rp[k]] = t[k];
1868:   }
1869:   ISRestoreIndices(isrow,&rp);
1870:   VecRestoreArrayRead(bb,&b);
1871:   VecRestoreArray(xx,&x);
1872:   PetscLogFlops(2.0*a->nz - mbs);
1873:   return(0);
1874: }

1878: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1879: {
1880:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1881:   IS                isrow=a->row;
1882:   PetscErrorCode    ierr;
1883:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1884:   const MatScalar   *aa=a->a,*v;
1885:   PetscReal         diagk;
1886:   PetscScalar       *x,*t;
1887:   const PetscScalar *b;
1888:   PetscInt          nz,k;

1891:   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1892:   VecGetArrayRead(bb,&b);
1893:   VecGetArray(xx,&x);
1894:   t    = a->solve_work;
1895:   ISGetIndices(isrow,&rp);

1897:   for (k=mbs-1; k>=0; k--) {
1898:     v     = aa + ai[k] + 1;
1899:     vj    = aj + ai[k] + 1;
1900:     diagk = PetscRealPart(aa[ai[k]]);
1901:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1902:     t[k] = b[k] * PetscSqrtReal(diagk);
1903:     nz   = ai[k+1] - ai[k] - 1;
1904:     while (nz--) t[k] += (*v++) * t[*vj++];
1905:     x[rp[k]] = t[k];
1906:   }
1907:   ISRestoreIndices(isrow,&rp);
1908:   VecRestoreArrayRead(bb,&b);
1909:   VecRestoreArray(xx,&x);
1910:   PetscLogFlops(2.0*a->nz - mbs);
1911:   return(0);
1912: }

1916: PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1917: {
1918:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;

1922:   if (A->rmap->bs == 1) {
1923:     MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);
1924:   } else {
1925:     IS                isrow=a->row;
1926:     const PetscInt    *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1927:     const MatScalar   *aa=a->a,*v;
1928:     PetscScalar       *x,*t;
1929:     const PetscScalar *b;
1930:     PetscInt          nz,k,n,i,j;

1932:     if (bb->n > a->solves_work_n) {
1933:       PetscFree(a->solves_work);
1934:       PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);
1935:       a->solves_work_n = bb->n;
1936:     }
1937:     n    = bb->n;
1938:     VecGetArrayRead(bb->v,&b);
1939:     VecGetArray(xx->v,&x);
1940:     t    = a->solves_work;

1942:     ISGetIndices(isrow,&rp);

1944:     /* solve U^T*D*y = perm(b) by forward substitution */
1945:     for (k=0; k<mbs; k++) {
1946:       for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */
1947:     }
1948:     for (k=0; k<mbs; k++) {
1949:       v  = aa + ai[k];
1950:       vj = aj + ai[k];
1951:       nz = ai[k+1] - ai[k] - 1;
1952:       for (j=0; j<nz; j++) {
1953:         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1954:         v++;vj++;
1955:       }
1956:       for (i=0; i<n; i++) t[n*k+i] *= aa[nz];  /* note: aa[nz] = 1/D(k) */
1957:     }

1959:     /* solve U*perm(x) = y by back substitution */
1960:     for (k=mbs-1; k>=0; k--) {
1961:       v  = aa + ai[k] - 1;
1962:       vj = aj + ai[k] - 1;
1963:       nz = ai[k+1] - ai[k] - 1;
1964:       for (j=0; j<nz; j++) {
1965:         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1966:         v++;vj++;
1967:       }
1968:       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1969:     }

1971:     ISRestoreIndices(isrow,&rp);
1972:     VecRestoreArrayRead(bb->v,&b);
1973:     VecRestoreArray(xx->v,&x);
1974:     PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1975:   }
1976:   return(0);
1977: }

1981: PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx)
1982: {
1983:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;

1987:   if (A->rmap->bs == 1) {
1988:     MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);
1989:   } else {
1990:     IS                isrow=a->row;
1991:     const PetscInt    *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1992:     const MatScalar   *aa=a->a,*v;
1993:     PetscScalar       *x,*t;
1994:     const PetscScalar *b;
1995:     PetscInt          nz,k,n,i;

1997:     if (bb->n > a->solves_work_n) {
1998:       PetscFree(a->solves_work);
1999:       PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);
2000:       a->solves_work_n = bb->n;
2001:     }
2002:     n    = bb->n;
2003:     VecGetArrayRead(bb->v,&b);
2004:     VecGetArray(xx->v,&x);
2005:     t    = a->solves_work;

2007:     ISGetIndices(isrow,&rp);

2009:     /* solve U^T*D*y = perm(b) by forward substitution */
2010:     for (k=0; k<mbs; k++) {
2011:       for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];  /* values are stored interlaced in t */
2012:     }
2013:     for (k=0; k<mbs; k++) {
2014:       v  = aa + ai[k];
2015:       vj = aj + ai[k];
2016:       nz = ai[k+1] - ai[k];
2017:       while (nz--) {
2018:         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
2019:         v++;vj++;
2020:       }
2021:       for (i=0; i<n; i++) t[n*k+i] *= aa[k];  /* note: aa[k] = 1/D(k) */
2022:     }

2024:     /* solve U*perm(x) = y by back substitution */
2025:     for (k=mbs-1; k>=0; k--) {
2026:       v  = aa + ai[k];
2027:       vj = aj + ai[k];
2028:       nz = ai[k+1] - ai[k];
2029:       while (nz--) {
2030:         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
2031:         v++;vj++;
2032:       }
2033:       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
2034:     }

2036:     ISRestoreIndices(isrow,&rp);
2037:     VecRestoreArrayRead(bb->v,&b);
2038:     VecRestoreArray(xx->v,&x);
2039:     PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
2040:   }
2041:   return(0);
2042: }

2046: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2047: {
2048:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2049:   PetscErrorCode    ierr;
2050:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag = a->diag;
2051:   const MatScalar   *aa=a->a,*v;
2052:   const PetscScalar *b;
2053:   PetscScalar       *x,xi;
2054:   PetscInt          nz,i,j;

2057:   VecGetArrayRead(bb,&b);
2058:   VecGetArray(xx,&x);

2060:   /* solve U^T*D*y = b by forward substitution */
2061:   PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
2062:   for (i=0; i<mbs; i++) {
2063:     v  = aa + ai[i];
2064:     vj = aj + ai[i];
2065:     xi = x[i];
2066:     nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
2067:     for (j=0; j<nz; j++) x[vj[j]] += v[j]* xi;
2068:     x[i] = xi*v[nz];  /* v[nz] = aa[diag[i]] = 1/D(i) */
2069:   }

2071:   /* solve U*x = y by backward substitution */
2072:   for (i=mbs-2; i>=0; i--) {
2073:     xi = x[i];
2074:     v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
2075:     vj = aj + adiag[i] - 1;
2076:     nz = ai[i+1] - ai[i] - 1;
2077:     for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
2078:     x[i] = xi;
2079:   }

2081:   VecRestoreArrayRead(bb,&b);
2082:   VecRestoreArray(xx,&x);
2083:   PetscLogFlops(4.0*a->nz - 3*mbs);
2084:   return(0);
2085: }

2089: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2090: {
2091:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2092:   PetscErrorCode    ierr;
2093:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2094:   const MatScalar   *aa=a->a,*v;
2095:   PetscScalar       *x,xk;
2096:   const PetscScalar *b;
2097:   PetscInt          nz,k;

2100:   VecGetArrayRead(bb,&b);
2101:   VecGetArray(xx,&x);

2103:   /* solve U^T*D*y = b by forward substitution */
2104:   PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
2105:   for (k=0; k<mbs; k++) {
2106:     v  = aa + ai[k] + 1;
2107:     vj = aj + ai[k] + 1;
2108:     xk = x[k];
2109:     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2110:     while (nz--) x[*vj++] += (*v++) * xk;
2111:     x[k] = xk*aa[ai[k]];  /* note: aa[diag[k]] = 1/D(k) */
2112:   }

2114:   /* solve U*x = y by back substitution */
2115:   for (k=mbs-2; k>=0; k--) {
2116:     v  = aa + ai[k] + 1;
2117:     vj = aj + ai[k] + 1;
2118:     xk = x[k];
2119:     nz = ai[k+1] - ai[k] - 1;
2120:     while (nz--) {
2121:       xk += (*v++) * x[*vj++];
2122:     }
2123:     x[k] = xk;
2124:   }

2126:   VecRestoreArrayRead(bb,&b);
2127:   VecRestoreArray(xx,&x);
2128:   PetscLogFlops(4.0*a->nz - 3*mbs);
2129:   return(0);
2130: }

2134: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2135: {
2136:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2137:   PetscErrorCode    ierr;
2138:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag,*vj;
2139:   const MatScalar   *aa=a->a,*v;
2140:   PetscReal         diagk;
2141:   PetscScalar       *x;
2142:   const PetscScalar *b;
2143:   PetscInt          nz,k;

2146:   /* solve U^T*D^(1/2)*x = b by forward substitution */
2147:   VecGetArrayRead(bb,&b);
2148:   VecGetArray(xx,&x);
2149:   PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
2150:   for (k=0; k<mbs; k++) {
2151:     v  = aa + ai[k];
2152:     vj = aj + ai[k];
2153:     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2154:     while (nz--) x[*vj++] += (*v++) * x[k];
2155:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2156:     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[adiag[k]]),PetscImaginaryPart(aa[adiag[k]]));
2157:     x[k] *= PetscSqrtReal(diagk);
2158:   }
2159:   VecRestoreArrayRead(bb,&b);
2160:   VecRestoreArray(xx,&x);
2161:   PetscLogFlops(2.0*a->nz - mbs);
2162:   return(0);
2163: }

2167: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2168: {
2169:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2170:   PetscErrorCode    ierr;
2171:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2172:   const MatScalar   *aa=a->a,*v;
2173:   PetscReal         diagk;
2174:   PetscScalar       *x;
2175:   const PetscScalar *b;
2176:   PetscInt          nz,k;

2179:   /* solve U^T*D^(1/2)*x = b by forward substitution */
2180:   VecGetArrayRead(bb,&b);
2181:   VecGetArray(xx,&x);
2182:   PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
2183:   for (k=0; k<mbs; k++) {
2184:     v  = aa + ai[k] + 1;
2185:     vj = aj + ai[k] + 1;
2186:     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2187:     while (nz--) x[*vj++] += (*v++) * x[k];
2188:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2189:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]]));
2190:     x[k] *= PetscSqrtReal(diagk);
2191:   }
2192:   VecRestoreArrayRead(bb,&b);
2193:   VecRestoreArray(xx,&x);
2194:   PetscLogFlops(2.0*a->nz - mbs);
2195:   return(0);
2196: }

2200: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2201: {
2202:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2203:   PetscErrorCode    ierr;
2204:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag,*vj;
2205:   const MatScalar   *aa=a->a,*v;
2206:   PetscReal         diagk;
2207:   PetscScalar       *x;
2208:   const PetscScalar *b;
2209:   PetscInt          nz,k;

2212:   /* solve D^(1/2)*U*x = b by back substitution */
2213:   VecGetArrayRead(bb,&b);
2214:   VecGetArray(xx,&x);

2216:   for (k=mbs-1; k>=0; k--) {
2217:     v     = aa + ai[k];
2218:     vj    = aj + ai[k];
2219:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2220:     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2221:     x[k] = PetscSqrtReal(diagk)*b[k];
2222:     nz   = ai[k+1] - ai[k] - 1;
2223:     while (nz--) x[k] += (*v++) * x[*vj++];
2224:   }
2225:   VecRestoreArrayRead(bb,&b);
2226:   VecRestoreArray(xx,&x);
2227:   PetscLogFlops(2.0*a->nz - mbs);
2228:   return(0);
2229: }

2233: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2234: {
2235:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2236:   PetscErrorCode    ierr;
2237:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2238:   const MatScalar   *aa=a->a,*v;
2239:   PetscReal         diagk;
2240:   PetscScalar       *x;
2241:   const PetscScalar *b;
2242:   PetscInt          nz,k;

2245:   /* solve D^(1/2)*U*x = b by back substitution */
2246:   VecGetArrayRead(bb,&b);
2247:   VecGetArray(xx,&x);

2249:   for (k=mbs-1; k>=0; k--) {
2250:     v     = aa + ai[k] + 1;
2251:     vj    = aj + ai[k] + 1;
2252:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2253:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2254:     x[k] = PetscSqrtReal(diagk)*b[k];
2255:     nz   = ai[k+1] - ai[k] - 1;
2256:     while (nz--) x[k] += (*v++) * x[*vj++];
2257:   }
2258:   VecRestoreArrayRead(bb,&b);
2259:   VecRestoreArray(xx,&x);
2260:   PetscLogFlops(2.0*a->nz - mbs);
2261:   return(0);
2262: }

2264: /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2267: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info)
2268: {
2269:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
2271:   const PetscInt *rip,mbs = a->mbs,*ai,*aj;
2272:   PetscInt       *jutmp,bs = A->rmap->bs,bs2=a->bs2,i;
2273:   PetscInt       m,reallocs = 0,*levtmp;
2274:   PetscInt       *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
2275:   PetscInt       incrlev,*lev,shift,prow,nz;
2276:   PetscReal      f = info->fill,levels = info->levels;
2277:   PetscBool      perm_identity;

2280:   /* check whether perm is the identity mapping */
2281:   ISIdentity(perm,&perm_identity);

2283:   if (perm_identity) {
2284:     a->permute = PETSC_FALSE;
2285:     ai         = a->i; aj = a->j;
2286:   } else { /*  non-trivial permutation */
2287:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2288:     a->permute = PETSC_TRUE;
2289:     MatReorderingSeqSBAIJ(A, perm);
2290:     ai         = a->inew; aj = a->jnew;
2291:   }

2293:   /* initialization */
2294:   ISGetIndices(perm,&rip);
2295:   umax  = (PetscInt)(f*ai[mbs] + 1);
2296:   PetscMalloc1(umax,&lev);
2297:   umax += mbs + 1;
2298:   shift = mbs + 1;
2299:   PetscMalloc1(mbs+1,&iu);
2300:   PetscMalloc1(umax,&ju);
2301:   iu[0] = mbs + 1;
2302:   juidx = mbs + 1;
2303:   /* prowl: linked list for pivot row */
2304:   PetscMalloc3(mbs,&prowl,mbs,&q,mbs,&levtmp);
2305:   /* q: linked list for col index */

2307:   for (i=0; i<mbs; i++) {
2308:     prowl[i] = mbs;
2309:     q[i]     = 0;
2310:   }

2312:   /* for each row k */
2313:   for (k=0; k<mbs; k++) {
2314:     nzk  = 0;
2315:     q[k] = mbs;
2316:     /* copy current row into linked list */
2317:     nz = ai[rip[k]+1] - ai[rip[k]];
2318:     j  = ai[rip[k]];
2319:     while (nz--) {
2320:       vj = rip[aj[j++]];
2321:       if (vj > k) {
2322:         qm = k;
2323:         do {
2324:           m = qm; qm = q[m];
2325:         } while (qm < vj);
2326:         if (qm == vj) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n");
2327:         nzk++;
2328:         q[m]       = vj;
2329:         q[vj]      = qm;
2330:         levtmp[vj] = 0;   /* initialize lev for nonzero element */
2331:       }
2332:     }

2334:     /* modify nonzero structure of k-th row by computing fill-in
2335:        for each row prow to be merged in */
2336:     prow = k;
2337:     prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */

2339:     while (prow < k) {
2340:       /* merge row prow into k-th row */
2341:       jmin = iu[prow] + 1;
2342:       jmax = iu[prow+1];
2343:       qm   = k;
2344:       for (j=jmin; j<jmax; j++) {
2345:         incrlev = lev[j-shift] + 1;
2346:         if (incrlev > levels) continue;
2347:         vj = ju[j];
2348:         do {
2349:           m = qm; qm = q[m];
2350:         } while (qm < vj);
2351:         if (qm != vj) {      /* a fill */
2352:           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
2353:           levtmp[vj] = incrlev;
2354:         } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2355:       }
2356:       prow = prowl[prow]; /* next pivot row */
2357:     }

2359:     /* add k to row list for first nonzero element in k-th row */
2360:     if (nzk > 1) {
2361:       i        = q[k]; /* col value of first nonzero element in k_th row of U */
2362:       prowl[k] = prowl[i]; prowl[i] = k;
2363:     }
2364:     iu[k+1] = iu[k] + nzk;

2366:     /* allocate more space to ju and lev if needed */
2367:     if (iu[k+1] > umax) {
2368:       /* estimate how much additional space we will need */
2369:       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2370:       /* just double the memory each time */
2371:       maxadd = umax;
2372:       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
2373:       umax += maxadd;

2375:       /* allocate a longer ju */
2376:       PetscMalloc1(umax,&jutmp);
2377:       PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));
2378:       PetscFree(ju);
2379:       ju   = jutmp;

2381:       PetscMalloc1(umax,&jutmp);
2382:       PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(PetscInt));
2383:       PetscFree(lev);
2384:       lev       = jutmp;
2385:       reallocs += 2; /* count how many times we realloc */
2386:     }

2388:     /* save nonzero structure of k-th row in ju */
2389:     i=k;
2390:     while (nzk--) {
2391:       i                = q[i];
2392:       ju[juidx]        = i;
2393:       lev[juidx-shift] = levtmp[i];
2394:       juidx++;
2395:     }
2396:   }

2398: #if defined(PETSC_USE_INFO)
2399:   if (ai[mbs] != 0) {
2400:     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2401:     PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)f,(double)af);
2402:     PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2403:     PetscInfo1(A,"PCFactorSetFill(pc,%g);\n",(double)af);
2404:     PetscInfo(A,"for best performance.\n");
2405:   } else {
2406:     PetscInfo(A,"Empty matrix.\n");
2407:   }
2408: #endif

2410:   ISRestoreIndices(perm,&rip);
2411:   PetscFree3(prowl,q,levtmp);
2412:   PetscFree(lev);

2414:   /* put together the new matrix */
2415:   MatSeqSBAIJSetPreallocation_SeqSBAIJ(B,bs,0,NULL);

2417:   /* PetscLogObjectParent((PetscObject)B,(PetscObject)iperm); */
2418:   b    = (Mat_SeqSBAIJ*)(B)->data;
2419:   PetscFree2(b->imax,b->ilen);

2421:   b->singlemalloc = PETSC_FALSE;
2422:   b->free_a       = PETSC_TRUE;
2423:   b->free_ij      = PETSC_TRUE;
2424:   /* the next line frees the default space generated by the Create() */
2425:   PetscFree3(b->a,b->j,b->i);
2426:   PetscMalloc1((iu[mbs]+1)*bs2,&b->a);
2427:   b->j    = ju;
2428:   b->i    = iu;
2429:   b->diag = 0;
2430:   b->ilen = 0;
2431:   b->imax = 0;

2433:   ISDestroy(&b->row);
2434:   ISDestroy(&b->icol);
2435:   b->row  = perm;
2436:   b->icol = perm;
2437:   PetscObjectReference((PetscObject)perm);
2438:   PetscObjectReference((PetscObject)perm);
2439:   PetscMalloc1(bs*mbs+bs,&b->solve_work);
2440:   /* In b structure:  Free imax, ilen, old a, old j.
2441:      Allocate idnew, solve_work, new a, new j */
2442:   PetscLogObjectMemory((PetscObject)B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
2443:   b->maxnz = b->nz = iu[mbs];

2445:   (B)->info.factor_mallocs   = reallocs;
2446:   (B)->info.fill_ratio_given = f;
2447:   if (ai[mbs] != 0) {
2448:     (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2449:   } else {
2450:     (B)->info.fill_ratio_needed = 0.0;
2451:   }
2452:   MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);
2453:   return(0);
2454: }

2456: /*
2457:   See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2458: */
2459: #include <petscbt.h>
2460: #include <../src/mat/utils/freespace.h>
2463: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2464: {
2465:   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b;
2466:   PetscErrorCode     ierr;
2467:   PetscBool          perm_identity,free_ij = PETSC_TRUE,missing;
2468:   PetscInt           bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j;
2469:   const PetscInt     *rip;
2470:   PetscInt           reallocs=0,i,*ui,*udiag,*cols;
2471:   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2472:   PetscInt           nlnk,*lnk,*lnk_lvl=NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr;
2473:   PetscReal          fill          =info->fill,levels=info->levels;
2474:   PetscFreeSpaceList free_space    =NULL,current_space=NULL;
2475:   PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2476:   PetscBT            lnkbt;

2479:   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
2480:   MatMissingDiagonal(A,&missing,&d);
2481:   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
2482:   if (bs > 1) {
2483:     MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);
2484:     return(0);
2485:   }

2487:   /* check whether perm is the identity mapping */
2488:   ISIdentity(perm,&perm_identity);
2489:   if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2490:   a->permute = PETSC_FALSE;

2492:   PetscMalloc1(am+1,&ui);
2493:   PetscMalloc1(am+1,&udiag);
2494:   ui[0] = 0;

2496:   /* ICC(0) without matrix ordering: simply rearrange column indices */
2497:   if (!levels) {
2498:     /* reuse the column pointers and row offsets for memory savings */
2499:     for (i=0; i<am; i++) {
2500:       ncols    = ai[i+1] - ai[i];
2501:       ui[i+1]  = ui[i] + ncols;
2502:       udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */
2503:     }
2504:     PetscMalloc1(ui[am]+1,&uj);
2505:     cols = uj;
2506:     for (i=0; i<am; i++) {
2507:       aj    = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2508:       ncols = ai[i+1] - ai[i] -1;
2509:       for (j=0; j<ncols; j++) *cols++ = aj[j];
2510:       *cols++ = i; /* diagoanl is located as the last entry of U(i,:) */
2511:     }
2512:   } else { /* case: levels>0 */
2513:     ISGetIndices(perm,&rip);

2515:     /* initialization */
2516:     /* jl: linked list for storing indices of the pivot rows
2517:        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2518:     PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);
2519:     for (i=0; i<am; i++) {
2520:       jl[i] = am; il[i] = 0;
2521:     }

2523:     /* create and initialize a linked list for storing column indices of the active row k */
2524:     nlnk = am + 1;
2525:     PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);

2527:     /* initial FreeSpace size is fill*(ai[am]+1) */
2528:     PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);

2530:     current_space = free_space;

2532:     PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);

2534:     current_space_lvl = free_space_lvl;

2536:     for (k=0; k<am; k++) {  /* for each active row k */
2537:       /* initialize lnk by the column indices of row k */
2538:       nzk   = 0;
2539:       ncols = ai[k+1] - ai[k];
2540:       if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k);
2541:       cols = aj+ai[k];
2542:       PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
2543:       nzk += nlnk;

2545:       /* update lnk by computing fill-in for each pivot row to be merged in */
2546:       prow = jl[k]; /* 1st pivot row */

2548:       while (prow < k) {
2549:         nextprow = jl[prow];

2551:         /* merge prow into k-th row */
2552:         jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2553:         jmax  = ui[prow+1];
2554:         ncols = jmax-jmin;
2555:         i     = jmin - ui[prow];

2557:         cols = uj_ptr[prow] + i;  /* points to the 2nd nzero entry in U(prow,k:am-1) */
2558:         uj   = uj_lvl_ptr[prow] + i;  /* levels of cols */
2559:         j    = *(uj - 1);
2560:         PetscICCLLAddSorted(ncols,cols,levels,uj,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2561:         nzk += nlnk;

2563:         /* update il and jl for prow */
2564:         if (jmin < jmax) {
2565:           il[prow] = jmin;

2567:           j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2568:         }
2569:         prow = nextprow;
2570:       }

2572:       /* if free space is not available, make more free space */
2573:       if (current_space->local_remaining<nzk) {
2574:         i    = am - k + 1; /* num of unfactored rows */
2575:         i   *= PetscMin(nzk, i-1); /* i*nzk, i*(i-1): estimated and max additional space needed */
2576:         PetscFreeSpaceGet(i,&current_space);
2577:         PetscFreeSpaceGet(i,&current_space_lvl);
2578:         reallocs++;
2579:       }

2581:       /* copy data into free_space and free_space_lvl, then initialize lnk */
2582:       if (nzk == 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Empty row %D in ICC matrix factor",k);
2583:       PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);

2585:       /* add the k-th row into il and jl */
2586:       if (nzk > 1) {
2587:         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2588:         jl[k] = jl[i]; jl[i] = k;
2589:         il[k] = ui[k] + 1;
2590:       }
2591:       uj_ptr[k]     = current_space->array;
2592:       uj_lvl_ptr[k] = current_space_lvl->array;

2594:       current_space->array               += nzk;
2595:       current_space->local_used          += nzk;
2596:       current_space->local_remaining     -= nzk;
2597:       current_space_lvl->array           += nzk;
2598:       current_space_lvl->local_used      += nzk;
2599:       current_space_lvl->local_remaining -= nzk;

2601:       ui[k+1] = ui[k] + nzk;
2602:     }

2604:     ISRestoreIndices(perm,&rip);
2605:     PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);

2607:     /* destroy list of free space and other temporary array(s) */
2608:     PetscMalloc1(ui[am]+1,&uj);
2609:     PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag); /* store matrix factor  */
2610:     PetscIncompleteLLDestroy(lnk,lnkbt);
2611:     PetscFreeSpaceDestroy(free_space_lvl);

2613:   } /* end of case: levels>0 || (levels=0 && !perm_identity) */

2615:   /* put together the new matrix in MATSEQSBAIJ format */
2616:   MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,NULL);

2618:   b    = (Mat_SeqSBAIJ*)(fact)->data;
2619:   PetscFree2(b->imax,b->ilen);

2621:   b->singlemalloc = PETSC_FALSE;
2622:   b->free_a       = PETSC_TRUE;
2623:   b->free_ij      = free_ij;

2625:   PetscMalloc1(ui[am]+1,&b->a);

2627:   b->j         = uj;
2628:   b->i         = ui;
2629:   b->diag      = udiag;
2630:   b->free_diag = PETSC_TRUE;
2631:   b->ilen      = 0;
2632:   b->imax      = 0;
2633:   b->row       = perm;
2634:   b->col       = perm;

2636:   PetscObjectReference((PetscObject)perm);
2637:   PetscObjectReference((PetscObject)perm);

2639:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

2641:   PetscMalloc1(am+1,&b->solve_work);
2642:   PetscLogObjectMemory((PetscObject)fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));

2644:   b->maxnz = b->nz = ui[am];

2646:   fact->info.factor_mallocs   = reallocs;
2647:   fact->info.fill_ratio_given = fill;
2648:   if (ai[am] != 0) {
2649:     fact->info.fill_ratio_needed = ((PetscReal)ui[am])/ai[am];
2650:   } else {
2651:     fact->info.fill_ratio_needed = 0.0;
2652:   }
2653: #if defined(PETSC_USE_INFO)
2654:   if (ai[am] != 0) {
2655:     PetscReal af = fact->info.fill_ratio_needed;
2656:     PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);
2657:     PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2658:     PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);
2659:   } else {
2660:     PetscInfo(A,"Empty matrix.\n");
2661:   }
2662: #endif
2663:   fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2664:   return(0);
2665: }

2669: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2670: {
2671:   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
2672:   Mat_SeqSBAIJ       *b;
2673:   PetscErrorCode     ierr;
2674:   PetscBool          perm_identity,free_ij = PETSC_TRUE;
2675:   PetscInt           bs=A->rmap->bs,am=a->mbs;
2676:   const PetscInt     *cols,*rip,*ai=a->i,*aj=a->j;
2677:   PetscInt           reallocs=0,i,*ui;
2678:   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2679:   PetscInt           nlnk,*lnk,*lnk_lvl=NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
2680:   PetscReal          fill          =info->fill,levels=info->levels,ratio_needed;
2681:   PetscFreeSpaceList free_space    =NULL,current_space=NULL;
2682:   PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2683:   PetscBT            lnkbt;

2686:   /*
2687:    This code originally uses Modified Sparse Row (MSR) storage
2688:    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2689:    Then it is rewritten so the factor B takes seqsbaij format. However the associated
2690:    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2691:    thus the original code in MSR format is still used for these cases.
2692:    The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2693:    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2694:   */
2695:   if (bs > 1) {
2696:     MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);
2697:     return(0);
2698:   }

2700:   /* check whether perm is the identity mapping */
2701:   ISIdentity(perm,&perm_identity);
2702:   if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2703:   a->permute = PETSC_FALSE;

2705:   /* special case that simply copies fill pattern */
2706:   if (!levels) {
2707:     /* reuse the column pointers and row offsets for memory savings */
2708:     ui           = a->i;
2709:     uj           = a->j;
2710:     free_ij      = PETSC_FALSE;
2711:     ratio_needed = 1.0;
2712:   } else { /* case: levels>0 */
2713:     ISGetIndices(perm,&rip);

2715:     /* initialization */
2716:     PetscMalloc1(am+1,&ui);
2717:     ui[0] = 0;

2719:     /* jl: linked list for storing indices of the pivot rows
2720:        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2721:     PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);
2722:     for (i=0; i<am; i++) {
2723:       jl[i] = am; il[i] = 0;
2724:     }

2726:     /* create and initialize a linked list for storing column indices of the active row k */
2727:     nlnk = am + 1;
2728:     PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);

2730:     /* initial FreeSpace size is fill*(ai[am]+1) */
2731:     PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);

2733:     current_space = free_space;

2735:     PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);

2737:     current_space_lvl = free_space_lvl;

2739:     for (k=0; k<am; k++) {  /* for each active row k */
2740:       /* initialize lnk by the column indices of row rip[k] */
2741:       nzk   = 0;
2742:       ncols = ai[rip[k]+1] - ai[rip[k]];
2743:       cols  = aj+ai[rip[k]];
2744:       PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
2745:       nzk  += nlnk;

2747:       /* update lnk by computing fill-in for each pivot row to be merged in */
2748:       prow = jl[k]; /* 1st pivot row */

2750:       while (prow < k) {
2751:         nextprow = jl[prow];

2753:         /* merge prow into k-th row */
2754:         jmin     = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2755:         jmax     = ui[prow+1];
2756:         ncols    = jmax-jmin;
2757:         i        = jmin - ui[prow];
2758:         cols     = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2759:         j        = *(uj_lvl_ptr[prow] + i - 1);
2760:         cols_lvl = uj_lvl_ptr[prow]+i;
2761:         PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2762:         nzk     += nlnk;

2764:         /* update il and jl for prow */
2765:         if (jmin < jmax) {
2766:           il[prow] = jmin;
2767:           j        = *cols;
2768:           jl[prow] = jl[j];
2769:           jl[j]    = prow;
2770:         }
2771:         prow = nextprow;
2772:       }

2774:       /* if free space is not available, make more free space */
2775:       if (current_space->local_remaining<nzk) {
2776:         i    = am - k + 1; /* num of unfactored rows */
2777:         i    = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2778:         PetscFreeSpaceGet(i,&current_space);
2779:         PetscFreeSpaceGet(i,&current_space_lvl);
2780:         reallocs++;
2781:       }

2783:       /* copy data into free_space and free_space_lvl, then initialize lnk */
2784:       PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);

2786:       /* add the k-th row into il and jl */
2787:       if (nzk-1 > 0) {
2788:         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2789:         jl[k] = jl[i]; jl[i] = k;
2790:         il[k] = ui[k] + 1;
2791:       }
2792:       uj_ptr[k]     = current_space->array;
2793:       uj_lvl_ptr[k] = current_space_lvl->array;

2795:       current_space->array               += nzk;
2796:       current_space->local_used          += nzk;
2797:       current_space->local_remaining     -= nzk;
2798:       current_space_lvl->array           += nzk;
2799:       current_space_lvl->local_used      += nzk;
2800:       current_space_lvl->local_remaining -= nzk;

2802:       ui[k+1] = ui[k] + nzk;
2803:     }

2805:     ISRestoreIndices(perm,&rip);
2806:     PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);

2808:     /* destroy list of free space and other temporary array(s) */
2809:     PetscMalloc1(ui[am]+1,&uj);
2810:     PetscFreeSpaceContiguous(&free_space,uj);
2811:     PetscIncompleteLLDestroy(lnk,lnkbt);
2812:     PetscFreeSpaceDestroy(free_space_lvl);
2813:     if (ai[am] != 0) {
2814:       ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2815:     } else {
2816:       ratio_needed = 0.0;
2817:     }
2818:   } /* end of case: levels>0 || (levels=0 && !perm_identity) */

2820:   /* put together the new matrix in MATSEQSBAIJ format */
2821:   MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,NULL);

2823:   b = (Mat_SeqSBAIJ*)(fact)->data;

2825:   PetscFree2(b->imax,b->ilen);

2827:   b->singlemalloc = PETSC_FALSE;
2828:   b->free_a       = PETSC_TRUE;
2829:   b->free_ij      = free_ij;

2831:   PetscMalloc1(ui[am]+1,&b->a);

2833:   b->j             = uj;
2834:   b->i             = ui;
2835:   b->diag          = 0;
2836:   b->ilen          = 0;
2837:   b->imax          = 0;
2838:   b->row           = perm;
2839:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

2841:   PetscObjectReference((PetscObject)perm);
2842:   b->icol = perm;
2843:   PetscObjectReference((PetscObject)perm);
2844:   PetscMalloc1(am+1,&b->solve_work);

2846:   b->maxnz = b->nz = ui[am];

2848:   fact->info.factor_mallocs    = reallocs;
2849:   fact->info.fill_ratio_given  = fill;
2850:   fact->info.fill_ratio_needed = ratio_needed;
2851: #if defined(PETSC_USE_INFO)
2852:   if (ai[am] != 0) {
2853:     PetscReal af = fact->info.fill_ratio_needed;
2854:     PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);
2855:     PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2856:     PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);
2857:   } else {
2858:     PetscInfo(A,"Empty matrix.\n");
2859:   }
2860: #endif
2861:   if (perm_identity) {
2862:     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2863:   } else {
2864:     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2865:   }
2866:   return(0);
2867: }