Actual source code: sbaijfact2.c

petsc-3.4.5 2014-06-29
  2: /*
  3:     Factorization code for SBAIJ format.
  4: */

  6: #include <../src/mat/impls/sbaij/seq/sbaij.h>
  7: #include <../src/mat/impls/baij/seq/baij.h>
  8: #include <petsc-private/kernels/blockinvert.h>

 12: PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
 13: {
 14:   Mat_SeqSBAIJ   *a   =(Mat_SeqSBAIJ*)A->data;
 15:   IS             isrow=a->row;
 16:   PetscInt       mbs  =a->mbs,*ai=a->i,*aj=a->j;
 18:   const PetscInt *r;
 19:   PetscInt       nz,*vj,k,idx,k1;
 20:   PetscInt       bs =A->rmap->bs,bs2 = a->bs2;
 21:   MatScalar      *aa=a->a,*v,*diag;
 22:   PetscScalar    *x,*xk,*xj,*b,*xk_tmp,*t;

 25:   VecGetArray(bb,&b);
 26:   VecGetArray(xx,&x);
 27:   t    = a->solve_work;
 28:   ISGetIndices(isrow,&r);
 29:   PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);

 31:   /* solve U^T * D * y = b by forward substitution */
 32:   xk = t;
 33:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
 34:     idx = bs*r[k];
 35:     for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
 36:   }
 37:   for (k=0; k<mbs; k++) {
 38:     v    = aa + bs2*ai[k];
 39:     xk   = t + k*bs;    /* Dk*xk = k-th block of x */
 40:     PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
 41:     nz   = ai[k+1] - ai[k];
 42:     vj   = aj + ai[k];
 43:     xj   = t + (*vj)*bs; /* *vj-th block of x, *vj>k */
 44:     while (nz--) {
 45:       /* x(:) += U(k,:)^T*(Dk*xk) */
 46:       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
 47:       vj++; xj = t + (*vj)*bs;
 48:       v       += bs2;
 49:     }
 50:     /* xk = inv(Dk)*(Dk*xk) */
 51:     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
 52:     PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
 53:   }

 55:   /* solve U*x = y by back substitution */
 56:   for (k=mbs-1; k>=0; k--) {
 57:     v  = aa + bs2*ai[k];
 58:     xk = t + k*bs;        /* xk */
 59:     nz = ai[k+1] - ai[k];
 60:     vj = aj + ai[k];
 61:     xj = t + (*vj)*bs;
 62:     while (nz--) {
 63:       /* xk += U(k,:)*x(:) */
 64:       PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
 65:       vj++;
 66:       v += bs2; xj = t + (*vj)*bs;
 67:     }
 68:     idx = bs*r[k];
 69:     for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
 70:   }

 72:   PetscFree(xk_tmp);
 73:   ISRestoreIndices(isrow,&r);
 74:   VecRestoreArray(bb,&b);
 75:   VecRestoreArray(xx,&x);
 76:   PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);
 77:   return(0);
 78: }

 82: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
 83: {
 85:   SETERRQ(PETSC_COMM_SELF,1,"not implemented yet");
 86:   return(0);
 87: }

 91: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
 92: {
 94:   SETERRQ(PETSC_COMM_SELF,1,"not implemented yet");
 95:   return(0);
 96: }

100: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
101: {
103:   PetscInt       nz,*vj,k;
104:   PetscInt       bs2 = bs*bs;
105:   MatScalar      *v,*diag;
106:   PetscScalar    *xk,*xj,*xk_tmp;

109:   PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);
110:   for (k=0; k<mbs; k++) {
111:     v    = aa + bs2*ai[k];
112:     xk   = x + k*bs;    /* Dk*xk = k-th block of x */
113:     PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
114:     nz   = ai[k+1] - ai[k];
115:     vj   = aj + ai[k];
116:     xj   = x + (*vj)*bs; /* *vj-th block of x, *vj>k */
117:     while (nz--) {
118:       /* x(:) += U(k,:)^T*(Dk*xk) */
119:       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
120:       vj++; xj = x + (*vj)*bs;
121:       v       += bs2;
122:     }
123:     /* xk = inv(Dk)*(Dk*xk) */
124:     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
125:     PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
126:   }
127:   PetscFree(xk_tmp);
128:   return(0);
129: }

133: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
134: {
135:   PetscInt    nz,*vj,k;
136:   PetscInt    bs2 = bs*bs;
137:   MatScalar   *v;
138:   PetscScalar *xk,*xj;

141:   for (k=mbs-1; k>=0; k--) {
142:     v  = aa + bs2*ai[k];
143:     xk = x + k*bs;        /* xk */
144:     nz = ai[k+1] - ai[k];
145:     vj = aj + ai[k];
146:     xj = x + (*vj)*bs;
147:     while (nz--) {
148:       /* xk += U(k,:)*x(:) */
149:       PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
150:       vj++;
151:       v += bs2; xj = x + (*vj)*bs;
152:     }
153:   }
154:   return(0);
155: }

159: PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
160: {
161:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
163:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
164:   PetscInt       bs =A->rmap->bs;
165:   MatScalar      *aa=a->a;
166:   PetscScalar    *x,*b;

169:   VecGetArray(bb,&b);
170:   VecGetArray(xx,&x);

172:   /* solve U^T * D * y = b by forward substitution */
173:   PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar)); /* x <- b */
174:   MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);

176:   /* solve U*x = y by back substitution */
177:   MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);

179:   VecRestoreArray(bb,&b);
180:   VecRestoreArray(xx,&x);
181:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
182:   return(0);
183: }

187: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
188: {
189:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
191:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
192:   PetscInt       bs =A->rmap->bs;
193:   MatScalar      *aa=a->a;
194:   PetscScalar    *x,*b;

197:   VecGetArray(bb,&b);
198:   VecGetArray(xx,&x);
199:   PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar)); /* x <- b */
200:   MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
201:   VecRestoreArray(bb,&b);
202:   VecRestoreArray(xx,&x);
203:   PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
204:   return(0);
205: }

209: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
210: {
211:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
213:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
214:   PetscInt       bs =A->rmap->bs;
215:   MatScalar      *aa=a->a;
216:   PetscScalar    *x,*b;

219:   VecGetArray(bb,&b);
220:   VecGetArray(xx,&x);
221:   PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));
222:   MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
223:   VecRestoreArray(bb,&b);
224:   VecRestoreArray(xx,&x);
225:   PetscLogFlops(2.0*bs2*(a->nz-mbs));
226:   return(0);
227: }

231: PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
232: {
233:   Mat_SeqSBAIJ   *a   =(Mat_SeqSBAIJ*)A->data;
234:   IS             isrow=a->row;
235:   PetscInt       mbs  =a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2,bs=A->rmap->bs;
237:   const PetscInt *r;
238:   PetscInt       nz,*vj,k,idx;
239:   MatScalar      *aa=a->a,*v,*d;
240:   PetscScalar    *x,*b,x0,x1,x2,x3,x4,x5,x6,*t,*tp;

243:   VecGetArray(bb,&b);
244:   VecGetArray(xx,&x);
245:   t    = a->solve_work;
246:   ISGetIndices(isrow,&r);

248:   /* solve U^T * D * y = b by forward substitution */
249:   tp = t;
250:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
251:     idx   = 7*r[k];
252:     tp[0] = b[idx];
253:     tp[1] = b[idx+1];
254:     tp[2] = b[idx+2];
255:     tp[3] = b[idx+3];
256:     tp[4] = b[idx+4];
257:     tp[5] = b[idx+5];
258:     tp[6] = b[idx+6];
259:     tp   += 7;
260:   }

262:   for (k=0; k<mbs; k++) {
263:     v  = aa + 49*ai[k];
264:     vj = aj + ai[k];
265:     tp = t + k*7;
266:     x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
267:     nz = ai[k+1] - ai[k];
268:     tp = t + (*vj)*7;
269:     while (nz--) {
270:       tp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
271:       tp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
272:       tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
273:       tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
274:       tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
275:       tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
276:       tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
277:       vj++;
278:       tp = t + (*vj)*7;
279:       v += 49;
280:     }

282:     /* xk = inv(Dk)*(Dk*xk) */
283:     d     = aa+k*49;       /* ptr to inv(Dk) */
284:     tp    = t + k*7;
285:     tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
286:     tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
287:     tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
288:     tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
289:     tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
290:     tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
291:     tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
292:   }

294:   /* solve U*x = y by back substitution */
295:   for (k=mbs-1; k>=0; k--) {
296:     v  = aa + 49*ai[k];
297:     vj = aj + ai[k];
298:     tp = t + k*7;
299:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  x6=tp[6]; /* xk */
300:     nz = ai[k+1] - ai[k];

302:     tp = t + (*vj)*7;
303:     while (nz--) {
304:       /* xk += U(k,:)*x(:) */
305:       x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
306:       x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
307:       x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
308:       x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
309:       x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
310:       x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
311:       x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
312:       vj++;
313:       tp = t + (*vj)*7;
314:       v += 49;
315:     }
316:     tp       = t + k*7;
317:     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
318:     idx      = 7*r[k];
319:     x[idx]   = x0;
320:     x[idx+1] = x1;
321:     x[idx+2] = x2;
322:     x[idx+3] = x3;
323:     x[idx+4] = x4;
324:     x[idx+5] = x5;
325:     x[idx+6] = x6;
326:   }

328:   ISRestoreIndices(isrow,&r);
329:   VecRestoreArray(bb,&b);
330:   VecRestoreArray(xx,&x);
331:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
332:   return(0);
333: }

337: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
338: {
339:   MatScalar   *v,*d;
340:   PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6;
341:   PetscInt    nz,*vj,k;

344:   for (k=0; k<mbs; k++) {
345:     v  = aa + 49*ai[k];
346:     xp = x + k*7;
347:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
348:     nz = ai[k+1] - ai[k];
349:     vj = aj + ai[k];
350:     xp = x + (*vj)*7;
351:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
352:     PetscPrefetchBlock(v+49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
353:     while (nz--) {
354:       /* x(:) += U(k,:)^T*(Dk*xk) */
355:       xp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
356:       xp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
357:       xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
358:       xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
359:       xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
360:       xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
361:       xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
362:       vj++;
363:       xp = x + (*vj)*7;
364:       v += 49;
365:     }
366:     /* xk = inv(Dk)*(Dk*xk) */
367:     d     = aa+k*49;       /* ptr to inv(Dk) */
368:     xp    = x + k*7;
369:     xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
370:     xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
371:     xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
372:     xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
373:     xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
374:     xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
375:     xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
376:   }
377:   return(0);
378: }

382: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
383: {
384:   MatScalar   *v;
385:   PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6;
386:   PetscInt    nz,*vj,k;

389:   for (k=mbs-1; k>=0; k--) {
390:     v  = aa + 49*ai[k];
391:     xp = x + k*7;
392:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
393:     nz = ai[k+1] - ai[k];
394:     vj = aj + ai[k];
395:     xp = x + (*vj)*7;
396:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
397:     PetscPrefetchBlock(v-49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
398:     while (nz--) {
399:       /* xk += U(k,:)*x(:) */
400:       x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
401:       x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
402:       x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
403:       x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
404:       x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
405:       x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
406:       x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
407:       vj++;
408:       v += 49; xp = x + (*vj)*7;
409:     }
410:     xp   = x + k*7;
411:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
412:   }
413:   return(0);
414: }

418: PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
419: {
420:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
422:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
423:   MatScalar      *aa=a->a;
424:   PetscScalar    *x,*b;

427:   VecGetArray(bb,&b);
428:   VecGetArray(xx,&x);

430:   /* solve U^T * D * y = b by forward substitution */
431:   PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar)); /* x <- b */
432:   MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);

434:   /* solve U*x = y by back substitution */
435:   MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);

437:   VecRestoreArray(bb,&b);
438:   VecRestoreArray(xx,&x);
439:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
440:   return(0);
441: }

445: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
446: {
447:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
449:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
450:   MatScalar      *aa=a->a;
451:   PetscScalar    *x,*b;

454:   VecGetArray(bb,&b);
455:   VecGetArray(xx,&x);
456:   PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));
457:   MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
458:   VecRestoreArray(bb,&b);
459:   VecRestoreArray(xx,&x);
460:   PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
461:   return(0);
462: }

466: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
467: {
468:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
470:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
471:   MatScalar      *aa=a->a;
472:   PetscScalar    *x,*b;

475:   VecGetArray(bb,&b);
476:   VecGetArray(xx,&x);
477:   PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));
478:   MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
479:   VecRestoreArray(bb,&b);
480:   VecRestoreArray(xx,&x);
481:   PetscLogFlops(2.0*bs2*(a->nz-mbs));
482:   return(0);
483: }

487: PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
488: {
489:   Mat_SeqSBAIJ   *a   =(Mat_SeqSBAIJ*)A->data;
490:   IS             isrow=a->row;
491:   PetscInt       mbs  =a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
493:   const PetscInt *r;
494:   PetscInt       nz,*vj,k,idx;
495:   MatScalar      *aa=a->a,*v,*d;
496:   PetscScalar    *x,*b,x0,x1,x2,x3,x4,x5,*t,*tp;

499:   VecGetArray(bb,&b);
500:   VecGetArray(xx,&x);
501:   t    = a->solve_work;
502:   ISGetIndices(isrow,&r);

504:   /* solve U^T * D * y = b by forward substitution */
505:   tp = t;
506:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
507:     idx   = 6*r[k];
508:     tp[0] = b[idx];
509:     tp[1] = b[idx+1];
510:     tp[2] = b[idx+2];
511:     tp[3] = b[idx+3];
512:     tp[4] = b[idx+4];
513:     tp[5] = b[idx+5];
514:     tp   += 6;
515:   }

517:   for (k=0; k<mbs; k++) {
518:     v  = aa + 36*ai[k];
519:     vj = aj + ai[k];
520:     tp = t + k*6;
521:     x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
522:     nz = ai[k+1] - ai[k];
523:     tp = t + (*vj)*6;
524:     while (nz--) {
525:       tp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
526:       tp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
527:       tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
528:       tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
529:       tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
530:       tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
531:       vj++;
532:       tp = t + (*vj)*6;
533:       v += 36;
534:     }

536:     /* xk = inv(Dk)*(Dk*xk) */
537:     d     = aa+k*36;       /* ptr to inv(Dk) */
538:     tp    = t + k*6;
539:     tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
540:     tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
541:     tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
542:     tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
543:     tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
544:     tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
545:   }

547:   /* solve U*x = y by back substitution */
548:   for (k=mbs-1; k>=0; k--) {
549:     v  = aa + 36*ai[k];
550:     vj = aj + ai[k];
551:     tp = t + k*6;
552:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */
553:     nz = ai[k+1] - ai[k];

555:     tp = t + (*vj)*6;
556:     while (nz--) {
557:       /* xk += U(k,:)*x(:) */
558:       x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
559:       x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
560:       x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
561:       x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
562:       x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
563:       x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
564:       vj++;
565:       tp = t + (*vj)*6;
566:       v += 36;
567:     }
568:     tp       = t + k*6;
569:     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
570:     idx      = 6*r[k];
571:     x[idx]   = x0;
572:     x[idx+1] = x1;
573:     x[idx+2] = x2;
574:     x[idx+3] = x3;
575:     x[idx+4] = x4;
576:     x[idx+5] = x5;
577:   }

579:   ISRestoreIndices(isrow,&r);
580:   VecRestoreArray(bb,&b);
581:   VecRestoreArray(xx,&x);
582:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
583:   return(0);
584: }

588: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
589: {
590:   MatScalar   *v,*d;
591:   PetscScalar *xp,x0,x1,x2,x3,x4,x5;
592:   PetscInt    nz,*vj,k;

595:   for (k=0; k<mbs; k++) {
596:     v  = aa + 36*ai[k];
597:     xp = x + k*6;
598:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
599:     nz = ai[k+1] - ai[k];
600:     vj = aj + ai[k];
601:     xp = x + (*vj)*6;
602:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
603:     PetscPrefetchBlock(v+36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
604:     while (nz--) {
605:       /* x(:) += U(k,:)^T*(Dk*xk) */
606:       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
607:       xp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
608:       xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
609:       xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
610:       xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
611:       xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
612:       vj++;
613:       xp = x + (*vj)*6;
614:       v += 36;
615:     }
616:     /* xk = inv(Dk)*(Dk*xk) */
617:     d     = aa+k*36;       /* ptr to inv(Dk) */
618:     xp    = x + k*6;
619:     xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
620:     xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
621:     xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
622:     xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
623:     xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
624:     xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
625:   }
626:   return(0);
627: }
630: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
631: {
632:   MatScalar   *v;
633:   PetscScalar *xp,x0,x1,x2,x3,x4,x5;
634:   PetscInt    nz,*vj,k;

637:   for (k=mbs-1; k>=0; k--) {
638:     v  = aa + 36*ai[k];
639:     xp = x + k*6;
640:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
641:     nz = ai[k+1] - ai[k];
642:     vj = aj + ai[k];
643:     xp = x + (*vj)*6;
644:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
645:     PetscPrefetchBlock(v-36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
646:     while (nz--) {
647:       /* xk += U(k,:)*x(:) */
648:       x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
649:       x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
650:       x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
651:       x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
652:       x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
653:       x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
654:       vj++;
655:       v += 36; xp = x + (*vj)*6;
656:     }
657:     xp   = x + k*6;
658:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
659:   }
660:   return(0);
661: }


666: PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
667: {
668:   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
669:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
670:   MatScalar      *aa=a->a;
671:   PetscScalar    *x,*b;

675:   VecGetArray(bb,&b);
676:   VecGetArray(xx,&x);

678:   /* solve U^T * D * y = b by forward substitution */
679:   PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
680:   MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);

682:   /* solve U*x = y by back substitution */
683:   MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);

685:   VecRestoreArray(bb,&b);
686:   VecRestoreArray(xx,&x);
687:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
688:   return(0);
689: }

693: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
694: {
695:   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
696:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
697:   MatScalar      *aa=a->a;
698:   PetscScalar    *x,*b;

702:   VecGetArray(bb,&b);
703:   VecGetArray(xx,&x);
704:   PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
705:   MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
706:   VecRestoreArray(bb,&b);
707:   VecRestoreArray(xx,&x);
708:   PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
709:   return(0);
710: }

714: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
715: {
716:   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
717:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
718:   MatScalar      *aa=a->a;
719:   PetscScalar    *x,*b;

723:   VecGetArray(bb,&b);
724:   VecGetArray(xx,&x);
725:   PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
726:   MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
727:   VecRestoreArray(bb,&b);
728:   VecRestoreArray(xx,&x);
729:   PetscLogFlops(2.0*bs2*(a->nz - mbs));
730:   return(0);
731: }

735: PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
736: {
737:   Mat_SeqSBAIJ   *a   =(Mat_SeqSBAIJ*)A->data;
738:   IS             isrow=a->row;
739:   PetscInt       mbs  =a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2;
741:   const PetscInt *r;
742:   PetscInt       nz,*vj,k,idx;
743:   MatScalar      *aa=a->a,*v,*diag;
744:   PetscScalar    *x,*b,x0,x1,x2,x3,x4,*t,*tp;

747:   VecGetArray(bb,&b);
748:   VecGetArray(xx,&x);
749:   t    = a->solve_work;
750:   ISGetIndices(isrow,&r);

752:   /* solve U^T * D * y = b by forward substitution */
753:   tp = t;
754:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
755:     idx   = 5*r[k];
756:     tp[0] = b[idx];
757:     tp[1] = b[idx+1];
758:     tp[2] = b[idx+2];
759:     tp[3] = b[idx+3];
760:     tp[4] = b[idx+4];
761:     tp   += 5;
762:   }

764:   for (k=0; k<mbs; k++) {
765:     v  = aa + 25*ai[k];
766:     vj = aj + ai[k];
767:     tp = t + k*5;
768:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
769:     nz = ai[k+1] - ai[k];

771:     tp = t + (*vj)*5;
772:     while (nz--) {
773:       tp[0] +=  v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
774:       tp[1] +=  v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
775:       tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
776:       tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
777:       tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
778:       vj++;
779:       tp = t + (*vj)*5;
780:       v += 25;
781:     }

783:     /* xk = inv(Dk)*(Dk*xk) */
784:     diag  = aa+k*25;          /* ptr to inv(Dk) */
785:     tp    = t + k*5;
786:     tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
787:     tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
788:     tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
789:     tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
790:     tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
791:   }

793:   /* solve U*x = y by back substitution */
794:   for (k=mbs-1; k>=0; k--) {
795:     v  = aa + 25*ai[k];
796:     vj = aj + ai[k];
797:     tp = t + k*5;
798:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; /* xk */
799:     nz = ai[k+1] - ai[k];

801:     tp = t + (*vj)*5;
802:     while (nz--) {
803:       /* xk += U(k,:)*x(:) */
804:       x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
805:       x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
806:       x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
807:       x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
808:       x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
809:       vj++;
810:       tp = t + (*vj)*5;
811:       v += 25;
812:     }
813:     tp       = t + k*5;
814:     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
815:     idx      = 5*r[k];
816:     x[idx]   = x0;
817:     x[idx+1] = x1;
818:     x[idx+2] = x2;
819:     x[idx+3] = x3;
820:     x[idx+4] = x4;
821:   }

823:   ISRestoreIndices(isrow,&r);
824:   VecRestoreArray(bb,&b);
825:   VecRestoreArray(xx,&x);
826:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
827:   return(0);
828: }

832: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
833: {
834:   MatScalar   *v,*diag;
835:   PetscScalar *xp,x0,x1,x2,x3,x4;
836:   PetscInt    nz,*vj,k;

839:   for (k=0; k<mbs; k++) {
840:     v  = aa + 25*ai[k];
841:     xp = x + k*5;
842:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
843:     nz = ai[k+1] - ai[k];
844:     vj = aj + ai[k];
845:     xp = x + (*vj)*5;
846:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
847:     PetscPrefetchBlock(v+25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
848:     while (nz--) {
849:       /* x(:) += U(k,:)^T*(Dk*xk) */
850:       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4;
851:       xp[1] +=  v[5]*x0 +  v[6]*x1 +  v[7]*x2 + v[8]*x3 + v[9]*x4;
852:       xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
853:       xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
854:       xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
855:       vj++;
856:       xp = x + (*vj)*5;
857:       v += 25;
858:     }
859:     /* xk = inv(Dk)*(Dk*xk) */
860:     diag  = aa+k*25;         /* ptr to inv(Dk) */
861:     xp    = x + k*5;
862:     xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
863:     xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
864:     xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
865:     xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
866:     xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
867:   }
868:   return(0);
869: }

873: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
874: {
875:   MatScalar   *v;
876:   PetscScalar *xp,x0,x1,x2,x3,x4;
877:   PetscInt    nz,*vj,k;

880:   for (k=mbs-1; k>=0; k--) {
881:     v  = aa + 25*ai[k];
882:     xp = x + k*5;
883:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; /* xk */
884:     nz = ai[k+1] - ai[k];
885:     vj = aj + ai[k];
886:     xp = x + (*vj)*5;
887:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
888:     PetscPrefetchBlock(v-25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
889:     while (nz--) {
890:       /* xk += U(k,:)*x(:) */
891:       x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
892:       x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
893:       x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
894:       x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
895:       x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
896:       vj++;
897:       v += 25; xp = x + (*vj)*5;
898:     }
899:     xp   = x + k*5;
900:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
901:   }
902:   return(0);
903: }

907: PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
908: {
909:   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
910:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2;
911:   MatScalar      *aa=a->a;
912:   PetscScalar    *x,*b;

916:   VecGetArray(bb,&b);
917:   VecGetArray(xx,&x);

919:   /* solve U^T * D * y = b by forward substitution */
920:   PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
921:   MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);

923:   /* solve U*x = y by back substitution */
924:   MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);

926:   VecRestoreArray(bb,&b);
927:   VecRestoreArray(xx,&x);
928:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
929:   return(0);
930: }

934: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
935: {
936:   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
937:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
938:   MatScalar      *aa=a->a;
939:   PetscScalar    *x,*b;

943:   VecGetArray(bb,&b);
944:   VecGetArray(xx,&x);
945:   PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
946:   MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
947:   VecRestoreArray(bb,&b);
948:   VecRestoreArray(xx,&x);
949:   PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
950:   return(0);
951: }

955: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
956: {
957:   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
958:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
959:   MatScalar      *aa=a->a;
960:   PetscScalar    *x,*b;

964:   VecGetArray(bb,&b);
965:   VecGetArray(xx,&x);
966:   PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));
967:   MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
968:   VecRestoreArray(bb,&b);
969:   VecRestoreArray(xx,&x);
970:   PetscLogFlops(2.0*bs2*(a->nz-mbs));
971:   return(0);
972: }

976: PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
977: {
978:   Mat_SeqSBAIJ   *a   =(Mat_SeqSBAIJ*)A->data;
979:   IS             isrow=a->row;
980:   PetscInt       mbs  =a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
982:   const PetscInt *r;
983:   PetscInt       nz,*vj,k,idx;
984:   MatScalar      *aa=a->a,*v,*diag;
985:   PetscScalar    *x,*b,x0,x1,x2,x3,*t,*tp;

988:   VecGetArray(bb,&b);
989:   VecGetArray(xx,&x);
990:   t    = a->solve_work;
991:   ISGetIndices(isrow,&r);

993:   /* solve U^T * D * y = b by forward substitution */
994:   tp = t;
995:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
996:     idx   = 4*r[k];
997:     tp[0] = b[idx];
998:     tp[1] = b[idx+1];
999:     tp[2] = b[idx+2];
1000:     tp[3] = b[idx+3];
1001:     tp   += 4;
1002:   }

1004:   for (k=0; k<mbs; k++) {
1005:     v  = aa + 16*ai[k];
1006:     vj = aj + ai[k];
1007:     tp = t + k*4;
1008:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
1009:     nz = ai[k+1] - ai[k];

1011:     tp = t + (*vj)*4;
1012:     while (nz--) {
1013:       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1014:       tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1015:       tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1016:       tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1017:       vj++;
1018:       tp = t + (*vj)*4;
1019:       v += 16;
1020:     }

1022:     /* xk = inv(Dk)*(Dk*xk) */
1023:     diag  = aa+k*16;          /* ptr to inv(Dk) */
1024:     tp    = t + k*4;
1025:     tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1026:     tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1027:     tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1028:     tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1029:   }

1031:   /* solve U*x = y by back substitution */
1032:   for (k=mbs-1; k>=0; k--) {
1033:     v  = aa + 16*ai[k];
1034:     vj = aj + ai[k];
1035:     tp = t + k*4;
1036:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
1037:     nz = ai[k+1] - ai[k];

1039:     tp = t + (*vj)*4;
1040:     while (nz--) {
1041:       /* xk += U(k,:)*x(:) */
1042:       x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
1043:       x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
1044:       x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
1045:       x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
1046:       vj++;
1047:       tp = t + (*vj)*4;
1048:       v += 16;
1049:     }
1050:     tp       = t + k*4;
1051:     tp[0]    =x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
1052:     idx      = 4*r[k];
1053:     x[idx]   = x0;
1054:     x[idx+1] = x1;
1055:     x[idx+2] = x2;
1056:     x[idx+3] = x3;
1057:   }

1059:   ISRestoreIndices(isrow,&r);
1060:   VecRestoreArray(bb,&b);
1061:   VecRestoreArray(xx,&x);
1062:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1063:   return(0);
1064: }

1068: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1069: {
1070:   MatScalar   *v,*diag;
1071:   PetscScalar *xp,x0,x1,x2,x3;
1072:   PetscInt    nz,*vj,k;

1075:   for (k=0; k<mbs; k++) {
1076:     v  = aa + 16*ai[k];
1077:     xp = x + k*4;
1078:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
1079:     nz = ai[k+1] - ai[k];
1080:     vj = aj + ai[k];
1081:     xp = x + (*vj)*4;
1082:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
1083:     PetscPrefetchBlock(v+16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1084:     while (nz--) {
1085:       /* x(:) += U(k,:)^T*(Dk*xk) */
1086:       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1087:       xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1088:       xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1089:       xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1090:       vj++;
1091:       xp = x + (*vj)*4;
1092:       v += 16;
1093:     }
1094:     /* xk = inv(Dk)*(Dk*xk) */
1095:     diag  = aa+k*16;         /* ptr to inv(Dk) */
1096:     xp    = x + k*4;
1097:     xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1098:     xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1099:     xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1100:     xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1101:   }
1102:   return(0);
1103: }

1107: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1108: {
1109:   MatScalar   *v;
1110:   PetscScalar *xp,x0,x1,x2,x3;
1111:   PetscInt    nz,*vj,k;

1114:   for (k=mbs-1; k>=0; k--) {
1115:     v  = aa + 16*ai[k];
1116:     xp = x + k*4;
1117:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
1118:     nz = ai[k+1] - ai[k];
1119:     vj = aj + ai[k];
1120:     xp = x + (*vj)*4;
1121:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
1122:     PetscPrefetchBlock(v-16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1123:     while (nz--) {
1124:       /* xk += U(k,:)*x(:) */
1125:       x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
1126:       x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
1127:       x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
1128:       x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
1129:       vj++;
1130:       v += 16; xp = x + (*vj)*4;
1131:     }
1132:     xp    = x + k*4;
1133:     xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
1134:   }
1135:   return(0);
1136: }

1140: PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1141: {
1142:   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
1143:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1144:   MatScalar      *aa=a->a;
1145:   PetscScalar    *x,*b;

1149:   VecGetArray(bb,&b);
1150:   VecGetArray(xx,&x);

1152:   /* solve U^T * D * y = b by forward substitution */
1153:   PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
1154:   MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);

1156:   /* solve U*x = y by back substitution */
1157:   MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1158:   VecRestoreArray(bb,&b);
1159:   VecRestoreArray(xx,&x);
1160:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1161:   return(0);
1162: }

1166: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1167: {
1168:   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
1169:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1170:   MatScalar      *aa=a->a;
1171:   PetscScalar    *x,*b;

1175:   VecGetArray(bb,&b);
1176:   VecGetArray(xx,&x);
1177:   PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
1178:   MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1179:   VecRestoreArray(bb,&b);
1180:   VecRestoreArray(xx,&x);
1181:   PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
1182:   return(0);
1183: }

1187: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1188: {
1189:   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
1190:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1191:   MatScalar      *aa=a->a;
1192:   PetscScalar    *x,*b;

1196:   VecGetArray(bb,&b);
1197:   VecGetArray(xx,&x);
1198:   PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));
1199:   MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1200:   VecRestoreArray(bb,&b);
1201:   VecRestoreArray(xx,&x);
1202:   PetscLogFlops(2.0*bs2*(a->nz-mbs));
1203:   return(0);
1204: }

1208: PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1209: {
1210:   Mat_SeqSBAIJ   *a   =(Mat_SeqSBAIJ*)A->data;
1211:   IS             isrow=a->row;
1212:   PetscInt       mbs  =a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1214:   const PetscInt *r;
1215:   PetscInt       nz,*vj,k,idx;
1216:   MatScalar      *aa=a->a,*v,*diag;
1217:   PetscScalar    *x,*b,x0,x1,x2,*t,*tp;

1220:   VecGetArray(bb,&b);
1221:   VecGetArray(xx,&x);
1222:   t    = a->solve_work;
1223:   ISGetIndices(isrow,&r);

1225:   /* solve U^T * D * y = b by forward substitution */
1226:   tp = t;
1227:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
1228:     idx   = 3*r[k];
1229:     tp[0] = b[idx];
1230:     tp[1] = b[idx+1];
1231:     tp[2] = b[idx+2];
1232:     tp   += 3;
1233:   }

1235:   for (k=0; k<mbs; k++) {
1236:     v  = aa + 9*ai[k];
1237:     vj = aj + ai[k];
1238:     tp = t + k*3;
1239:     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
1240:     nz = ai[k+1] - ai[k];

1242:     tp = t + (*vj)*3;
1243:     while (nz--) {
1244:       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1245:       tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1246:       tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1247:       vj++;
1248:       tp = t + (*vj)*3;
1249:       v += 9;
1250:     }

1252:     /* xk = inv(Dk)*(Dk*xk) */
1253:     diag  = aa+k*9;          /* ptr to inv(Dk) */
1254:     tp    = t + k*3;
1255:     tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1256:     tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1257:     tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1258:   }

1260:   /* solve U*x = y by back substitution */
1261:   for (k=mbs-1; k>=0; k--) {
1262:     v  = aa + 9*ai[k];
1263:     vj = aj + ai[k];
1264:     tp = t + k*3;
1265:     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];  /* xk */
1266:     nz = ai[k+1] - ai[k];

1268:     tp = t + (*vj)*3;
1269:     while (nz--) {
1270:       /* xk += U(k,:)*x(:) */
1271:       x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
1272:       x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
1273:       x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
1274:       vj++;
1275:       tp = t + (*vj)*3;
1276:       v += 9;
1277:     }
1278:     tp       = t + k*3;
1279:     tp[0]    = x0; tp[1] = x1; tp[2] = x2;
1280:     idx      = 3*r[k];
1281:     x[idx]   = x0;
1282:     x[idx+1] = x1;
1283:     x[idx+2] = x2;
1284:   }

1286:   ISRestoreIndices(isrow,&r);
1287:   VecRestoreArray(bb,&b);
1288:   VecRestoreArray(xx,&x);
1289:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1290:   return(0);
1291: }

1295: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1296: {
1297:   MatScalar   *v,*diag;
1298:   PetscScalar *xp,x0,x1,x2;
1299:   PetscInt    nz,*vj,k;

1302:   for (k=0; k<mbs; k++) {
1303:     v  = aa + 9*ai[k];
1304:     xp = x + k*3;
1305:     x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1306:     nz = ai[k+1] - ai[k];
1307:     vj = aj + ai[k];
1308:     xp = x + (*vj)*3;
1309:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1310:     PetscPrefetchBlock(v+9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1311:     while (nz--) {
1312:       /* x(:) += U(k,:)^T*(Dk*xk) */
1313:       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1314:       xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1315:       xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1316:       vj++;
1317:       xp = x + (*vj)*3;
1318:       v += 9;
1319:     }
1320:     /* xk = inv(Dk)*(Dk*xk) */
1321:     diag  = aa+k*9;         /* ptr to inv(Dk) */
1322:     xp    = x + k*3;
1323:     xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1324:     xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1325:     xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1326:   }
1327:   return(0);
1328: }

1332: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1333: {
1334:   MatScalar   *v;
1335:   PetscScalar *xp,x0,x1,x2;
1336:   PetscInt    nz,*vj,k;

1339:   for (k=mbs-1; k>=0; k--) {
1340:     v  = aa + 9*ai[k];
1341:     xp = x + k*3;
1342:     x0 = xp[0]; x1 = xp[1]; x2 = xp[2];  /* xk */
1343:     nz = ai[k+1] - ai[k];
1344:     vj = aj + ai[k];
1345:     xp = x + (*vj)*3;
1346:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1347:     PetscPrefetchBlock(v-9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1348:     while (nz--) {
1349:       /* xk += U(k,:)*x(:) */
1350:       x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1351:       x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1352:       x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1353:       vj++;
1354:       v += 9; xp = x + (*vj)*3;
1355:     }
1356:     xp    = x + k*3;
1357:     xp[0] = x0; xp[1] = x1; xp[2] = x2;
1358:   }
1359:   return(0);
1360: }

1364: PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1365: {
1366:   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
1367:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1368:   MatScalar      *aa=a->a;
1369:   PetscScalar    *x,*b;

1373:   VecGetArray(bb,&b);
1374:   VecGetArray(xx,&x);

1376:   /* solve U^T * D * y = b by forward substitution */
1377:   PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1378:   MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);

1380:   /* solve U*x = y by back substitution */
1381:   MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);

1383:   VecRestoreArray(bb,&b);
1384:   VecRestoreArray(xx,&x);
1385:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1386:   return(0);
1387: }

1391: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1392: {
1393:   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
1394:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1395:   MatScalar      *aa=a->a;
1396:   PetscScalar    *x,*b;

1400:   VecGetArray(bb,&b);
1401:   VecGetArray(xx,&x);
1402:   PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1403:   MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1404:   VecRestoreArray(bb,&b);
1405:   VecRestoreArray(xx,&x);
1406:   PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
1407:   return(0);
1408: }

1412: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1413: {
1414:   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
1415:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1416:   MatScalar      *aa=a->a;
1417:   PetscScalar    *x,*b;

1421:   VecGetArray(bb,&b);
1422:   VecGetArray(xx,&x);
1423:   PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1424:   MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1425:   VecRestoreArray(bb,&b);
1426:   VecRestoreArray(xx,&x);
1427:   PetscLogFlops(2.0*bs2*(a->nz-mbs));
1428:   return(0);
1429: }

1433: PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1434: {
1435:   Mat_SeqSBAIJ   *a   =(Mat_SeqSBAIJ*)A->data;
1436:   IS             isrow=a->row;
1437:   PetscInt       mbs  =a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1439:   const PetscInt *r;
1440:   PetscInt       nz,*vj,k,k2,idx;
1441:   MatScalar      *aa=a->a,*v,*diag;
1442:   PetscScalar    *x,*b,x0,x1,*t;

1445:   VecGetArray(bb,&b);
1446:   VecGetArray(xx,&x);
1447:   t    = a->solve_work;
1448:   ISGetIndices(isrow,&r);

1450:   /* solve U^T * D * y = perm(b) by forward substitution */
1451:   for (k=0; k<mbs; k++) {  /* t <- perm(b) */
1452:     idx      = 2*r[k];
1453:     t[k*2]   = b[idx];
1454:     t[k*2+1] = b[idx+1];
1455:   }
1456:   for (k=0; k<mbs; k++) {
1457:     v  = aa + 4*ai[k];
1458:     vj = aj + ai[k];
1459:     k2 = k*2;
1460:     x0 = t[k2]; x1 = t[k2+1];
1461:     nz = ai[k+1] - ai[k];
1462:     while (nz--) {
1463:       t[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1464:       t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1465:       vj++; v      += 4;
1466:     }
1467:     diag    = aa+k*4; /* ptr to inv(Dk) */
1468:     t[k2]   = diag[0]*x0 + diag[2]*x1;
1469:     t[k2+1] = diag[1]*x0 + diag[3]*x1;
1470:   }

1472:   /* solve U*x = y by back substitution */
1473:   for (k=mbs-1; k>=0; k--) {
1474:     v  = aa + 4*ai[k];
1475:     vj = aj + ai[k];
1476:     k2 = k*2;
1477:     x0 = t[k2]; x1 = t[k2+1];
1478:     nz = ai[k+1] - ai[k];
1479:     while (nz--) {
1480:       x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1481:       x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1482:       vj++;
1483:       v += 4;
1484:     }
1485:     t[k2]    = x0;
1486:     t[k2+1]  = x1;
1487:     idx      = 2*r[k];
1488:     x[idx]   = x0;
1489:     x[idx+1] = x1;
1490:   }

1492:   ISRestoreIndices(isrow,&r);
1493:   VecRestoreArray(bb,&b);
1494:   VecRestoreArray(xx,&x);
1495:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1496:   return(0);
1497: }

1501: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1502: {
1503:   MatScalar   *v,*diag;
1504:   PetscScalar x0,x1;
1505:   PetscInt    nz,*vj,k,k2;

1508:   for (k=0; k<mbs; k++) {
1509:     v  = aa + 4*ai[k];
1510:     vj = aj + ai[k];
1511:     k2 = k*2;
1512:     x0 = x[k2]; x1 = x[k2+1];  /* Dk*xk = k-th block of x */
1513:     nz = ai[k+1] - ai[k];
1514:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1515:     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1516:     while (nz--) {
1517:       /* x(:) += U(k,:)^T*(Dk*xk) */
1518:       x[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1519:       x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1520:       vj++; v      += 4;
1521:     }
1522:     /* xk = inv(Dk)*(Dk*xk) */
1523:     diag    = aa+k*4;       /* ptr to inv(Dk) */
1524:     x[k2]   = diag[0]*x0 + diag[2]*x1;
1525:     x[k2+1] = diag[1]*x0 + diag[3]*x1;
1526:   }
1527:   return(0);
1528: }

1532: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1533: {
1534:   MatScalar   *v;
1535:   PetscScalar x0,x1;
1536:   PetscInt    nz,*vj,k,k2;

1539:   for (k=mbs-1; k>=0; k--) {
1540:     v  = aa + 4*ai[k];
1541:     vj = aj + ai[k];
1542:     k2 = k*2;
1543:     x0 = x[k2]; x1 = x[k2+1];  /* xk */
1544:     nz = ai[k+1] - ai[k];
1545:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1546:     PetscPrefetchBlock(v-4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1547:     while (nz--) {
1548:       /* xk += U(k,:)*x(:) */
1549:       x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1550:       x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1551:       vj++;
1552:       v += 4;
1553:     }
1554:     x[k2]   = x0;
1555:     x[k2+1] = x1;
1556:   }
1557:   return(0);
1558: }

1562: PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1563: {
1564:   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
1565:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1566:   MatScalar      *aa=a->a;
1567:   PetscScalar    *x,*b;

1571:   VecGetArray(bb,&b);
1572:   VecGetArray(xx,&x);

1574:   /* solve U^T * D * y = b by forward substitution */
1575:   PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1576:   MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);

1578:   /* solve U*x = y by back substitution */
1579:   MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);

1581:   VecRestoreArray(bb,&b);
1582:   VecRestoreArray(xx,&x);
1583:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1584:   return(0);
1585: }

1589: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1590: {
1591:   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
1592:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1593:   MatScalar      *aa=a->a;
1594:   PetscScalar    *x,*b;

1598:   VecGetArray(bb,&b);
1599:   VecGetArray(xx,&x);
1600:   PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1601:   MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1602:   VecRestoreArray(bb,&b);
1603:   VecRestoreArray(xx,&x);
1604:   PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
1605:   return(0);
1606: }

1610: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1611: {
1612:   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
1613:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1614:   MatScalar      *aa=a->a;
1615:   PetscScalar    *x,*b;

1619:   VecGetArray(bb,&b);
1620:   VecGetArray(xx,&x);
1621:   PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1622:   MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1623:   VecRestoreArray(bb,&b);
1624:   VecRestoreArray(xx,&x);
1625:   PetscLogFlops(2.0*bs2*(a->nz - mbs));
1626:   return(0);
1627: }

1631: PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1632: {
1633:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1634:   IS                isrow=a->row;
1635:   PetscErrorCode    ierr;
1636:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1637:   const MatScalar   *aa=a->a,*v;
1638:   const PetscScalar *b;
1639:   PetscScalar       *x,xk,*t;
1640:   PetscInt          nz,k,j;

1643:   VecGetArrayRead(bb,&b);
1644:   VecGetArray(xx,&x);
1645:   t    = a->solve_work;
1646:   ISGetIndices(isrow,&rp);

1648:   /* solve U^T*D*y = perm(b) by forward substitution */
1649:   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1650:   for (k=0; k<mbs; k++) {
1651:     v  = aa + ai[k];
1652:     vj = aj + ai[k];
1653:     xk = t[k];
1654:     nz = ai[k+1] - ai[k] - 1;
1655:     for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk;
1656:     t[k] = xk*v[nz];   /* v[nz] = 1/D(k) */
1657:   }

1659:   /* solve U*perm(x) = y by back substitution */
1660:   for (k=mbs-1; k>=0; k--) {
1661:     v  = aa + adiag[k] - 1;
1662:     vj = aj + adiag[k] - 1;
1663:     nz = ai[k+1] - ai[k] - 1;
1664:     for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]];
1665:     x[rp[k]] = t[k];
1666:   }

1668:   ISRestoreIndices(isrow,&rp);
1669:   VecRestoreArrayRead(bb,&b);
1670:   VecRestoreArray(xx,&x);
1671:   PetscLogFlops(4.0*a->nz - 3.0*mbs);
1672:   return(0);
1673: }

1677: PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1678: {
1679:   Mat_SeqSBAIJ    *a   = (Mat_SeqSBAIJ*)A->data;
1680:   IS              isrow=a->row;
1681:   PetscErrorCode  ierr;
1682:   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1683:   const MatScalar *aa=a->a,*v;
1684:   PetscScalar     *x,*b,xk,*t;
1685:   PetscInt        nz,k;

1688:   VecGetArray(bb,&b);
1689:   VecGetArray(xx,&x);
1690:   t    = a->solve_work;
1691:   ISGetIndices(isrow,&rp);

1693:   /* solve U^T*D*y = perm(b) by forward substitution */
1694:   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1695:   for (k=0; k<mbs; k++) {
1696:     v  = aa + ai[k] + 1;
1697:     vj = aj + ai[k] + 1;
1698:     xk = t[k];
1699:     nz = ai[k+1] - ai[k] - 1;
1700:     while (nz--) t[*vj++] += (*v++) * xk;
1701:     t[k] = xk*aa[ai[k]];  /* aa[k] = 1/D(k) */
1702:   }

1704:   /* solve U*perm(x) = y by back substitution */
1705:   for (k=mbs-1; k>=0; k--) {
1706:     v  = aa + ai[k] + 1;
1707:     vj = aj + ai[k] + 1;
1708:     nz = ai[k+1] - ai[k] - 1;
1709:     while (nz--) t[k] += (*v++) * t[*vj++];
1710:     x[rp[k]] = t[k];
1711:   }

1713:   ISRestoreIndices(isrow,&rp);
1714:   VecRestoreArray(bb,&b);
1715:   VecRestoreArray(xx,&x);
1716:   PetscLogFlops(4.0*a->nz - 3*mbs);
1717:   return(0);
1718: }

1722: PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1723: {
1724:   Mat_SeqSBAIJ    *a   = (Mat_SeqSBAIJ*)A->data;
1725:   IS              isrow=a->row;
1726:   PetscErrorCode  ierr;
1727:   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1728:   const MatScalar *aa=a->a,*v;
1729:   PetscReal       diagk;
1730:   PetscScalar     *x,*b,xk;
1731:   PetscInt        nz,k;

1734:   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1735:   VecGetArray(bb,&b);
1736:   VecGetArray(xx,&x);
1737:   ISGetIndices(isrow,&rp);

1739:   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1740:   for (k=0; k<mbs; k++) {
1741:     v  = aa + ai[k];
1742:     vj = aj + ai[k];
1743:     xk = x[k];
1744:     nz = ai[k+1] - ai[k] - 1;
1745:     while (nz--) x[*vj++] += (*v++) * xk;

1747:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1748:     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1749:     x[k] = xk*PetscSqrtReal(diagk);
1750:   }
1751:   ISRestoreIndices(isrow,&rp);
1752:   VecRestoreArray(bb,&b);
1753:   VecRestoreArray(xx,&x);
1754:   PetscLogFlops(2.0*a->nz - mbs);
1755:   return(0);
1756: }

1760: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1761: {
1762:   Mat_SeqSBAIJ    *a   = (Mat_SeqSBAIJ*)A->data;
1763:   IS              isrow=a->row;
1764:   PetscErrorCode  ierr;
1765:   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1766:   const MatScalar *aa=a->a,*v;
1767:   PetscReal       diagk;
1768:   PetscScalar     *x,*b,xk;
1769:   PetscInt        nz,k;

1772:   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1773:   VecGetArray(bb,&b);
1774:   VecGetArray(xx,&x);
1775:   ISGetIndices(isrow,&rp);

1777:   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1778:   for (k=0; k<mbs; k++) {
1779:     v  = aa + ai[k] + 1;
1780:     vj = aj + ai[k] + 1;
1781:     xk = x[k];
1782:     nz = ai[k+1] - ai[k] - 1;
1783:     while (nz--) x[*vj++] += (*v++) * xk;

1785:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1786:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1787:     x[k] = xk*PetscSqrtReal(diagk);
1788:   }
1789:   ISRestoreIndices(isrow,&rp);
1790:   VecRestoreArray(bb,&b);
1791:   VecRestoreArray(xx,&x);
1792:   PetscLogFlops(2.0*a->nz - mbs);
1793:   return(0);
1794: }

1798: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1799: {
1800:   Mat_SeqSBAIJ    *a   = (Mat_SeqSBAIJ*)A->data;
1801:   IS              isrow=a->row;
1802:   PetscErrorCode  ierr;
1803:   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1804:   const MatScalar *aa=a->a,*v;
1805:   PetscReal       diagk;
1806:   PetscScalar     *x,*b,*t;
1807:   PetscInt        nz,k;

1810:   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1811:   VecGetArray(bb,&b);
1812:   VecGetArray(xx,&x);
1813:   t    = a->solve_work;
1814:   ISGetIndices(isrow,&rp);

1816:   for (k=mbs-1; k>=0; k--) {
1817:     v     = aa + ai[k];
1818:     vj    = aj + ai[k];
1819:     diagk = PetscRealPart(aa[adiag[k]]);
1820:     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1821:     t[k] = b[k] * PetscSqrtReal(diagk);
1822:     nz   = ai[k+1] - ai[k] - 1;
1823:     while (nz--) t[k] += (*v++) * t[*vj++];
1824:     x[rp[k]] = t[k];
1825:   }
1826:   ISRestoreIndices(isrow,&rp);
1827:   VecRestoreArray(bb,&b);
1828:   VecRestoreArray(xx,&x);
1829:   PetscLogFlops(2.0*a->nz - mbs);
1830:   return(0);
1831: }

1835: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1836: {
1837:   Mat_SeqSBAIJ    *a   = (Mat_SeqSBAIJ*)A->data;
1838:   IS              isrow=a->row;
1839:   PetscErrorCode  ierr;
1840:   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1841:   const MatScalar *aa=a->a,*v;
1842:   PetscReal       diagk;
1843:   PetscScalar     *x,*b,*t;
1844:   PetscInt        nz,k;

1847:   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1848:   VecGetArray(bb,&b);
1849:   VecGetArray(xx,&x);
1850:   t    = a->solve_work;
1851:   ISGetIndices(isrow,&rp);

1853:   for (k=mbs-1; k>=0; k--) {
1854:     v     = aa + ai[k] + 1;
1855:     vj    = aj + ai[k] + 1;
1856:     diagk = PetscRealPart(aa[ai[k]]);
1857:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1858:     t[k] = b[k] * PetscSqrtReal(diagk);
1859:     nz   = ai[k+1] - ai[k] - 1;
1860:     while (nz--) t[k] += (*v++) * t[*vj++];
1861:     x[rp[k]] = t[k];
1862:   }
1863:   ISRestoreIndices(isrow,&rp);
1864:   VecRestoreArray(bb,&b);
1865:   VecRestoreArray(xx,&x);
1866:   PetscLogFlops(2.0*a->nz - mbs);
1867:   return(0);
1868: }

1872: PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1873: {
1874:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;

1878:   if (A->rmap->bs == 1) {
1879:     MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);
1880:   } else {
1881:     IS              isrow=a->row;
1882:     const PetscInt  *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1883:     const MatScalar *aa=a->a,*v;
1884:     PetscScalar     *x,*b,*t;
1885:     PetscInt        nz,k,n,i,j;
1886:     if (bb->n > a->solves_work_n) {
1887:       PetscFree(a->solves_work);
1888:       PetscMalloc(bb->n*A->rmap->N*sizeof(PetscScalar),&a->solves_work);

1890:       a->solves_work_n = bb->n;
1891:     }
1892:     n    = bb->n;
1893:     VecGetArray(bb->v,&b);
1894:     VecGetArray(xx->v,&x);
1895:     t    = a->solves_work;

1897:     ISGetIndices(isrow,&rp);

1899:     /* solve U^T*D*y = perm(b) by forward substitution */
1900:     for (k=0; k<mbs; k++) {
1901:       for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */
1902:     }
1903:     for (k=0; k<mbs; k++) {
1904:       v  = aa + ai[k];
1905:       vj = aj + ai[k];
1906:       nz = ai[k+1] - ai[k] - 1;
1907:       for (j=0; j<nz; j++) {
1908:         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1909:         v++;vj++;
1910:       }
1911:       for (i=0; i<n; i++) t[n*k+i] *= aa[nz];  /* note: aa[nz] = 1/D(k) */
1912:     }

1914:     /* solve U*perm(x) = y by back substitution */
1915:     for (k=mbs-1; k>=0; k--) {
1916:       v  = aa + ai[k] - 1;
1917:       vj = aj + ai[k] - 1;
1918:       nz = ai[k+1] - ai[k] - 1;
1919:       for (j=0; j<nz; j++) {
1920:         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1921:         v++;vj++;
1922:       }
1923:       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1924:     }

1926:     ISRestoreIndices(isrow,&rp);
1927:     VecRestoreArray(bb->v,&b);
1928:     VecRestoreArray(xx->v,&x);
1929:     PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1930:   }
1931:   return(0);
1932: }

1936: PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx)
1937: {
1938:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;

1942:   if (A->rmap->bs == 1) {
1943:     MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);
1944:   } else {
1945:     IS              isrow=a->row;
1946:     const PetscInt  *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1947:     const MatScalar *aa=a->a,*v;
1948:     PetscScalar     *x,*b,*t;
1949:     PetscInt        nz,k,n,i;
1950:     if (bb->n > a->solves_work_n) {
1951:       PetscFree(a->solves_work);
1952:       PetscMalloc(bb->n*A->rmap->N*sizeof(PetscScalar),&a->solves_work);

1954:       a->solves_work_n = bb->n;
1955:     }
1956:     n    = bb->n;
1957:     VecGetArray(bb->v,&b);
1958:     VecGetArray(xx->v,&x);
1959:     t    = a->solves_work;

1961:     ISGetIndices(isrow,&rp);

1963:     /* solve U^T*D*y = perm(b) by forward substitution */
1964:     for (k=0; k<mbs; k++) {
1965:       for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];  /* values are stored interlaced in t */
1966:     }
1967:     for (k=0; k<mbs; k++) {
1968:       v  = aa + ai[k];
1969:       vj = aj + ai[k];
1970:       nz = ai[k+1] - ai[k];
1971:       while (nz--) {
1972:         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1973:         v++;vj++;
1974:       }
1975:       for (i=0; i<n; i++) t[n*k+i] *= aa[k];  /* note: aa[k] = 1/D(k) */
1976:     }

1978:     /* solve U*perm(x) = y by back substitution */
1979:     for (k=mbs-1; k>=0; k--) {
1980:       v  = aa + ai[k];
1981:       vj = aj + ai[k];
1982:       nz = ai[k+1] - ai[k];
1983:       while (nz--) {
1984:         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1985:         v++;vj++;
1986:       }
1987:       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1988:     }

1990:     ISRestoreIndices(isrow,&rp);
1991:     VecRestoreArray(bb->v,&b);
1992:     VecRestoreArray(xx->v,&x);
1993:     PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1994:   }
1995:   return(0);
1996: }

2000: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2001: {
2002:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2003:   PetscErrorCode    ierr;
2004:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag = a->diag;
2005:   const MatScalar   *aa=a->a,*v;
2006:   const PetscScalar *b;
2007:   PetscScalar       *x,xi;
2008:   PetscInt          nz,i,j;

2011:   VecGetArrayRead(bb,&b);
2012:   VecGetArray(xx,&x);

2014:   /* solve U^T*D*y = b by forward substitution */
2015:   PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
2016:   for (i=0; i<mbs; i++) {
2017:     v  = aa + ai[i];
2018:     vj = aj + ai[i];
2019:     xi = x[i];
2020:     nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
2021:     for (j=0; j<nz; j++) x[vj[j]] += v[j]* xi;
2022:     x[i] = xi*v[nz];  /* v[nz] = aa[diag[i]] = 1/D(i) */
2023:   }

2025:   /* solve U*x = y by backward substitution */
2026:   for (i=mbs-2; i>=0; i--) {
2027:     xi = x[i];
2028:     v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
2029:     vj = aj + adiag[i] - 1;
2030:     nz = ai[i+1] - ai[i] - 1;
2031:     for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
2032:     x[i] = xi;
2033:   }

2035:   VecRestoreArrayRead(bb,&b);
2036:   VecRestoreArray(xx,&x);
2037:   PetscLogFlops(4.0*a->nz - 3*mbs);
2038:   return(0);
2039: }

2043: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2044: {
2045:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
2047:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
2048:   MatScalar      *aa=a->a,*v;
2049:   PetscScalar    *x,*b,xk;
2050:   PetscInt       nz,*vj,k;

2053:   VecGetArray(bb,&b);
2054:   VecGetArray(xx,&x);

2056:   /* solve U^T*D*y = b by forward substitution */
2057:   PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
2058:   for (k=0; k<mbs; k++) {
2059:     v  = aa + ai[k] + 1;
2060:     vj = aj + ai[k] + 1;
2061:     xk = x[k];
2062:     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2063:     while (nz--) x[*vj++] += (*v++) * xk;
2064:     x[k] = xk*aa[ai[k]];  /* note: aa[diag[k]] = 1/D(k) */
2065:   }

2067:   /* solve U*x = y by back substitution */
2068:   for (k=mbs-2; k>=0; k--) {
2069:     v  = aa + ai[k] + 1;
2070:     vj = aj + ai[k] + 1;
2071:     xk = x[k];
2072:     nz = ai[k+1] - ai[k] - 1;
2073:     while (nz--) {
2074:       xk += (*v++) * x[*vj++];
2075:     }
2076:     x[k] = xk;
2077:   }

2079:   VecRestoreArray(bb,&b);
2080:   VecRestoreArray(xx,&x);
2081:   PetscLogFlops(4.0*a->nz - 3*mbs);
2082:   return(0);
2083: }

2087: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2088: {
2089:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
2090:   PetscErrorCode  ierr;
2091:   PetscInt        mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag;
2092:   const MatScalar *aa=a->a,*v;
2093:   PetscReal       diagk;
2094:   PetscScalar     *x,*b;
2095:   PetscInt        nz,*vj,k;

2098:   /* solve U^T*D^(1/2)*x = b by forward substitution */
2099:   VecGetArray(bb,&b);
2100:   VecGetArray(xx,&x);
2101:   PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
2102:   for (k=0; k<mbs; k++) {
2103:     v  = aa + ai[k];
2104:     vj = aj + ai[k];
2105:     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2106:     while (nz--) x[*vj++] += (*v++) * x[k];
2107:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2108:     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[adiag[k]]),PetscImaginaryPart(aa[adiag[k]]));
2109:     x[k] *= PetscSqrtReal(diagk);
2110:   }
2111:   VecRestoreArray(bb,&b);
2112:   VecRestoreArray(xx,&x);
2113:   PetscLogFlops(2.0*a->nz - mbs);
2114:   return(0);
2115: }

2119: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2120: {
2121:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
2123:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
2124:   MatScalar      *aa=a->a,*v;
2125:   PetscReal      diagk;
2126:   PetscScalar    *x,*b;
2127:   PetscInt       nz,*vj,k;

2130:   /* solve U^T*D^(1/2)*x = b by forward substitution */
2131:   VecGetArray(bb,&b);
2132:   VecGetArray(xx,&x);
2133:   PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
2134:   for (k=0; k<mbs; k++) {
2135:     v  = aa + ai[k] + 1;
2136:     vj = aj + ai[k] + 1;
2137:     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2138:     while (nz--) x[*vj++] += (*v++) * x[k];
2139:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2140:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]]));
2141:     x[k] *= PetscSqrtReal(diagk);
2142:   }
2143:   VecRestoreArray(bb,&b);
2144:   VecRestoreArray(xx,&x);
2145:   PetscLogFlops(2.0*a->nz - mbs);
2146:   return(0);
2147: }

2151: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2152: {
2153:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
2155:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag;
2156:   MatScalar      *aa=a->a,*v;
2157:   PetscReal      diagk;
2158:   PetscScalar    *x,*b;
2159:   PetscInt       nz,*vj,k;

2162:   /* solve D^(1/2)*U*x = b by back substitution */
2163:   VecGetArray(bb,&b);
2164:   VecGetArray(xx,&x);

2166:   for (k=mbs-1; k>=0; k--) {
2167:     v     = aa + ai[k];
2168:     vj    = aj + ai[k];
2169:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2170:     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2171:     x[k] = PetscSqrtReal(diagk)*b[k];
2172:     nz   = ai[k+1] - ai[k] - 1;
2173:     while (nz--) x[k] += (*v++) * x[*vj++];
2174:   }
2175:   VecRestoreArray(bb,&b);
2176:   VecRestoreArray(xx,&x);
2177:   PetscLogFlops(2.0*a->nz - mbs);
2178:   return(0);
2179: }

2183: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2184: {
2185:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
2187:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
2188:   MatScalar      *aa=a->a,*v;
2189:   PetscReal      diagk;
2190:   PetscScalar    *x,*b;
2191:   PetscInt       nz,*vj,k;

2194:   /* solve D^(1/2)*U*x = b by back substitution */
2195:   VecGetArray(bb,&b);
2196:   VecGetArray(xx,&x);

2198:   for (k=mbs-1; k>=0; k--) {
2199:     v     = aa + ai[k] + 1;
2200:     vj    = aj + ai[k] + 1;
2201:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2202:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2203:     x[k] = PetscSqrtReal(diagk)*b[k];
2204:     nz   = ai[k+1] - ai[k] - 1;
2205:     while (nz--) x[k] += (*v++) * x[*vj++];
2206:   }
2207:   VecRestoreArray(bb,&b);
2208:   VecRestoreArray(xx,&x);
2209:   PetscLogFlops(2.0*a->nz - mbs);
2210:   return(0);
2211: }

2213: /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2216: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info)
2217: {
2218:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
2220:   const PetscInt *rip,mbs = a->mbs,*ai,*aj;
2221:   PetscInt       *jutmp,bs = A->rmap->bs,bs2=a->bs2,i;
2222:   PetscInt       m,reallocs = 0,*levtmp;
2223:   PetscInt       *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
2224:   PetscInt       incrlev,*lev,shift,prow,nz;
2225:   PetscReal      f = info->fill,levels = info->levels;
2226:   PetscBool      perm_identity;

2229:   /* check whether perm is the identity mapping */
2230:   ISIdentity(perm,&perm_identity);

2232:   if (perm_identity) {
2233:     a->permute = PETSC_FALSE;
2234:     ai         = a->i; aj = a->j;
2235:   } else { /*  non-trivial permutation */
2236:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2237:     a->permute = PETSC_TRUE;
2238:     MatReorderingSeqSBAIJ(A, perm);
2239:     ai         = a->inew; aj = a->jnew;
2240:   }

2242:   /* initialization */
2243:   ISGetIndices(perm,&rip);
2244:   umax  = (PetscInt)(f*ai[mbs] + 1);
2245:   PetscMalloc(umax*sizeof(PetscInt),&lev);
2246:   umax += mbs + 1;
2247:   shift = mbs + 1;
2248:   PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);
2249:   PetscMalloc(umax*sizeof(PetscInt),&ju);
2250:   iu[0] = mbs + 1;
2251:   juidx = mbs + 1;
2252:   /* prowl: linked list for pivot row */
2253:   PetscMalloc3(mbs,PetscInt,&prowl,mbs,PetscInt,&q,mbs,PetscInt,&levtmp);
2254:   /* q: linked list for col index */

2256:   for (i=0; i<mbs; i++) {
2257:     prowl[i] = mbs;
2258:     q[i]     = 0;
2259:   }

2261:   /* for each row k */
2262:   for (k=0; k<mbs; k++) {
2263:     nzk  = 0;
2264:     q[k] = mbs;
2265:     /* copy current row into linked list */
2266:     nz = ai[rip[k]+1] - ai[rip[k]];
2267:     j  = ai[rip[k]];
2268:     while (nz--) {
2269:       vj = rip[aj[j++]];
2270:       if (vj > k) {
2271:         qm = k;
2272:         do {
2273:           m = qm; qm = q[m];
2274:         } while (qm < vj);
2275:         if (qm == vj) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n");
2276:         nzk++;
2277:         q[m]       = vj;
2278:         q[vj]      = qm;
2279:         levtmp[vj] = 0;   /* initialize lev for nonzero element */
2280:       }
2281:     }

2283:     /* modify nonzero structure of k-th row by computing fill-in
2284:        for each row prow to be merged in */
2285:     prow = k;
2286:     prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */

2288:     while (prow < k) {
2289:       /* merge row prow into k-th row */
2290:       jmin = iu[prow] + 1;
2291:       jmax = iu[prow+1];
2292:       qm   = k;
2293:       for (j=jmin; j<jmax; j++) {
2294:         incrlev = lev[j-shift] + 1;
2295:         if (incrlev > levels) continue;
2296:         vj = ju[j];
2297:         do {
2298:           m = qm; qm = q[m];
2299:         } while (qm < vj);
2300:         if (qm != vj) {      /* a fill */
2301:           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
2302:           levtmp[vj] = incrlev;
2303:         } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2304:       }
2305:       prow = prowl[prow]; /* next pivot row */
2306:     }

2308:     /* add k to row list for first nonzero element in k-th row */
2309:     if (nzk > 1) {
2310:       i        = q[k]; /* col value of first nonzero element in k_th row of U */
2311:       prowl[k] = prowl[i]; prowl[i] = k;
2312:     }
2313:     iu[k+1] = iu[k] + nzk;

2315:     /* allocate more space to ju and lev if needed */
2316:     if (iu[k+1] > umax) {
2317:       /* estimate how much additional space we will need */
2318:       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2319:       /* just double the memory each time */
2320:       maxadd = umax;
2321:       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
2322:       umax += maxadd;

2324:       /* allocate a longer ju */
2325:       PetscMalloc(umax*sizeof(PetscInt),&jutmp);
2326:       PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));
2327:       PetscFree(ju);
2328:       ju   = jutmp;

2330:       PetscMalloc(umax*sizeof(PetscInt),&jutmp);
2331:       PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(PetscInt));
2332:       PetscFree(lev);
2333:       lev       = jutmp;
2334:       reallocs += 2; /* count how many times we realloc */
2335:     }

2337:     /* save nonzero structure of k-th row in ju */
2338:     i=k;
2339:     while (nzk--) {
2340:       i                = q[i];
2341:       ju[juidx]        = i;
2342:       lev[juidx-shift] = levtmp[i];
2343:       juidx++;
2344:     }
2345:   }

2347: #if defined(PETSC_USE_INFO)
2348:   if (ai[mbs] != 0) {
2349:     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2350:     PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);
2351:     PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
2352:     PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);
2353:     PetscInfo(A,"for best performance.\n");
2354:   } else {
2355:     PetscInfo(A,"Empty matrix.\n");
2356:   }
2357: #endif

2359:   ISRestoreIndices(perm,&rip);
2360:   PetscFree3(prowl,q,levtmp);
2361:   PetscFree(lev);

2363:   /* put together the new matrix */
2364:   MatSeqSBAIJSetPreallocation_SeqSBAIJ(B,bs,0,NULL);

2366:   /* PetscLogObjectParent(B,iperm); */
2367:   b    = (Mat_SeqSBAIJ*)(B)->data;
2368:   PetscFree2(b->imax,b->ilen);

2370:   b->singlemalloc = PETSC_FALSE;
2371:   b->free_a       = PETSC_TRUE;
2372:   b->free_ij      = PETSC_TRUE;
2373:   /* the next line frees the default space generated by the Create() */
2374:   PetscFree3(b->a,b->j,b->i);
2375:   PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);
2376:   b->j    = ju;
2377:   b->i    = iu;
2378:   b->diag = 0;
2379:   b->ilen = 0;
2380:   b->imax = 0;

2382:   ISDestroy(&b->row);
2383:   ISDestroy(&b->icol);
2384:   b->row  = perm;
2385:   b->icol = perm;
2386:   PetscObjectReference((PetscObject)perm);
2387:   PetscObjectReference((PetscObject)perm);
2388:   PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);
2389:   /* In b structure:  Free imax, ilen, old a, old j.
2390:      Allocate idnew, solve_work, new a, new j */
2391:   PetscLogObjectMemory(B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
2392:   b->maxnz = b->nz = iu[mbs];

2394:   (B)->info.factor_mallocs   = reallocs;
2395:   (B)->info.fill_ratio_given = f;
2396:   if (ai[mbs] != 0) {
2397:     (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2398:   } else {
2399:     (B)->info.fill_ratio_needed = 0.0;
2400:   }
2401:   MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);
2402:   return(0);
2403: }

2405: /*
2406:   See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2407: */
2408: #include <petscbt.h>
2409: #include <../src/mat/utils/freespace.h>
2412: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2413: {
2414:   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b;
2415:   PetscErrorCode     ierr;
2416:   PetscBool          perm_identity,free_ij = PETSC_TRUE,missing;
2417:   PetscInt           bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j;
2418:   const PetscInt     *rip;
2419:   PetscInt           reallocs=0,i,*ui,*udiag,*cols;
2420:   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2421:   PetscInt           nlnk,*lnk,*lnk_lvl=NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr;
2422:   PetscReal          fill          =info->fill,levels=info->levels;
2423:   PetscFreeSpaceList free_space    =NULL,current_space=NULL;
2424:   PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2425:   PetscBT            lnkbt;

2428:   if (bs > 1) {
2429:     MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);
2430:     return(0);
2431:   }
2432:   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
2433:   MatMissingDiagonal(A,&missing,&d);
2434:   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);

2436:   /* check whether perm is the identity mapping */
2437:   ISIdentity(perm,&perm_identity);
2438:   if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2439:   a->permute = PETSC_FALSE;

2441:   PetscMalloc((am+1)*sizeof(PetscInt),&ui);
2442:   PetscMalloc((am+1)*sizeof(PetscInt),&udiag);
2443:   ui[0] = 0;

2445:   /* ICC(0) without matrix ordering: simply rearrange column indices */
2446:   if (!levels) {
2447:     /* reuse the column pointers and row offsets for memory savings */
2448:     for (i=0; i<am; i++) {
2449:       ncols    = ai[i+1] - ai[i];
2450:       ui[i+1]  = ui[i] + ncols;
2451:       udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */
2452:     }
2453:     PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);
2454:     cols = uj;
2455:     for (i=0; i<am; i++) {
2456:       aj    = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2457:       ncols = ai[i+1] - ai[i] -1;
2458:       for (j=0; j<ncols; j++) *cols++ = aj[j];
2459:       *cols++ = i; /* diagoanl is located as the last entry of U(i,:) */
2460:     }
2461:   } else { /* case: levels>0 */
2462:     ISGetIndices(perm,&rip);

2464:     /* initialization */
2465:     /* jl: linked list for storing indices of the pivot rows
2466:        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2467:     PetscMalloc4(am,PetscInt*,&uj_ptr,am,PetscInt*,&uj_lvl_ptr,am,PetscInt,&il,am,PetscInt,&jl);
2468:     for (i=0; i<am; i++) {
2469:       jl[i] = am; il[i] = 0;
2470:     }

2472:     /* create and initialize a linked list for storing column indices of the active row k */
2473:     nlnk = am + 1;
2474:     PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);

2476:     /* initial FreeSpace size is fill*(ai[am]+1) */
2477:     PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);

2479:     current_space = free_space;

2481:     PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);

2483:     current_space_lvl = free_space_lvl;

2485:     for (k=0; k<am; k++) {  /* for each active row k */
2486:       /* initialize lnk by the column indices of row k */
2487:       nzk   = 0;
2488:       ncols = ai[k+1] - ai[k];
2489:       if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k);
2490:       cols = aj+ai[k];
2491:       PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
2492:       nzk += nlnk;

2494:       /* update lnk by computing fill-in for each pivot row to be merged in */
2495:       prow = jl[k]; /* 1st pivot row */

2497:       while (prow < k) {
2498:         nextprow = jl[prow];

2500:         /* merge prow into k-th row */
2501:         jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2502:         jmax  = ui[prow+1];
2503:         ncols = jmax-jmin;
2504:         i     = jmin - ui[prow];

2506:         cols = uj_ptr[prow] + i;  /* points to the 2nd nzero entry in U(prow,k:am-1) */
2507:         uj   = uj_lvl_ptr[prow] + i;  /* levels of cols */
2508:         j    = *(uj - 1);
2509:         PetscICCLLAddSorted(ncols,cols,levels,uj,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2510:         nzk += nlnk;

2512:         /* update il and jl for prow */
2513:         if (jmin < jmax) {
2514:           il[prow] = jmin;

2516:           j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2517:         }
2518:         prow = nextprow;
2519:       }

2521:       /* if free space is not available, make more free space */
2522:       if (current_space->local_remaining<nzk) {
2523:         i    = am - k + 1; /* num of unfactored rows */
2524:         i   *= PetscMin(nzk, i-1); /* i*nzk, i*(i-1): estimated and max additional space needed */
2525:         PetscFreeSpaceGet(i,&current_space);
2526:         PetscFreeSpaceGet(i,&current_space_lvl);
2527:         reallocs++;
2528:       }

2530:       /* copy data into free_space and free_space_lvl, then initialize lnk */
2531:       if (nzk == 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Empty row %D in ICC matrix factor",k);
2532:       PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);

2534:       /* add the k-th row into il and jl */
2535:       if (nzk > 1) {
2536:         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2537:         jl[k] = jl[i]; jl[i] = k;
2538:         il[k] = ui[k] + 1;
2539:       }
2540:       uj_ptr[k]     = current_space->array;
2541:       uj_lvl_ptr[k] = current_space_lvl->array;

2543:       current_space->array               += nzk;
2544:       current_space->local_used          += nzk;
2545:       current_space->local_remaining     -= nzk;
2546:       current_space_lvl->array           += nzk;
2547:       current_space_lvl->local_used      += nzk;
2548:       current_space_lvl->local_remaining -= nzk;

2550:       ui[k+1] = ui[k] + nzk;
2551:     }

2553:     ISRestoreIndices(perm,&rip);
2554:     PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);

2556:     /* destroy list of free space and other temporary array(s) */
2557:     PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);
2558:     PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag); /* store matrix factor  */
2559:     PetscIncompleteLLDestroy(lnk,lnkbt);
2560:     PetscFreeSpaceDestroy(free_space_lvl);

2562:   } /* end of case: levels>0 || (levels=0 && !perm_identity) */

2564:   /* put together the new matrix in MATSEQSBAIJ format */
2565:   MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,NULL);

2567:   b    = (Mat_SeqSBAIJ*)(fact)->data;
2568:   PetscFree2(b->imax,b->ilen);

2570:   b->singlemalloc = PETSC_FALSE;
2571:   b->free_a       = PETSC_TRUE;
2572:   b->free_ij      = free_ij;

2574:   PetscMalloc((ui[am]+1)*sizeof(MatScalar),&b->a);

2576:   b->j         = uj;
2577:   b->i         = ui;
2578:   b->diag      = udiag;
2579:   b->free_diag = PETSC_TRUE;
2580:   b->ilen      = 0;
2581:   b->imax      = 0;
2582:   b->row       = perm;
2583:   b->col       = perm;

2585:   PetscObjectReference((PetscObject)perm);
2586:   PetscObjectReference((PetscObject)perm);

2588:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

2590:   PetscMalloc((am+1)*sizeof(PetscScalar),&b->solve_work);
2591:   PetscLogObjectMemory(fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));

2593:   b->maxnz = b->nz = ui[am];

2595:   fact->info.factor_mallocs   = reallocs;
2596:   fact->info.fill_ratio_given = fill;
2597:   if (ai[am] != 0) {
2598:     fact->info.fill_ratio_needed = ((PetscReal)ui[am])/ai[am];
2599:   } else {
2600:     fact->info.fill_ratio_needed = 0.0;
2601:   }
2602: #if defined(PETSC_USE_INFO)
2603:   if (ai[am] != 0) {
2604:     PetscReal af = fact->info.fill_ratio_needed;
2605:     PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);
2606:     PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
2607:     PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);
2608:   } else {
2609:     PetscInfo(A,"Empty matrix.\n");
2610:   }
2611: #endif
2612:   fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2613:   return(0);
2614: }

2618: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2619: {
2620:   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
2621:   Mat_SeqSBAIJ       *b;
2622:   PetscErrorCode     ierr;
2623:   PetscBool          perm_identity,free_ij = PETSC_TRUE,missing;
2624:   PetscInt           bs=A->rmap->bs,am=a->mbs,d;
2625:   const PetscInt     *cols,*rip,*ai=a->i,*aj=a->j;
2626:   PetscInt           reallocs=0,i,*ui;
2627:   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2628:   PetscInt           nlnk,*lnk,*lnk_lvl=NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
2629:   PetscReal          fill          =info->fill,levels=info->levels,ratio_needed;
2630:   PetscFreeSpaceList free_space    =NULL,current_space=NULL;
2631:   PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2632:   PetscBT            lnkbt;

2635:   MatMissingDiagonal(A,&missing,&d);
2636:   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);

2638:   /*
2639:    This code originally uses Modified Sparse Row (MSR) storage
2640:    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2641:    Then it is rewritten so the factor B takes seqsbaij format. However the associated
2642:    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2643:    thus the original code in MSR format is still used for these cases.
2644:    The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2645:    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2646:   */
2647:   if (bs > 1) {
2648:     MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);
2649:     return(0);
2650:   }

2652:   /* check whether perm is the identity mapping */
2653:   ISIdentity(perm,&perm_identity);
2654:   if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2655:   a->permute = PETSC_FALSE;

2657:   /* special case that simply copies fill pattern */
2658:   if (!levels) {
2659:     /* reuse the column pointers and row offsets for memory savings */
2660:     ui           = a->i;
2661:     uj           = a->j;
2662:     free_ij      = PETSC_FALSE;
2663:     ratio_needed = 1.0;
2664:   } else { /* case: levels>0 */
2665:     ISGetIndices(perm,&rip);

2667:     /* initialization */
2668:     PetscMalloc((am+1)*sizeof(PetscInt),&ui);
2669:     ui[0] = 0;

2671:     /* jl: linked list for storing indices of the pivot rows
2672:        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2673:     PetscMalloc4(am,PetscInt*,&uj_ptr,am,PetscInt*,&uj_lvl_ptr,am,PetscInt,&il,am,PetscInt,&jl);
2674:     for (i=0; i<am; i++) {
2675:       jl[i] = am; il[i] = 0;
2676:     }

2678:     /* create and initialize a linked list for storing column indices of the active row k */
2679:     nlnk = am + 1;
2680:     PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);

2682:     /* initial FreeSpace size is fill*(ai[am]+1) */
2683:     PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);

2685:     current_space = free_space;

2687:     PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);

2689:     current_space_lvl = free_space_lvl;

2691:     for (k=0; k<am; k++) {  /* for each active row k */
2692:       /* initialize lnk by the column indices of row rip[k] */
2693:       nzk   = 0;
2694:       ncols = ai[rip[k]+1] - ai[rip[k]];
2695:       cols  = aj+ai[rip[k]];
2696:       PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
2697:       nzk  += nlnk;

2699:       /* update lnk by computing fill-in for each pivot row to be merged in */
2700:       prow = jl[k]; /* 1st pivot row */

2702:       while (prow < k) {
2703:         nextprow = jl[prow];

2705:         /* merge prow into k-th row */
2706:         jmin     = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2707:         jmax     = ui[prow+1];
2708:         ncols    = jmax-jmin;
2709:         i        = jmin - ui[prow];
2710:         cols     = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2711:         j        = *(uj_lvl_ptr[prow] + i - 1);
2712:         cols_lvl = uj_lvl_ptr[prow]+i;
2713:         PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2714:         nzk     += nlnk;

2716:         /* update il and jl for prow */
2717:         if (jmin < jmax) {
2718:           il[prow] = jmin;
2719:           j        = *cols;
2720:           jl[prow] = jl[j];
2721:           jl[j]    = prow;
2722:         }
2723:         prow = nextprow;
2724:       }

2726:       /* if free space is not available, make more free space */
2727:       if (current_space->local_remaining<nzk) {
2728:         i    = am - k + 1; /* num of unfactored rows */
2729:         i    = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2730:         PetscFreeSpaceGet(i,&current_space);
2731:         PetscFreeSpaceGet(i,&current_space_lvl);
2732:         reallocs++;
2733:       }

2735:       /* copy data into free_space and free_space_lvl, then initialize lnk */
2736:       PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);

2738:       /* add the k-th row into il and jl */
2739:       if (nzk-1 > 0) {
2740:         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2741:         jl[k] = jl[i]; jl[i] = k;
2742:         il[k] = ui[k] + 1;
2743:       }
2744:       uj_ptr[k]     = current_space->array;
2745:       uj_lvl_ptr[k] = current_space_lvl->array;

2747:       current_space->array               += nzk;
2748:       current_space->local_used          += nzk;
2749:       current_space->local_remaining     -= nzk;
2750:       current_space_lvl->array           += nzk;
2751:       current_space_lvl->local_used      += nzk;
2752:       current_space_lvl->local_remaining -= nzk;

2754:       ui[k+1] = ui[k] + nzk;
2755:     }

2757:     ISRestoreIndices(perm,&rip);
2758:     PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);

2760:     /* destroy list of free space and other temporary array(s) */
2761:     PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);
2762:     PetscFreeSpaceContiguous(&free_space,uj);
2763:     PetscIncompleteLLDestroy(lnk,lnkbt);
2764:     PetscFreeSpaceDestroy(free_space_lvl);
2765:     if (ai[am] != 0) {
2766:       ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2767:     } else {
2768:       ratio_needed = 0.0;
2769:     }
2770:   } /* end of case: levels>0 || (levels=0 && !perm_identity) */

2772:   /* put together the new matrix in MATSEQSBAIJ format */
2773:   MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,NULL);

2775:   b = (Mat_SeqSBAIJ*)(fact)->data;

2777:   PetscFree2(b->imax,b->ilen);

2779:   b->singlemalloc = PETSC_FALSE;
2780:   b->free_a       = PETSC_TRUE;
2781:   b->free_ij      = free_ij;

2783:   PetscMalloc((ui[am]+1)*sizeof(MatScalar),&b->a);

2785:   b->j             = uj;
2786:   b->i             = ui;
2787:   b->diag          = 0;
2788:   b->ilen          = 0;
2789:   b->imax          = 0;
2790:   b->row           = perm;
2791:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

2793:   PetscObjectReference((PetscObject)perm);
2794:   b->icol = perm;
2795:   PetscObjectReference((PetscObject)perm);
2796:   PetscMalloc((am+1)*sizeof(PetscScalar),&b->solve_work);

2798:   b->maxnz = b->nz = ui[am];

2800:   fact->info.factor_mallocs    = reallocs;
2801:   fact->info.fill_ratio_given  = fill;
2802:   fact->info.fill_ratio_needed = ratio_needed;
2803: #if defined(PETSC_USE_INFO)
2804:   if (ai[am] != 0) {
2805:     PetscReal af = fact->info.fill_ratio_needed;
2806:     PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);
2807:     PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
2808:     PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);
2809:   } else {
2810:     PetscInfo(A,"Empty matrix.\n");
2811:   }
2812: #endif
2813:   if (perm_identity) {
2814:     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2815:   } else {
2816:     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2817:   }
2818:   return(0);
2819: }