Actual source code: sbaijfact2.c

petsc-3.3-p7 2013-05-11
  2: /*
  3:     Factorization code for SBAIJ format. 
  4: */

  6: #include <../src/mat/impls/sbaij/seq/sbaij.h>
  7: #include <../src/mat/impls/baij/seq/baij.h>
  8: #include <../src/mat/blockinvert.h>

 12: PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
 13: {
 14:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
 15:   IS              isrow=a->row;
 16:   PetscInt        mbs=a->mbs,*ai=a->i,*aj=a->j;
 17:   PetscErrorCode  ierr;
 18:   const PetscInt  *r;
 19:   PetscInt        nz,*vj,k,idx,k1;
 20:   PetscInt        bs=A->rmap->bs,bs2 = a->bs2;
 21:   MatScalar       *aa=a->a,*v,*diag;
 22:   PetscScalar     *x,*xk,*xj,*b,*xk_tmp,*t;

 25:   VecGetArray(bb,&b);
 26:   VecGetArray(xx,&x);
 27:   t  = a->solve_work;
 28:   ISGetIndices(isrow,&r);
 29:   PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);

 31:   /* solve U^T * D * y = b by forward substitution */
 32:   xk = t;
 33:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
 34:     idx   = bs*r[k];
 35:     for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
 36:   }
 37:   for (k=0; k<mbs; k++){
 38:     v  = aa + bs2*ai[k];
 39:     xk = t + k*bs;      /* Dk*xk = k-th block of x */
 40:     PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
 41:     nz = ai[k+1] - ai[k];
 42:     vj = aj + ai[k];
 43:     xj = t + (*vj)*bs;  /* *vj-th block of x, *vj>k */
 44:     while (nz--) {
 45:       /* x(:) += U(k,:)^T*(Dk*xk) */
 46:       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
 47:       vj++; xj = t + (*vj)*bs;
 48:       v += bs2;
 49:     }
 50:     /* xk = inv(Dk)*(Dk*xk) */
 51:     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
 52:     PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
 53:   }

 55:   /* solve U*x = y by back substitution */
 56:   for (k=mbs-1; k>=0; k--){
 57:     v  = aa + bs2*ai[k];
 58:     xk = t + k*bs;        /* xk */
 59:     nz = ai[k+1] - ai[k];
 60:     vj = aj + ai[k];
 61:     xj = t + (*vj)*bs;
 62:     while (nz--) {
 63:       /* xk += U(k,:)*x(:) */
 64:       PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
 65:       vj++;
 66:       v += bs2; xj = t + (*vj)*bs;
 67:     }
 68:     idx   = bs*r[k];
 69:     for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
 70:   }

 72:   PetscFree(xk_tmp);
 73:   ISRestoreIndices(isrow,&r);
 74:   VecRestoreArray(bb,&b);
 75:   VecRestoreArray(xx,&x);
 76:   PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);
 77:   return(0);
 78: }

 82: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
 83: {
 85:   SETERRQ(PETSC_COMM_SELF,1,"not implemented yet");
 86:   return(0);
 87: }

 91: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
 92: {
 94:   SETERRQ(PETSC_COMM_SELF,1,"not implemented yet");
 95:   return(0);
 96: }

100: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
101: {
103:   PetscInt       nz,*vj,k;
104:   PetscInt       bs2 = bs*bs;
105:   MatScalar      *v,*diag;
106:   PetscScalar    *xk,*xj,*xk_tmp;
107: 
109:   PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);
110:   for (k=0; k<mbs; k++){
111:     v  = aa + bs2*ai[k];
112:     xk = x + k*bs;      /* Dk*xk = k-th block of x */
113:     PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
114:     nz = ai[k+1] - ai[k];
115:     vj = aj + ai[k];
116:     xj = x + (*vj)*bs;  /* *vj-th block of x, *vj>k */
117:     while (nz--) {
118:       /* x(:) += U(k,:)^T*(Dk*xk) */
119:       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
120:       vj++; xj = x + (*vj)*bs;
121:       v += bs2;
122:     }
123:     /* xk = inv(Dk)*(Dk*xk) */
124:     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
125:     PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
126:   }
127:   PetscFree(xk_tmp);
128:   return(0);
129: }

133: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
134: {
135:   PetscInt       nz,*vj,k;
136:   PetscInt       bs2 = bs*bs;
137:   MatScalar      *v;
138:   PetscScalar    *xk,*xj;

141:   for (k=mbs-1; k>=0; k--){
142:     v  = aa + bs2*ai[k];
143:     xk = x + k*bs;        /* xk */
144:     nz = ai[k+1] - ai[k];
145:     vj = aj + ai[k];
146:     xj = x + (*vj)*bs;
147:     while (nz--) {
148:       /* xk += U(k,:)*x(:) */
149:       PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
150:       vj++;
151:       v += bs2; xj = x + (*vj)*bs;
152:     }
153:   }
154:   return(0);
155: }

159: PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
160: {
161:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
163:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
164:   PetscInt       bs=A->rmap->bs;
165:   MatScalar      *aa=a->a;
166:   PetscScalar    *x,*b;

169:   VecGetArray(bb,&b);
170:   VecGetArray(xx,&x);

172:   /* solve U^T * D * y = b by forward substitution */
173:   PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar)); /* x <- b */
174:   MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);

176:   /* solve U*x = y by back substitution */
177:   MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);

179:   VecRestoreArray(bb,&b);
180:   VecRestoreArray(xx,&x);
181:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
182:   return(0);
183: }

187: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
188: {
189:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
191:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
192:   PetscInt       bs=A->rmap->bs;
193:   MatScalar      *aa=a->a;
194:   PetscScalar    *x,*b;

197:   VecGetArray(bb,&b);
198:   VecGetArray(xx,&x);
199:   PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar)); /* x <- b */
200:   MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
201:   VecRestoreArray(bb,&b);
202:   VecRestoreArray(xx,&x);
203:   PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
204:   return(0);
205: }

209: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
210: {
211:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
213:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
214:   PetscInt       bs=A->rmap->bs;
215:   MatScalar      *aa=a->a;
216:   PetscScalar    *x,*b;

219:   VecGetArray(bb,&b);
220:   VecGetArray(xx,&x);
221:   PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));
222:   MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
223:   VecRestoreArray(bb,&b);
224:   VecRestoreArray(xx,&x);
225:   PetscLogFlops(2.0*bs2*(a->nz-mbs));
226:   return(0);
227: }

231: PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
232: {
233:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
234:   IS             isrow=a->row;
235:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2,bs=A->rmap->bs;
237:   const PetscInt *r;
238:   PetscInt       nz,*vj,k,idx;
239:   MatScalar      *aa=a->a,*v,*d;
240:   PetscScalar    *x,*b,x0,x1,x2,x3,x4,x5,x6,*t,*tp;

243:   VecGetArray(bb,&b);
244:   VecGetArray(xx,&x);
245:   t  = a->solve_work;
246:   ISGetIndices(isrow,&r);

248:   /* solve U^T * D * y = b by forward substitution */
249:   tp = t;
250:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
251:     idx   = 7*r[k];
252:     tp[0] = b[idx];
253:     tp[1] = b[idx+1];
254:     tp[2] = b[idx+2];
255:     tp[3] = b[idx+3];
256:     tp[4] = b[idx+4];
257:     tp[5] = b[idx+5];
258:     tp[6] = b[idx+6];
259:     tp += 7;
260:   }
261: 
262:   for (k=0; k<mbs; k++){
263:     v  = aa + 49*ai[k];
264:     vj = aj + ai[k];
265:     tp = t + k*7;
266:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
267:     nz = ai[k+1] - ai[k];
268:     tp = t + (*vj)*7;
269:     while (nz--) {
270:       tp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
271:       tp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
272:       tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
273:       tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
274:       tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
275:       tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
276:       tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
277:       vj++; tp = t + (*vj)*7;
278:       v += 49;
279:     }

281:     /* xk = inv(Dk)*(Dk*xk) */
282:     d  = aa+k*49;          /* ptr to inv(Dk) */
283:     tp    = t + k*7;
284:     tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
285:     tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
286:     tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
287:     tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
288:     tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
289:     tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
290:     tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
291:   }

293:   /* solve U*x = y by back substitution */
294:   for (k=mbs-1; k>=0; k--){
295:     v  = aa + 49*ai[k];
296:     vj = aj + ai[k];
297:     tp    = t + k*7;
298:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  x6=tp[6]; /* xk */
299:     nz = ai[k+1] - ai[k];
300: 
301:     tp = t + (*vj)*7;
302:     while (nz--) {
303:       /* xk += U(k,:)*x(:) */
304:       x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
305:       x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
306:       x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
307:       x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
308:       x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
309:       x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
310:       x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
311:       vj++; tp = t + (*vj)*7;
312:       v += 49;
313:     }
314:     tp    = t + k*7;
315:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
316:     idx   = 7*r[k];
317:     x[idx]     = x0;
318:     x[idx+1]   = x1;
319:     x[idx+2]   = x2;
320:     x[idx+3]   = x3;
321:     x[idx+4]   = x4;
322:     x[idx+5]   = x5;
323:     x[idx+6]   = x6;
324:   }

326:   ISRestoreIndices(isrow,&r);
327:   VecRestoreArray(bb,&b);
328:   VecRestoreArray(xx,&x);
329:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
330:   return(0);
331: }

335: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
336: {
337:   MatScalar      *v,*d;
338:   PetscScalar    *xp,x0,x1,x2,x3,x4,x5,x6;
339:   PetscInt       nz,*vj,k;

342:   for (k=0; k<mbs; k++){
343:     v  = aa + 49*ai[k];
344:     xp = x + k*7;
345:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
346:     nz = ai[k+1] - ai[k];
347:     vj = aj + ai[k];
348:     xp = x + (*vj)*7;
349:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
350:     PetscPrefetchBlock(v+49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
351:     while (nz--) {
352:       /* x(:) += U(k,:)^T*(Dk*xk) */
353:       xp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
354:       xp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
355:       xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
356:       xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
357:       xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
358:       xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
359:       xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
360:       vj++; xp = x + (*vj)*7;
361:       v += 49;
362:     }
363:     /* xk = inv(Dk)*(Dk*xk) */
364:     d  = aa+k*49;          /* ptr to inv(Dk) */
365:     xp = x + k*7;
366:     xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
367:     xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
368:     xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
369:     xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
370:     xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
371:     xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
372:     xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
373:   }
374:   return(0);
375: }

379: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
380: {
381:   MatScalar      *v;
382:   PetscScalar    *xp,x0,x1,x2,x3,x4,x5,x6;
383:   PetscInt       nz,*vj,k;

386:   for (k=mbs-1; k>=0; k--){
387:     v  = aa + 49*ai[k];
388:     xp = x + k*7;
389:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
390:     nz = ai[k+1] - ai[k];
391:     vj = aj + ai[k];
392:     xp = x + (*vj)*7;
393:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
394:     PetscPrefetchBlock(v-49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
395:     while (nz--) {
396:       /* xk += U(k,:)*x(:) */
397:       x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
398:       x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
399:       x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
400:       x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
401:       x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
402:       x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
403:       x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
404:       vj++;
405:       v += 49; xp = x + (*vj)*7;
406:     }
407:     xp = x + k*7;
408:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
409:   }
410:   return(0);
411: }

415: PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
416: {
417:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
419:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
420:   MatScalar      *aa=a->a;
421:   PetscScalar    *x,*b;

424:   VecGetArray(bb,&b);
425:   VecGetArray(xx,&x);
426: 
427:   /* solve U^T * D * y = b by forward substitution */
428:   PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar)); /* x <- b */
429:   MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);

431:   /* solve U*x = y by back substitution */
432:   MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);

434:   VecRestoreArray(bb,&b);
435:   VecRestoreArray(xx,&x);
436:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
437:   return(0);
438: }

442: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
443: {
444:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
446:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
447:   MatScalar      *aa=a->a;
448:   PetscScalar    *x,*b;

451:   VecGetArray(bb,&b);
452:   VecGetArray(xx,&x);
453:   PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));
454:   MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
455:   VecRestoreArray(bb,&b);
456:   VecRestoreArray(xx,&x);
457:   PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
458:   return(0);
459: }

463: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
464: {
465:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
467:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
468:   MatScalar      *aa=a->a;
469:   PetscScalar    *x,*b;

472:   VecGetArray(bb,&b);
473:   VecGetArray(xx,&x);
474:   PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));
475:   MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
476:   VecRestoreArray(bb,&b);
477:   VecRestoreArray(xx,&x);
478:   PetscLogFlops(2.0*bs2*(a->nz-mbs));
479:   return(0);
480: }

484: PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
485: {
486:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
487:   IS             isrow=a->row;
488:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
490:   const PetscInt *r;
491:   PetscInt       nz,*vj,k,idx;
492:   MatScalar      *aa=a->a,*v,*d;
493:   PetscScalar    *x,*b,x0,x1,x2,x3,x4,x5,*t,*tp;

496:   VecGetArray(bb,&b);
497:   VecGetArray(xx,&x);
498:   t  = a->solve_work;
499:   ISGetIndices(isrow,&r);

501:   /* solve U^T * D * y = b by forward substitution */
502:   tp = t;
503:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
504:     idx   = 6*r[k];
505:     tp[0] = b[idx];
506:     tp[1] = b[idx+1];
507:     tp[2] = b[idx+2];
508:     tp[3] = b[idx+3];
509:     tp[4] = b[idx+4];
510:     tp[5] = b[idx+5];
511:     tp += 6;
512:   }
513: 
514:   for (k=0; k<mbs; k++){
515:     v  = aa + 36*ai[k];
516:     vj = aj + ai[k];
517:     tp = t + k*6;
518:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
519:     nz = ai[k+1] - ai[k];
520:     tp = t + (*vj)*6;
521:     while (nz--) {
522:       tp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
523:       tp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
524:       tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
525:       tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
526:       tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
527:       tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
528:       vj++; tp = t + (*vj)*6;
529:       v += 36;
530:     }

532:     /* xk = inv(Dk)*(Dk*xk) */
533:     d  = aa+k*36;          /* ptr to inv(Dk) */
534:     tp    = t + k*6;
535:     tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
536:     tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
537:     tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
538:     tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
539:     tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
540:     tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
541:   }

543:   /* solve U*x = y by back substitution */
544:   for (k=mbs-1; k>=0; k--){
545:     v  = aa + 36*ai[k];
546:     vj = aj + ai[k];
547:     tp    = t + k*6;
548:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  /* xk */
549:     nz = ai[k+1] - ai[k];
550: 
551:     tp = t + (*vj)*6;
552:     while (nz--) {
553:       /* xk += U(k,:)*x(:) */
554:       x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
555:       x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
556:       x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
557:       x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
558:       x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
559:       x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
560:       vj++; tp = t + (*vj)*6;
561:       v += 36;
562:     }
563:     tp    = t + k*6;
564:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
565:     idx   = 6*r[k];
566:     x[idx]     = x0;
567:     x[idx+1]   = x1;
568:     x[idx+2]   = x2;
569:     x[idx+3]   = x3;
570:     x[idx+4]   = x4;
571:     x[idx+5]   = x5;
572:   }

574:   ISRestoreIndices(isrow,&r);
575:   VecRestoreArray(bb,&b);
576:   VecRestoreArray(xx,&x);
577:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
578:   return(0);
579: }

583: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
584: {
585:   MatScalar      *v,*d;
586:   PetscScalar    *xp,x0,x1,x2,x3,x4,x5;
587:   PetscInt       nz,*vj,k;

590:   for (k=0; k<mbs; k++){
591:     v  = aa + 36*ai[k];
592:     xp = x + k*6;
593:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
594:     nz = ai[k+1] - ai[k];
595:     vj = aj + ai[k];
596:     xp = x + (*vj)*6;
597:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
598:     PetscPrefetchBlock(v+36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
599:     while (nz--) {
600:       /* x(:) += U(k,:)^T*(Dk*xk) */
601:       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
602:       xp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
603:       xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
604:       xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
605:       xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
606:       xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
607:       vj++; xp = x + (*vj)*6;
608:       v += 36;
609:     }
610:     /* xk = inv(Dk)*(Dk*xk) */
611:     d  = aa+k*36;          /* ptr to inv(Dk) */
612:     xp = x + k*6;
613:     xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
614:     xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
615:     xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
616:     xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
617:     xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
618:     xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
619:   }
620:   return(0);
621: }
624: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
625: {
626:   MatScalar      *v;
627:   PetscScalar    *xp,x0,x1,x2,x3,x4,x5;
628:   PetscInt       nz,*vj,k;

631:   for (k=mbs-1; k>=0; k--){
632:     v  = aa + 36*ai[k];
633:     xp = x + k*6;
634:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
635:     nz = ai[k+1] - ai[k];
636:     vj = aj + ai[k];
637:     xp = x + (*vj)*6;
638:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
639:     PetscPrefetchBlock(v-36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
640:     while (nz--) {
641:       /* xk += U(k,:)*x(:) */
642:       x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
643:       x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
644:       x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
645:       x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
646:       x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
647:       x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
648:       vj++;
649:       v += 36; xp = x + (*vj)*6;
650:     }
651:     xp = x + k*6;
652:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
653:   }
654:   return(0);
655: }


660: PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
661: {
662:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
663:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
664:   MatScalar      *aa=a->a;
665:   PetscScalar    *x,*b;

669:   VecGetArray(bb,&b);
670:   VecGetArray(xx,&x);
671: 
672:   /* solve U^T * D * y = b by forward substitution */
673:   PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
674:   MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);

676:   /* solve U*x = y by back substitution */
677:   MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);

679:   VecRestoreArray(bb,&b);
680:   VecRestoreArray(xx,&x);
681:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
682:   return(0);
683: }

687: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
688: {
689:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
690:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
691:   MatScalar      *aa=a->a;
692:   PetscScalar    *x,*b;

696:   VecGetArray(bb,&b);
697:   VecGetArray(xx,&x);
698:   PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
699:   MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
700:   VecRestoreArray(bb,&b);
701:   VecRestoreArray(xx,&x);
702:   PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
703:   return(0);
704: }

708: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
709: {
710:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
711:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
712:   MatScalar      *aa=a->a;
713:   PetscScalar    *x,*b;

717:   VecGetArray(bb,&b);
718:   VecGetArray(xx,&x);
719:   PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
720:   MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
721:   VecRestoreArray(bb,&b);
722:   VecRestoreArray(xx,&x);
723:   PetscLogFlops(2.0*bs2*(a->nz - mbs));
724:   return(0);
725: }

729: PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
730: {
731:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
732:   IS             isrow=a->row;
733:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2;
735:   const PetscInt *r;
736:   PetscInt       nz,*vj,k,idx;
737:   MatScalar      *aa=a->a,*v,*diag;
738:   PetscScalar    *x,*b,x0,x1,x2,x3,x4,*t,*tp;

741:   VecGetArray(bb,&b);
742:   VecGetArray(xx,&x);
743:   t  = a->solve_work;
744:   ISGetIndices(isrow,&r);

746:   /* solve U^T * D * y = b by forward substitution */
747:   tp = t;
748:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
749:     idx   = 5*r[k];
750:     tp[0] = b[idx];
751:     tp[1] = b[idx+1];
752:     tp[2] = b[idx+2];
753:     tp[3] = b[idx+3];
754:     tp[4] = b[idx+4];
755:     tp += 5;
756:   }
757: 
758:   for (k=0; k<mbs; k++){
759:     v  = aa + 25*ai[k];
760:     vj = aj + ai[k];
761:     tp = t + k*5;
762:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
763:     nz = ai[k+1] - ai[k];

765:     tp = t + (*vj)*5;
766:     while (nz--) {
767:       tp[0] +=  v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
768:       tp[1] +=  v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
769:       tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
770:       tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
771:       tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
772:       vj++; tp = t + (*vj)*5;
773:       v += 25;
774:     }

776:     /* xk = inv(Dk)*(Dk*xk) */
777:     diag  = aa+k*25;          /* ptr to inv(Dk) */
778:     tp    = t + k*5;
779:       tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
780:       tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
781:       tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
782:       tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
783:       tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
784:   }

786:   /* solve U*x = y by back substitution */
787:   for (k=mbs-1; k>=0; k--){
788:     v  = aa + 25*ai[k];
789:     vj = aj + ai[k];
790:     tp    = t + k*5;
791:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];/* xk */
792:     nz = ai[k+1] - ai[k];
793: 
794:     tp = t + (*vj)*5;
795:     while (nz--) {
796:       /* xk += U(k,:)*x(:) */
797:       x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
798:       x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
799:       x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
800:       x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
801:       x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
802:       vj++; tp = t + (*vj)*5;
803:       v += 25;
804:     }
805:     tp    = t + k*5;
806:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
807:     idx   = 5*r[k];
808:     x[idx]     = x0;
809:     x[idx+1]   = x1;
810:     x[idx+2]   = x2;
811:     x[idx+3]   = x3;
812:     x[idx+4]   = x4;
813:   }

815:   ISRestoreIndices(isrow,&r);
816:   VecRestoreArray(bb,&b);
817:   VecRestoreArray(xx,&x);
818:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
819:   return(0);
820: }

824: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
825: {
826:   MatScalar      *v,*diag;
827:   PetscScalar    *xp,x0,x1,x2,x3,x4;
828:   PetscInt       nz,*vj,k;

831:   for (k=0; k<mbs; k++){
832:     v  = aa + 25*ai[k];
833:     xp = x + k*5;
834:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
835:     nz = ai[k+1] - ai[k];
836:     vj = aj + ai[k];
837:     xp = x + (*vj)*5;
838:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
839:     PetscPrefetchBlock(v+25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
840:     while (nz--) {
841:       /* x(:) += U(k,:)^T*(Dk*xk) */
842:       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4;
843:       xp[1] +=  v[5]*x0 +  v[6]*x1 +  v[7]*x2 + v[8]*x3 + v[9]*x4;
844:       xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
845:       xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
846:       xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
847:       vj++; xp = x + (*vj)*5;
848:       v += 25;
849:     }
850:     /* xk = inv(Dk)*(Dk*xk) */
851:     diag = aa+k*25;          /* ptr to inv(Dk) */
852:     xp   = x + k*5;
853:     xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
854:     xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
855:     xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
856:     xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
857:     xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
858:   }
859:   return(0);
860: }

864: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
865: {
866:   MatScalar      *v;
867:   PetscScalar    *xp,x0,x1,x2,x3,x4;
868:   PetscInt       nz,*vj,k;

871:   for (k=mbs-1; k>=0; k--){
872:     v  = aa + 25*ai[k];
873:     xp = x + k*5;
874:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* xk */
875:     nz = ai[k+1] - ai[k];
876:     vj = aj + ai[k];
877:     xp = x + (*vj)*5;
878:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
879:     PetscPrefetchBlock(v-25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
880:     while (nz--) {
881:       /* xk += U(k,:)*x(:) */
882:       x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
883:       x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
884:       x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
885:       x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
886:       x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
887:       vj++;
888:       v += 25; xp = x + (*vj)*5;
889:     }
890:     xp = x + k*5;
891:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
892:   }
893:   return(0);
894: }

898: PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
899: {
900:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
901:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2;
902:   MatScalar      *aa=a->a;
903:   PetscScalar    *x,*b;

907:   VecGetArray(bb,&b);
908:   VecGetArray(xx,&x);

910:   /* solve U^T * D * y = b by forward substitution */
911:   PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
912:   MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);

914:   /* solve U*x = y by back substitution */
915:   MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);

917:   VecRestoreArray(bb,&b);
918:   VecRestoreArray(xx,&x);
919:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
920:   return(0);
921: }

925: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
926: {
927:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
928:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
929:   MatScalar      *aa=a->a;
930:   PetscScalar    *x,*b;

934:   VecGetArray(bb,&b);
935:   VecGetArray(xx,&x);
936:   PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
937:   MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
938:   VecRestoreArray(bb,&b);
939:   VecRestoreArray(xx,&x);
940:   PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
941:   return(0);
942: }

946: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
947: {
948:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
949:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
950:   MatScalar      *aa=a->a;
951:   PetscScalar    *x,*b;

955:   VecGetArray(bb,&b);
956:   VecGetArray(xx,&x);
957:   PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));
958:   MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
959:   VecRestoreArray(bb,&b);
960:   VecRestoreArray(xx,&x);
961:   PetscLogFlops(2.0*bs2*(a->nz-mbs));
962:   return(0);
963: }

967: PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
968: {
969:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
970:   IS             isrow=a->row;
971:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
973:   const PetscInt *r;
974:   PetscInt       nz,*vj,k,idx;
975:   MatScalar      *aa=a->a,*v,*diag;
976:   PetscScalar    *x,*b,x0,x1,x2,x3,*t,*tp;

979:   VecGetArray(bb,&b);
980:   VecGetArray(xx,&x);
981:   t  = a->solve_work;
982:   ISGetIndices(isrow,&r);

984:   /* solve U^T * D * y = b by forward substitution */
985:   tp = t;
986:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
987:     idx   = 4*r[k];
988:     tp[0] = b[idx];
989:     tp[1] = b[idx+1];
990:     tp[2] = b[idx+2];
991:     tp[3] = b[idx+3];
992:     tp += 4;
993:   }
994: 
995:   for (k=0; k<mbs; k++){
996:     v  = aa + 16*ai[k];
997:     vj = aj + ai[k];
998:     tp = t + k*4;
999:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
1000:     nz = ai[k+1] - ai[k];

1002:     tp = t + (*vj)*4;
1003:     while (nz--) {
1004:       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1005:       tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1006:       tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1007:       tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1008:       vj++; tp = t + (*vj)*4;
1009:       v += 16;
1010:     }

1012:     /* xk = inv(Dk)*(Dk*xk) */
1013:     diag  = aa+k*16;          /* ptr to inv(Dk) */
1014:     tp    = t + k*4;
1015:     tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1016:     tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1017:     tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1018:     tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1019:   }

1021:   /* solve U*x = y by back substitution */
1022:   for (k=mbs-1; k>=0; k--){
1023:     v  = aa + 16*ai[k];
1024:     vj = aj + ai[k];
1025:     tp    = t + k*4;
1026:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
1027:     nz = ai[k+1] - ai[k];
1028: 
1029:     tp = t + (*vj)*4;
1030:     while (nz--) {
1031:       /* xk += U(k,:)*x(:) */
1032:       x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
1033:       x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
1034:       x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
1035:       x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
1036:       vj++; tp = t + (*vj)*4;
1037:       v += 16;
1038:     }
1039:     tp    = t + k*4;
1040:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
1041:     idx        = 4*r[k];
1042:     x[idx]     = x0;
1043:     x[idx+1]   = x1;
1044:     x[idx+2]   = x2;
1045:     x[idx+3]   = x3;
1046:   }

1048:   ISRestoreIndices(isrow,&r);
1049:   VecRestoreArray(bb,&b);
1050:   VecRestoreArray(xx,&x);
1051:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1052:    return(0);
1053: }

1057: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1058: {
1059:   MatScalar      *v,*diag;
1060:   PetscScalar    *xp,x0,x1,x2,x3;
1061:   PetscInt       nz,*vj,k;

1064:   for (k=0; k<mbs; k++){
1065:     v  = aa + 16*ai[k];
1066:     xp = x + k*4;
1067:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
1068:     nz = ai[k+1] - ai[k];
1069:     vj = aj + ai[k];
1070:     xp = x + (*vj)*4;
1071:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
1072:     PetscPrefetchBlock(v+16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1073:     while (nz--) {
1074:       /* x(:) += U(k,:)^T*(Dk*xk) */
1075:       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1076:       xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1077:       xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1078:       xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1079:       vj++; xp = x + (*vj)*4;
1080:       v += 16;
1081:     }
1082:     /* xk = inv(Dk)*(Dk*xk) */
1083:     diag = aa+k*16;          /* ptr to inv(Dk) */
1084:     xp   = x + k*4;
1085:     xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1086:     xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1087:     xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1088:     xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1089:   }
1090:   return(0);
1091: }

1095: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1096: {
1097:   MatScalar      *v;
1098:   PetscScalar    *xp,x0,x1,x2,x3;
1099:   PetscInt       nz,*vj,k;

1102:   for (k=mbs-1; k>=0; k--){
1103:     v  = aa + 16*ai[k];
1104:     xp = x + k*4;
1105:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
1106:     nz = ai[k+1] - ai[k];
1107:     vj = aj + ai[k];
1108:     xp = x + (*vj)*4;
1109:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
1110:     PetscPrefetchBlock(v-16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1111:     while (nz--) {
1112:       /* xk += U(k,:)*x(:) */
1113:       x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
1114:       x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
1115:       x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
1116:       x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
1117:       vj++;
1118:       v += 16; xp = x + (*vj)*4;
1119:     }
1120:     xp = x + k*4;
1121:     xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
1122:   }
1123:   return(0);
1124: }

1128: PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1129: {
1130:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1131:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1132:   MatScalar      *aa=a->a;
1133:   PetscScalar    *x,*b;

1137:   VecGetArray(bb,&b);
1138:   VecGetArray(xx,&x);

1140:   /* solve U^T * D * y = b by forward substitution */
1141:   PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
1142:   MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);

1144:   /* solve U*x = y by back substitution */
1145:   MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1146:   VecRestoreArray(bb,&b);
1147:   VecRestoreArray(xx,&x);
1148:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1149:   return(0);
1150: }

1154: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1155: {
1156:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1157:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1158:   MatScalar      *aa=a->a;
1159:   PetscScalar    *x,*b;

1163:   VecGetArray(bb,&b);
1164:   VecGetArray(xx,&x);
1165:   PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
1166:   MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1167:   VecRestoreArray(bb,&b);
1168:   VecRestoreArray(xx,&x);
1169:   PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
1170:   return(0);
1171: }

1175: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1176: {
1177:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1178:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1179:   MatScalar      *aa=a->a;
1180:   PetscScalar    *x,*b;

1184:   VecGetArray(bb,&b);
1185:   VecGetArray(xx,&x);
1186:   PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));
1187:   MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1188:   VecRestoreArray(bb,&b);
1189:   VecRestoreArray(xx,&x);
1190:   PetscLogFlops(2.0*bs2*(a->nz-mbs));
1191:   return(0);
1192: }

1196: PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1197: {
1198:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1199:   IS             isrow=a->row;
1200:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1202:   const PetscInt *r;
1203:   PetscInt       nz,*vj,k,idx;
1204:   MatScalar      *aa=a->a,*v,*diag;
1205:   PetscScalar    *x,*b,x0,x1,x2,*t,*tp;

1208:   VecGetArray(bb,&b);
1209:   VecGetArray(xx,&x);
1210:   t  = a->solve_work;
1211:   ISGetIndices(isrow,&r);

1213:   /* solve U^T * D * y = b by forward substitution */
1214:   tp = t;
1215:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
1216:     idx   = 3*r[k];
1217:     tp[0] = b[idx];
1218:     tp[1] = b[idx+1];
1219:     tp[2] = b[idx+2];
1220:     tp += 3;
1221:   }
1222: 
1223:   for (k=0; k<mbs; k++){
1224:     v  = aa + 9*ai[k];
1225:     vj = aj + ai[k];
1226:     tp = t + k*3;
1227:     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
1228:     nz = ai[k+1] - ai[k];

1230:     tp = t + (*vj)*3;
1231:     while (nz--) {
1232:       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1233:       tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1234:       tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1235:       vj++; tp = t + (*vj)*3;
1236:       v += 9;
1237:     }

1239:     /* xk = inv(Dk)*(Dk*xk) */
1240:     diag  = aa+k*9;          /* ptr to inv(Dk) */
1241:     tp    = t + k*3;
1242:     tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1243:     tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1244:     tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1245:   }

1247:   /* solve U*x = y by back substitution */
1248:   for (k=mbs-1; k>=0; k--){
1249:     v  = aa + 9*ai[k];
1250:     vj = aj + ai[k];
1251:     tp    = t + k*3;
1252:     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];  /* xk */
1253:     nz = ai[k+1] - ai[k];
1254: 
1255:     tp = t + (*vj)*3;
1256:     while (nz--) {
1257:       /* xk += U(k,:)*x(:) */
1258:       x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
1259:       x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
1260:       x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
1261:       vj++; tp = t + (*vj)*3;
1262:       v += 9;
1263:     }
1264:     tp    = t + k*3;
1265:     tp[0] = x0; tp[1] = x1; tp[2] = x2;
1266:     idx      = 3*r[k];
1267:     x[idx]   = x0;
1268:     x[idx+1] = x1;
1269:     x[idx+2] = x2;
1270:   }

1272:   ISRestoreIndices(isrow,&r);
1273:   VecRestoreArray(bb,&b);
1274:   VecRestoreArray(xx,&x);
1275:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1276:   return(0);
1277: }

1281: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1282: {
1283:   MatScalar      *v,*diag;
1284:   PetscScalar    *xp,x0,x1,x2;
1285:   PetscInt       nz,*vj,k;

1288:   for (k=0; k<mbs; k++){
1289:     v  = aa + 9*ai[k];
1290:     xp = x + k*3;
1291:     x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1292:     nz = ai[k+1] - ai[k];
1293:     vj = aj + ai[k];
1294:     xp = x + (*vj)*3;
1295:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1296:     PetscPrefetchBlock(v+9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1297:     while (nz--) {
1298:       /* x(:) += U(k,:)^T*(Dk*xk) */
1299:       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1300:       xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1301:       xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1302:       vj++; xp = x + (*vj)*3;
1303:       v += 9;
1304:     }
1305:     /* xk = inv(Dk)*(Dk*xk) */
1306:     diag = aa+k*9;          /* ptr to inv(Dk) */
1307:     xp   = x + k*3;
1308:     xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1309:     xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1310:     xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1311:   }
1312:   return(0);
1313: }

1317: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1318: {
1319:   MatScalar      *v;
1320:   PetscScalar    *xp,x0,x1,x2;
1321:   PetscInt       nz,*vj,k;

1324:   for (k=mbs-1; k>=0; k--){
1325:     v  = aa + 9*ai[k];
1326:     xp = x + k*3;
1327:     x0 = xp[0]; x1 = xp[1]; x2 = xp[2];  /* xk */
1328:     nz = ai[k+1] - ai[k];
1329:     vj = aj + ai[k];
1330:     xp = x + (*vj)*3;
1331:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1332:     PetscPrefetchBlock(v-9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1333:     while (nz--) {
1334:       /* xk += U(k,:)*x(:) */
1335:       x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1336:       x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1337:       x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1338:       vj++;
1339:       v += 9; xp = x + (*vj)*3;
1340:     }
1341:     xp = x + k*3;
1342:     xp[0] = x0; xp[1] = x1; xp[2] = x2;
1343:   }
1344:   return(0);
1345: }

1349: PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1350: {
1351:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1352:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1353:   MatScalar      *aa=a->a;
1354:   PetscScalar    *x,*b;
1356: 
1358:   VecGetArray(bb,&b);
1359:   VecGetArray(xx,&x);

1361:   /* solve U^T * D * y = b by forward substitution */
1362:   PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1363:   MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);

1365:   /* solve U*x = y by back substitution */
1366:   MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);

1368:   VecRestoreArray(bb,&b);
1369:   VecRestoreArray(xx,&x);
1370:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1371:   return(0);
1372: }

1376: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1377: {
1378:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1379:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1380:   MatScalar      *aa=a->a;
1381:   PetscScalar    *x,*b;

1385:   VecGetArray(bb,&b);
1386:   VecGetArray(xx,&x);
1387:   PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1388:   MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1389:   VecRestoreArray(bb,&b);
1390:   VecRestoreArray(xx,&x);
1391:   PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
1392:   return(0);
1393: }

1397: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1398: {
1399:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1400:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1401:   MatScalar      *aa=a->a;
1402:   PetscScalar    *x,*b;

1406:   VecGetArray(bb,&b);
1407:   VecGetArray(xx,&x);
1408:   PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1409:   MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1410:   VecRestoreArray(bb,&b);
1411:   VecRestoreArray(xx,&x);
1412:   PetscLogFlops(2.0*bs2*(a->nz-mbs));
1413:   return(0);
1414: }

1418: PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1419: {
1420:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ *)A->data;
1421:   IS             isrow=a->row;
1422:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1424:   const PetscInt *r;
1425:   PetscInt       nz,*vj,k,k2,idx;
1426:   MatScalar      *aa=a->a,*v,*diag;
1427:   PetscScalar    *x,*b,x0,x1,*t;

1430:   VecGetArray(bb,&b);
1431:   VecGetArray(xx,&x);
1432:   t  = a->solve_work;
1433:   ISGetIndices(isrow,&r);

1435:   /* solve U^T * D * y = perm(b) by forward substitution */
1436:   for (k=0; k<mbs; k++) {  /* t <- perm(b) */
1437:     idx = 2*r[k];
1438:     t[k*2]   = b[idx];
1439:     t[k*2+1] = b[idx+1];
1440:   }
1441:   for (k=0; k<mbs; k++){
1442:     v  = aa + 4*ai[k];
1443:     vj = aj + ai[k];
1444:     k2 = k*2;
1445:     x0 = t[k2]; x1 = t[k2+1];
1446:     nz = ai[k+1] - ai[k];
1447:     while (nz--) {
1448:       t[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1449:       t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1450:       vj++; v += 4;
1451:     }
1452:     diag = aa+k*4;  /* ptr to inv(Dk) */
1453:     t[k2]   = diag[0]*x0 + diag[2]*x1;
1454:     t[k2+1] = diag[1]*x0 + diag[3]*x1;
1455:   }

1457:   /* solve U*x = y by back substitution */
1458:   for (k=mbs-1; k>=0; k--){
1459:     v  = aa + 4*ai[k];
1460:     vj = aj + ai[k];
1461:     k2 = k*2;
1462:     x0 = t[k2]; x1 = t[k2+1];
1463:     nz = ai[k+1] - ai[k];
1464:     while (nz--) {
1465:       x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1466:       x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1467:       vj++; v += 4;
1468:     }
1469:     t[k2]    = x0;
1470:     t[k2+1]  = x1;
1471:     idx      = 2*r[k];
1472:     x[idx]   = x0;
1473:     x[idx+1] = x1;
1474:   }

1476:   ISRestoreIndices(isrow,&r);
1477:   VecRestoreArray(bb,&b);
1478:   VecRestoreArray(xx,&x);
1479:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1480:   return(0);
1481: }

1485: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1486: {
1487:   MatScalar      *v,*diag;
1488:   PetscScalar    x0,x1;
1489:   PetscInt       nz,*vj,k,k2;

1492:   for (k=0; k<mbs; k++){
1493:     v  = aa + 4*ai[k];
1494:     vj = aj + ai[k];
1495:     k2 = k*2;
1496:     x0 = x[k2]; x1 = x[k2+1];  /* Dk*xk = k-th block of x */
1497:     nz = ai[k+1] - ai[k];
1498:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1499:     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1500:     while (nz--) {
1501:       /* x(:) += U(k,:)^T*(Dk*xk) */
1502:       x[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1503:       x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1504:       vj++; v += 4;
1505:     }
1506:     /* xk = inv(Dk)*(Dk*xk) */
1507:     diag = aa+k*4;          /* ptr to inv(Dk) */
1508:     x[k2]   = diag[0]*x0 + diag[2]*x1;
1509:     x[k2+1] = diag[1]*x0 + diag[3]*x1;
1510:   }
1511:   return(0);
1512: }

1516: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1517: {
1518:   MatScalar      *v;
1519:   PetscScalar    x0,x1;
1520:   PetscInt       nz,*vj,k,k2;

1523:   for (k=mbs-1; k>=0; k--){
1524:     v  = aa + 4*ai[k];
1525:     vj = aj + ai[k];
1526:     k2 = k*2;
1527:     x0 = x[k2]; x1 = x[k2+1];  /* xk */
1528:     nz = ai[k+1] - ai[k];
1529:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1530:     PetscPrefetchBlock(v-4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1531:     while (nz--) {
1532:       /* xk += U(k,:)*x(:) */
1533:       x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1534:       x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1535:       vj++; v += 4;
1536:     }
1537:     x[k2]     = x0;
1538:     x[k2+1]   = x1;
1539:   }
1540:   return(0);
1541: }

1545: PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1546: {
1547:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1548:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1549:   MatScalar      *aa=a->a;
1550:   PetscScalar    *x,*b;

1554:   VecGetArray(bb,&b);
1555:   VecGetArray(xx,&x);

1557:   /* solve U^T * D * y = b by forward substitution */
1558:   PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1559:   MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);

1561:   /* solve U*x = y by back substitution */
1562:   MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);

1564:   VecRestoreArray(bb,&b);
1565:   VecRestoreArray(xx,&x);
1566:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1567:   return(0);
1568: }

1572: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1573: {
1574:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1575:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1576:   MatScalar      *aa=a->a;
1577:   PetscScalar    *x,*b;

1581:   VecGetArray(bb,&b);
1582:   VecGetArray(xx,&x);
1583:   PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1584:   MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1585:   VecRestoreArray(bb,&b);
1586:   VecRestoreArray(xx,&x);
1587:   PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
1588:   return(0);
1589: }

1593: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1594: {
1595:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1596:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1597:   MatScalar      *aa=a->a;
1598:   PetscScalar    *x,*b;

1602:   VecGetArray(bb,&b);
1603:   VecGetArray(xx,&x);
1604:   PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1605:   MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1606:   VecRestoreArray(bb,&b);
1607:   VecRestoreArray(xx,&x);
1608:   PetscLogFlops(2.0*bs2*(a->nz - mbs));
1609:   return(0);
1610: }

1614: PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1615: {
1616:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1617:   IS                isrow=a->row;
1618:   PetscErrorCode    ierr;
1619:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1620:   const MatScalar   *aa=a->a,*v;
1621:   const PetscScalar *b;
1622:   PetscScalar       *x,xk,*t;
1623:   PetscInt          nz,k,j;

1626:   VecGetArrayRead(bb,&b);
1627:   VecGetArray(xx,&x);
1628:   t    = a->solve_work;
1629:   ISGetIndices(isrow,&rp);
1630: 
1631:   /* solve U^T*D*y = perm(b) by forward substitution */
1632:   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1633:   for (k=0; k<mbs; k++){
1634:     v  = aa + ai[k];
1635:     vj = aj + ai[k];
1636:     xk = t[k];
1637:     nz = ai[k+1] - ai[k] - 1;
1638:     for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk;
1639:     t[k] = xk*v[nz];   /* v[nz] = 1/D(k) */
1640:   }

1642:   /* solve U*perm(x) = y by back substitution */
1643:   for (k=mbs-1; k>=0; k--){
1644:     v  = aa + adiag[k] - 1;
1645:     vj = aj + adiag[k] - 1;
1646:     nz = ai[k+1] - ai[k] - 1;
1647:     for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]];
1648:     x[rp[k]] = t[k];
1649:   }

1651:   ISRestoreIndices(isrow,&rp);
1652:   VecRestoreArrayRead(bb,&b);
1653:   VecRestoreArray(xx,&x);
1654:   PetscLogFlops(4.0*a->nz - 3.0*mbs);
1655:   return(0);
1656: }

1660: PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1661: {
1662:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1663:   IS              isrow=a->row;
1664:   PetscErrorCode  ierr;
1665:   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1666:   const MatScalar *aa=a->a,*v;
1667:   PetscScalar     *x,*b,xk,*t;
1668:   PetscInt        nz,k;

1671:   VecGetArray(bb,&b);
1672:   VecGetArray(xx,&x);
1673:   t    = a->solve_work;
1674:   ISGetIndices(isrow,&rp);
1675: 
1676:   /* solve U^T*D*y = perm(b) by forward substitution */
1677:   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1678:   for (k=0; k<mbs; k++){
1679:     v  = aa + ai[k] + 1;
1680:     vj = aj + ai[k] + 1;
1681:     xk = t[k];
1682:     nz = ai[k+1] - ai[k] - 1;
1683:     while (nz--) t[*vj++] += (*v++) * xk;
1684:     t[k] = xk*aa[ai[k]];  /* aa[k] = 1/D(k) */
1685:   }

1687:   /* solve U*perm(x) = y by back substitution */
1688:   for (k=mbs-1; k>=0; k--){
1689:     v  = aa + ai[k] + 1;
1690:     vj = aj + ai[k] + 1;
1691:     nz = ai[k+1] - ai[k] - 1;
1692:     while (nz--) t[k] += (*v++) * t[*vj++];
1693:     x[rp[k]] = t[k];
1694:   }

1696:   ISRestoreIndices(isrow,&rp);
1697:   VecRestoreArray(bb,&b);
1698:   VecRestoreArray(xx,&x);
1699:   PetscLogFlops(4.0*a->nz - 3*mbs);
1700:   return(0);
1701: }

1705: PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1706: {
1707:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1708:   IS              isrow=a->row;
1709:   PetscErrorCode  ierr;
1710:   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1711:   const MatScalar *aa=a->a,*v;
1712:   PetscReal       diagk;
1713:   PetscScalar     *x,*b,xk;
1714:   PetscInt        nz,k;

1717:   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1718:   VecGetArray(bb,&b);
1719:   VecGetArray(xx,&x);
1720:   ISGetIndices(isrow,&rp);
1721: 
1722:   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1723:   for (k=0; k<mbs; k++){
1724:     v  = aa + ai[k];
1725:     vj = aj + ai[k];
1726:     xk = x[k];
1727:     nz = ai[k+1] - ai[k] - 1;
1728:     while (nz--) x[*vj++] += (*v++) * xk;

1730:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1731:     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1732:     x[k] = xk*PetscSqrtReal(diagk);
1733:   }
1734:   ISRestoreIndices(isrow,&rp);
1735:   VecRestoreArray(bb,&b);
1736:   VecRestoreArray(xx,&x);
1737:   PetscLogFlops(2.0*a->nz - mbs);
1738:   return(0);
1739: }

1743: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1744: {
1745:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1746:   IS              isrow=a->row;
1747:   PetscErrorCode  ierr;
1748:   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1749:   const MatScalar *aa=a->a,*v;
1750:   PetscReal       diagk;
1751:   PetscScalar     *x,*b,xk;
1752:   PetscInt        nz,k;

1755:   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1756:   VecGetArray(bb,&b);
1757:   VecGetArray(xx,&x);
1758:   ISGetIndices(isrow,&rp);
1759: 
1760:   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1761:   for (k=0; k<mbs; k++){
1762:     v  = aa + ai[k] + 1;
1763:     vj = aj + ai[k] + 1;
1764:     xk = x[k];
1765:     nz = ai[k+1] - ai[k] - 1;
1766:     while (nz--) x[*vj++] += (*v++) * xk;

1768:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1769:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1770:     x[k] = xk*PetscSqrtReal(diagk);
1771:   }
1772:   ISRestoreIndices(isrow,&rp);
1773:   VecRestoreArray(bb,&b);
1774:   VecRestoreArray(xx,&x);
1775:   PetscLogFlops(2.0*a->nz - mbs);
1776:   return(0);
1777: }

1781: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1782: {
1783:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1784:   IS              isrow=a->row;
1785:   PetscErrorCode  ierr;
1786:   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1787:   const MatScalar *aa=a->a,*v;
1788:   PetscReal       diagk;
1789:   PetscScalar     *x,*b,*t;
1790:   PetscInt        nz,k;

1793:   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1794:   VecGetArray(bb,&b);
1795:   VecGetArray(xx,&x);
1796:   t    = a->solve_work;
1797:   ISGetIndices(isrow,&rp);

1799:   for (k=mbs-1; k>=0; k--){
1800:     v  = aa + ai[k];
1801:     vj = aj + ai[k];
1802:     diagk = PetscRealPart(aa[adiag[k]]);
1803:     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1804:     t[k] = b[k] * PetscSqrtReal(diagk);
1805:     nz = ai[k+1] - ai[k] - 1;
1806:     while (nz--) t[k] += (*v++) * t[*vj++];
1807:     x[rp[k]] = t[k];
1808:   }
1809:   ISRestoreIndices(isrow,&rp);
1810:   VecRestoreArray(bb,&b);
1811:   VecRestoreArray(xx,&x);
1812:   PetscLogFlops(2.0*a->nz - mbs);
1813:   return(0);
1814: }

1818: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1819: {
1820:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1821:   IS              isrow=a->row;
1822:   PetscErrorCode  ierr;
1823:   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1824:   const MatScalar *aa=a->a,*v;
1825:   PetscReal       diagk;
1826:   PetscScalar     *x,*b,*t;
1827:   PetscInt        nz,k;

1830:   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1831:   VecGetArray(bb,&b);
1832:   VecGetArray(xx,&x);
1833:   t    = a->solve_work;
1834:   ISGetIndices(isrow,&rp);

1836:   for (k=mbs-1; k>=0; k--){
1837:     v  = aa + ai[k] + 1;
1838:     vj = aj + ai[k] + 1;
1839:     diagk = PetscRealPart(aa[ai[k]]);
1840:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1841:     t[k] = b[k] * PetscSqrtReal(diagk);
1842:     nz = ai[k+1] - ai[k] - 1;
1843:     while (nz--) t[k] += (*v++) * t[*vj++];
1844:     x[rp[k]] = t[k];
1845:   }
1846:   ISRestoreIndices(isrow,&rp);
1847:   VecRestoreArray(bb,&b);
1848:   VecRestoreArray(xx,&x);
1849:   PetscLogFlops(2.0*a->nz - mbs);
1850:   return(0);
1851: }

1855: PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1856: {
1857:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;

1861:   if (A->rmap->bs == 1) {
1862:     MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);
1863:   } else {
1864:     IS              isrow=a->row;
1865:     const PetscInt  *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1866:     const MatScalar *aa=a->a,*v;
1867:     PetscScalar     *x,*b,*t;
1868:     PetscInt        nz,k,n,i,j;
1869:     if (bb->n > a->solves_work_n) {
1870:       PetscFree(a->solves_work);
1871:       PetscMalloc(bb->n*A->rmap->N*sizeof(PetscScalar),&a->solves_work);
1872:       a->solves_work_n = bb->n;
1873:     }
1874:     n    = bb->n;
1875:     VecGetArray(bb->v,&b);
1876:     VecGetArray(xx->v,&x);
1877:     t    = a->solves_work;

1879:     ISGetIndices(isrow,&rp);
1880: 
1881:     /* solve U^T*D*y = perm(b) by forward substitution */
1882:     for (k=0; k<mbs; k++) {for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];} /* values are stored interlaced in t */
1883:     for (k=0; k<mbs; k++){
1884:       v  = aa + ai[k];
1885:       vj = aj + ai[k];
1886:       nz = ai[k+1] - ai[k] - 1;
1887:       for (j=0; j<nz; j++){
1888:         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1889:         v++;vj++;
1890:       }
1891:       for (i=0; i<n; i++) t[n*k+i] *= aa[nz];  /* note: aa[nz] = 1/D(k) */
1892:     }
1893: 
1894:     /* solve U*perm(x) = y by back substitution */
1895:     for (k=mbs-1; k>=0; k--){
1896:       v  = aa + ai[k] - 1;
1897:       vj = aj + ai[k] - 1;
1898:       nz = ai[k+1] - ai[k] - 1;
1899:       for (j=0; j<nz; j++){
1900:         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1901:         v++;vj++;
1902:       }
1903:       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1904:     }

1906:     ISRestoreIndices(isrow,&rp);
1907:     VecRestoreArray(bb->v,&b);
1908:     VecRestoreArray(xx->v,&x);
1909:     PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1910:   }
1911:   return(0);
1912: }

1916: PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx)
1917: {
1918:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;

1922:   if (A->rmap->bs == 1) {
1923:     MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);
1924:   } else {
1925:     IS              isrow=a->row;
1926:     const PetscInt  *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1927:     const MatScalar *aa=a->a,*v;
1928:     PetscScalar     *x,*b,*t;
1929:     PetscInt        nz,k,n,i;
1930:     if (bb->n > a->solves_work_n) {
1931:       PetscFree(a->solves_work);
1932:       PetscMalloc(bb->n*A->rmap->N*sizeof(PetscScalar),&a->solves_work);
1933:       a->solves_work_n = bb->n;
1934:     }
1935:     n    = bb->n;
1936:     VecGetArray(bb->v,&b);
1937:     VecGetArray(xx->v,&x);
1938:     t    = a->solves_work;

1940:     ISGetIndices(isrow,&rp);
1941: 
1942:     /* solve U^T*D*y = perm(b) by forward substitution */
1943:     for (k=0; k<mbs; k++) {for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];} /* values are stored interlaced in t */
1944:     for (k=0; k<mbs; k++){
1945:       v  = aa + ai[k];
1946:       vj = aj + ai[k];
1947:       nz = ai[k+1] - ai[k];
1948:       while (nz--) {
1949:         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1950:         v++;vj++;
1951:       }
1952:       for (i=0; i<n; i++) t[n*k+i] *= aa[k];  /* note: aa[k] = 1/D(k) */
1953:     }
1954: 
1955:     /* solve U*perm(x) = y by back substitution */
1956:     for (k=mbs-1; k>=0; k--){
1957:       v  = aa + ai[k];
1958:       vj = aj + ai[k];
1959:       nz = ai[k+1] - ai[k];
1960:       while (nz--) {
1961:         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1962:         v++;vj++;
1963:       }
1964:       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1965:     }

1967:     ISRestoreIndices(isrow,&rp);
1968:     VecRestoreArray(bb->v,&b);
1969:     VecRestoreArray(xx->v,&x);
1970:     PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1971:   }
1972:   return(0);
1973: }

1977: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1978: {
1979:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1980:   PetscErrorCode    ierr;
1981:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag = a->diag;
1982:   const MatScalar   *aa=a->a,*v;
1983:   const PetscScalar *b;
1984:   PetscScalar       *x,xi;
1985:   PetscInt          nz,i,j;

1988:   VecGetArrayRead(bb,&b);
1989:   VecGetArray(xx,&x);
1990: 
1991:   /* solve U^T*D*y = b by forward substitution */
1992:   PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
1993:   for (i=0; i<mbs; i++){
1994:     v  = aa + ai[i];
1995:     vj = aj + ai[i];
1996:     xi = x[i];
1997:     nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
1998:     for (j=0; j<nz; j++){
1999:       x[vj[j]] += v[j]* xi;
2000:     }
2001:     x[i] = xi*v[nz];  /* v[nz] = aa[diag[i]] = 1/D(i) */
2002:   }

2004:   /* solve U*x = y by backward substitution */
2005:   for (i=mbs-2; i>=0; i--){
2006:     xi = x[i];
2007:     v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
2008:     vj = aj + adiag[i] - 1;
2009:     nz = ai[i+1] - ai[i] - 1;
2010:     for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
2011:     x[i] = xi;
2012:   }
2013: 
2014:   VecRestoreArrayRead(bb,&b);
2015:   VecRestoreArray(xx,&x);
2016:   PetscLogFlops(4.0*a->nz - 3*mbs);
2017:   return(0);
2018: }

2022: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2023: {
2024:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
2026:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
2027:   MatScalar      *aa=a->a,*v;
2028:   PetscScalar    *x,*b,xk;
2029:   PetscInt       nz,*vj,k;

2032:   VecGetArray(bb,&b);
2033:   VecGetArray(xx,&x);
2034: 
2035:   /* solve U^T*D*y = b by forward substitution */
2036:   PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
2037:   for (k=0; k<mbs; k++){
2038:     v  = aa + ai[k] + 1;
2039:     vj = aj + ai[k] + 1;
2040:     xk = x[k];
2041:     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2042:     while (nz--) x[*vj++] += (*v++) * xk;
2043:     x[k] = xk*aa[ai[k]];  /* note: aa[diag[k]] = 1/D(k) */
2044:   }

2046:   /* solve U*x = y by back substitution */
2047:   for (k=mbs-2; k>=0; k--){
2048:     v  = aa + ai[k] + 1;
2049:     vj = aj + ai[k] + 1;
2050:     xk = x[k];
2051:     nz = ai[k+1] - ai[k] - 1;
2052:     while (nz--) {
2053:       xk += (*v++) * x[*vj++];
2054:     }
2055:     x[k] = xk;
2056:   }

2058:   VecRestoreArray(bb,&b);
2059:   VecRestoreArray(xx,&x);
2060:   PetscLogFlops(4.0*a->nz - 3*mbs);
2061:   return(0);
2062: }

2066: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2067: {
2068:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
2069:   PetscErrorCode  ierr;
2070:   PetscInt        mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag;
2071:   const MatScalar *aa=a->a,*v;
2072:   PetscReal       diagk;
2073:   PetscScalar     *x,*b;
2074:   PetscInt        nz,*vj,k;

2077:   /* solve U^T*D^(1/2)*x = b by forward substitution */
2078:   VecGetArray(bb,&b);
2079:   VecGetArray(xx,&x);
2080:   PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
2081:   for (k=0; k<mbs; k++){
2082:     v  = aa + ai[k];
2083:     vj = aj + ai[k];
2084:     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2085:     while (nz--) x[*vj++] += (*v++) * x[k];
2086:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2087:     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[adiag[k]]),PetscImaginaryPart(aa[adiag[k]]));
2088:     x[k] *= PetscSqrtReal(diagk);
2089:   }
2090:   VecRestoreArray(bb,&b);
2091:   VecRestoreArray(xx,&x);
2092:   PetscLogFlops(2.0*a->nz - mbs);
2093:   return(0);
2094: }

2098: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2099: {
2100:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
2102:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
2103:   MatScalar      *aa=a->a,*v;
2104:   PetscReal      diagk;
2105:   PetscScalar    *x,*b;
2106:   PetscInt       nz,*vj,k;

2109:   /* solve U^T*D^(1/2)*x = b by forward substitution */
2110:   VecGetArray(bb,&b);
2111:   VecGetArray(xx,&x);
2112:   PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
2113:   for (k=0; k<mbs; k++){
2114:     v  = aa + ai[k] + 1;
2115:     vj = aj + ai[k] + 1;
2116:     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2117:     while (nz--) x[*vj++] += (*v++) * x[k];
2118:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2119:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]]));
2120:     x[k] *= PetscSqrtReal(diagk);
2121:   }
2122:   VecRestoreArray(bb,&b);
2123:   VecRestoreArray(xx,&x);
2124:   PetscLogFlops(2.0*a->nz - mbs);
2125:   return(0);
2126: }

2130: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2131: {
2132:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
2134:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag;
2135:   MatScalar      *aa=a->a,*v;
2136:   PetscReal      diagk;
2137:   PetscScalar    *x,*b;
2138:   PetscInt       nz,*vj,k;

2141:   /* solve D^(1/2)*U*x = b by back substitution */
2142:   VecGetArray(bb,&b);
2143:   VecGetArray(xx,&x);

2145:   for (k=mbs-1; k>=0; k--){
2146:     v  = aa + ai[k];
2147:     vj = aj + ai[k];
2148:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2149:     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2150:     x[k] = PetscSqrtReal(diagk)*b[k];
2151:     nz = ai[k+1] - ai[k] - 1;
2152:     while (nz--) x[k] += (*v++) * x[*vj++];
2153:   }
2154:   VecRestoreArray(bb,&b);
2155:   VecRestoreArray(xx,&x);
2156:   PetscLogFlops(2.0*a->nz - mbs);
2157:   return(0);
2158: }

2162: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2163: {
2164:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
2166:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
2167:   MatScalar      *aa=a->a,*v;
2168:   PetscReal      diagk;
2169:   PetscScalar    *x,*b;
2170:   PetscInt       nz,*vj,k;

2173:   /* solve D^(1/2)*U*x = b by back substitution */
2174:   VecGetArray(bb,&b);
2175:   VecGetArray(xx,&x);

2177:   for (k=mbs-1; k>=0; k--){
2178:     v  = aa + ai[k] + 1;
2179:     vj = aj + ai[k] + 1;
2180:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2181:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2182:     x[k] = PetscSqrtReal(diagk)*b[k];
2183:     nz = ai[k+1] - ai[k] - 1;
2184:     while (nz--) x[k] += (*v++) * x[*vj++];
2185:   }
2186:   VecRestoreArray(bb,&b);
2187:   VecRestoreArray(xx,&x);
2188:   PetscLogFlops(2.0*a->nz - mbs);
2189:   return(0);
2190: }

2192: /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2195: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info)
2196: {
2197:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
2199:   const PetscInt *rip,mbs = a->mbs,*ai = a->i,*aj = a->j;
2200:   PetscInt       *jutmp,bs = A->rmap->bs,bs2=a->bs2,i;
2201:   PetscInt       m,reallocs = 0,*levtmp;
2202:   PetscInt       *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
2203:   PetscInt       incrlev,*lev,shift,prow,nz;
2204:   PetscReal      f = info->fill,levels = info->levels;
2205:   PetscBool      perm_identity;

2208:   /* check whether perm is the identity mapping */
2209:   ISIdentity(perm,&perm_identity);

2211:   if (perm_identity){
2212:     a->permute = PETSC_FALSE;
2213:     ai = a->i; aj = a->j;
2214:   } else { /*  non-trivial permutation */
2215:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2216:     a->permute = PETSC_TRUE;
2217:     MatReorderingSeqSBAIJ(A, perm);
2218:     ai = a->inew; aj = a->jnew;
2219:   }
2220: 
2221:   /* initialization */
2222:   ISGetIndices(perm,&rip);
2223:   umax  = (PetscInt)(f*ai[mbs] + 1);
2224:   PetscMalloc(umax*sizeof(PetscInt),&lev);
2225:   umax += mbs + 1;
2226:   shift = mbs + 1;
2227:   PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);
2228:   PetscMalloc(umax*sizeof(PetscInt),&ju);
2229:   iu[0] = mbs + 1;
2230:   juidx = mbs + 1;
2231:   /* prowl: linked list for pivot row */
2232:   PetscMalloc3(mbs,PetscInt,&prowl,mbs,PetscInt,&q,mbs,PetscInt,&levtmp);
2233:   /* q: linked list for col index */
2234: 
2235:   for (i=0; i<mbs; i++){
2236:     prowl[i] = mbs;
2237:     q[i] = 0;
2238:   }

2240:   /* for each row k */
2241:   for (k=0; k<mbs; k++){
2242:     nzk  = 0;
2243:     q[k] = mbs;
2244:     /* copy current row into linked list */
2245:     nz = ai[rip[k]+1] - ai[rip[k]];
2246:     j = ai[rip[k]];
2247:     while (nz--){
2248:       vj = rip[aj[j++]];
2249:       if (vj > k){
2250:         qm = k;
2251:         do {
2252:           m  = qm; qm = q[m];
2253:         } while(qm < vj);
2254:         if (qm == vj) {
2255:           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n");
2256:         }
2257:         nzk++;
2258:         q[m]       = vj;
2259:         q[vj]      = qm;
2260:         levtmp[vj] = 0;   /* initialize lev for nonzero element */
2261:       }
2262:     }

2264:     /* modify nonzero structure of k-th row by computing fill-in
2265:        for each row prow to be merged in */
2266:     prow = k;
2267:     prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
2268: 
2269:     while (prow < k){
2270:       /* merge row prow into k-th row */
2271:       jmin = iu[prow] + 1;
2272:       jmax = iu[prow+1];
2273:       qm = k;
2274:       for (j=jmin; j<jmax; j++){
2275:         incrlev = lev[j-shift] + 1;
2276:         if (incrlev > levels) continue;
2277:         vj      = ju[j];
2278:         do {
2279:           m = qm; qm = q[m];
2280:         } while (qm < vj);
2281:         if (qm != vj){      /* a fill */
2282:           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
2283:           levtmp[vj] = incrlev;
2284:         } else {
2285:           if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2286:         }
2287:       }
2288:       prow = prowl[prow]; /* next pivot row */
2289:     }
2290: 
2291:     /* add k to row list for first nonzero element in k-th row */
2292:     if (nzk > 1){
2293:       i = q[k]; /* col value of first nonzero element in k_th row of U */
2294:       prowl[k] = prowl[i]; prowl[i] = k;
2295:     }
2296:     iu[k+1] = iu[k] + nzk;

2298:     /* allocate more space to ju and lev if needed */
2299:     if (iu[k+1] > umax) {
2300:       /* estimate how much additional space we will need */
2301:       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2302:       /* just double the memory each time */
2303:       maxadd = umax;
2304:       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
2305:       umax += maxadd;

2307:       /* allocate a longer ju */
2308:       PetscMalloc(umax*sizeof(PetscInt),&jutmp);
2309:       PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));
2310:       PetscFree(ju);
2311:       ju       = jutmp;

2313:       PetscMalloc(umax*sizeof(PetscInt),&jutmp);
2314:       PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(PetscInt));
2315:       PetscFree(lev);
2316:       lev      = jutmp;
2317:       reallocs += 2; /* count how many times we realloc */
2318:     }

2320:     /* save nonzero structure of k-th row in ju */
2321:     i=k;
2322:     while (nzk --) {
2323:       i                = q[i];
2324:       ju[juidx]        = i;
2325:       lev[juidx-shift] = levtmp[i];
2326:       juidx++;
2327:     }
2328:   }
2329: 
2330: #if defined(PETSC_USE_INFO)
2331:   if (ai[mbs] != 0) {
2332:     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2333:     PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);
2334:     PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
2335:     PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);
2336:     PetscInfo(A,"for best performance.\n");
2337:   } else {
2338:     PetscInfo(A,"Empty matrix.\n");
2339:   }
2340: #endif

2342:   ISRestoreIndices(perm,&rip);
2343:   PetscFree3(prowl,q,levtmp);
2344:   PetscFree(lev);

2346:   /* put together the new matrix */
2347:   MatSeqSBAIJSetPreallocation_SeqSBAIJ(B,bs,0,PETSC_NULL);

2349:   /* PetscLogObjectParent(B,iperm); */
2350:   b    = (Mat_SeqSBAIJ*)(B)->data;
2351:   PetscFree2(b->imax,b->ilen);
2352:   b->singlemalloc = PETSC_FALSE;
2353:   b->free_a       = PETSC_TRUE;
2354:   b->free_ij      = PETSC_TRUE;
2355:   /* the next line frees the default space generated by the Create() */
2356:   PetscFree3(b->a,b->j,b->i);
2357:   PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);
2358:   b->j    = ju;
2359:   b->i    = iu;
2360:   b->diag = 0;
2361:   b->ilen = 0;
2362:   b->imax = 0;
2363: 
2364:   ISDestroy(&b->row);
2365:   ISDestroy(&b->icol);
2366:   b->row  = perm;
2367:   b->icol = perm;
2368:   PetscObjectReference((PetscObject)perm);
2369:   PetscObjectReference((PetscObject)perm);
2370:   PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);
2371:   /* In b structure:  Free imax, ilen, old a, old j.  
2372:      Allocate idnew, solve_work, new a, new j */
2373:   PetscLogObjectMemory(B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
2374:   b->maxnz = b->nz = iu[mbs];
2375: 
2376:   (B)->info.factor_mallocs    = reallocs;
2377:   (B)->info.fill_ratio_given  = f;
2378:   if (ai[mbs] != 0) {
2379:     (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2380:   } else {
2381:     (B)->info.fill_ratio_needed = 0.0;
2382:   }
2383:   MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);
2384:   return(0);
2385: }

2387: /*
2388:   See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2389: */
2390: #include <petscbt.h>
2391: #include <../src/mat/utils/freespace.h>
2394: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2395: {
2396:   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b;
2397:   PetscErrorCode     ierr;
2398:   PetscBool          perm_identity,free_ij = PETSC_TRUE,missing;
2399:   PetscInt           bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j;
2400:   const PetscInt     *rip;
2401:   PetscInt           reallocs=0,i,*ui,*udiag,*cols;
2402:   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2403:   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr;
2404:   PetscReal          fill=info->fill,levels=info->levels;
2405:   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
2406:   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
2407:   PetscBT            lnkbt;

2410:   if (bs > 1){
2411:     MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);
2412:     return(0);
2413:   }
2414:   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
2415:   MatMissingDiagonal(A,&missing,&d);
2416:   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);

2418:   /* check whether perm is the identity mapping */
2419:   ISIdentity(perm,&perm_identity);
2420:   if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2421:   a->permute = PETSC_FALSE;

2423:   PetscMalloc((am+1)*sizeof(PetscInt),&ui);
2424:   PetscMalloc((am+1)*sizeof(PetscInt),&udiag);
2425:   ui[0] = 0;
2426: 
2427:   /* ICC(0) without matrix ordering: simply rearrange column indices */
2428:   if (!levels){
2429:     /* reuse the column pointers and row offsets for memory savings */
2430:     for (i=0; i<am; i++) {
2431:       ncols    = ai[i+1] - ai[i];
2432:       ui[i+1]  = ui[i] + ncols;
2433:       udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */
2434:     }
2435:     PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);
2436:     cols = uj;
2437:     for (i=0; i<am; i++) {
2438:       aj    = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2439:       ncols = ai[i+1] - ai[i] -1;
2440:       for (j=0; j<ncols; j++) *cols++ = aj[j];
2441:       *cols++ = i; /* diagoanl is located as the last entry of U(i,:) */
2442:     }
2443:   } else { /* case: levels>0 */
2444:     ISGetIndices(perm,&rip);

2446:     /* initialization */
2447:     /* jl: linked list for storing indices of the pivot rows 
2448:        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2449:     PetscMalloc4(am,PetscInt*,&uj_ptr,am,PetscInt*,&uj_lvl_ptr,am,PetscInt,&il,am,PetscInt,&jl);
2450:     for (i=0; i<am; i++){
2451:       jl[i] = am; il[i] = 0;
2452:     }

2454:     /* create and initialize a linked list for storing column indices of the active row k */
2455:     nlnk = am + 1;
2456:     PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);

2458:     /* initial FreeSpace size is fill*(ai[am]+1) */
2459:     PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);
2460:     current_space = free_space;
2461:     PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);
2462:     current_space_lvl = free_space_lvl;

2464:     for (k=0; k<am; k++){  /* for each active row k */
2465:       /* initialize lnk by the column indices of row k */
2466:       nzk   = 0;
2467:       ncols = ai[k+1] - ai[k];
2468:       if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k);
2469:       cols  = aj+ai[k];
2470:       PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
2471:       nzk += nlnk;

2473:       /* update lnk by computing fill-in for each pivot row to be merged in */
2474:       prow = jl[k]; /* 1st pivot row */
2475: 
2476:       while (prow < k){
2477:         nextprow = jl[prow];
2478: 
2479:         /* merge prow into k-th row */
2480:         jmin = il[prow] + 1;  /* index of the 2nd nzero entry in U(prow,k:am-1) */
2481:         jmax = ui[prow+1];
2482:         ncols = jmax-jmin;
2483:         i     = jmin - ui[prow];

2485:         cols  = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2486:         uj    = uj_lvl_ptr[prow] + i; /* levels of cols */
2487:         j     = *(uj - 1);
2488:         PetscICCLLAddSorted(ncols,cols,levels,uj,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2489:         nzk += nlnk;

2491:         /* update il and jl for prow */
2492:         if (jmin < jmax){
2493:           il[prow] = jmin;
2494:           j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2495:         }
2496:         prow = nextprow;
2497:       }

2499:       /* if free space is not available, make more free space */
2500:       if (current_space->local_remaining<nzk) {
2501:         i = am - k + 1; /* num of unfactored rows */
2502:         i *= PetscMin(nzk, i-1); /* i*nzk, i*(i-1): estimated and max additional space needed */
2503:         PetscFreeSpaceGet(i,&current_space);
2504:         PetscFreeSpaceGet(i,&current_space_lvl);
2505:         reallocs++;
2506:       }

2508:       /* copy data into free_space and free_space_lvl, then initialize lnk */
2509:       if (nzk == 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Empty row %D in ICC matrix factor",k);
2510:       PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);

2512:       /* add the k-th row into il and jl */
2513:       if (nzk > 1){
2514:         i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2515:         jl[k] = jl[i]; jl[i] = k;
2516:         il[k] = ui[k] + 1;
2517:       }
2518:       uj_ptr[k]     = current_space->array;
2519:       uj_lvl_ptr[k] = current_space_lvl->array;

2521:       current_space->array           += nzk;
2522:       current_space->local_used      += nzk;
2523:       current_space->local_remaining -= nzk;
2524:       current_space_lvl->array           += nzk;
2525:       current_space_lvl->local_used      += nzk;
2526:       current_space_lvl->local_remaining -= nzk;

2528:       ui[k+1] = ui[k] + nzk;
2529:     }

2531:     ISRestoreIndices(perm,&rip);
2532:     PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);

2534:     /* destroy list of free space and other temporary array(s) */
2535:     PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);
2536:     PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag); /* store matrix factor  */
2537:     PetscIncompleteLLDestroy(lnk,lnkbt);
2538:     PetscFreeSpaceDestroy(free_space_lvl);

2540:   } /* end of case: levels>0 || (levels=0 && !perm_identity) */

2542:   /* put together the new matrix in MATSEQSBAIJ format */
2543:   MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);

2545:   b = (Mat_SeqSBAIJ*)(fact)->data;
2546:   PetscFree2(b->imax,b->ilen);
2547:   b->singlemalloc = PETSC_FALSE;
2548:   b->free_a  = PETSC_TRUE;
2549:   b->free_ij = free_ij;
2550:   PetscMalloc((ui[am]+1)*sizeof(MatScalar),&b->a);
2551:   b->j     = uj;
2552:   b->i     = ui;
2553:   b->diag  = udiag;
2554:   b->free_diag = PETSC_TRUE;
2555:   b->ilen  = 0;
2556:   b->imax  = 0;
2557:   b->row   = perm;
2558:   b->col   = perm;
2559:   PetscObjectReference((PetscObject)perm);
2560:   PetscObjectReference((PetscObject)perm);
2561:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2562:   PetscMalloc((am+1)*sizeof(PetscScalar),&b->solve_work);
2563:   PetscLogObjectMemory(fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));
2564:   b->maxnz = b->nz = ui[am];
2565: 
2566:   fact->info.factor_mallocs    = reallocs;
2567:   fact->info.fill_ratio_given  = fill;
2568:   if (ai[am] != 0) {
2569:     fact->info.fill_ratio_needed = ((PetscReal)ui[am])/ai[am];
2570:   } else {
2571:     fact->info.fill_ratio_needed = 0.0;
2572:   }
2573: #if defined(PETSC_USE_INFO)
2574:     if (ai[am] != 0) {
2575:       PetscReal af = fact->info.fill_ratio_needed;
2576:       PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);
2577:       PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
2578:       PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);
2579:     } else {
2580:       PetscInfo(A,"Empty matrix.\n");
2581:     }
2582: #endif
2583:   fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2584:   return(0);
2585: }

2589: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2590: {
2591:   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
2592:   Mat_SeqSBAIJ       *b;
2593:   PetscErrorCode     ierr;
2594:   PetscBool          perm_identity,free_ij = PETSC_TRUE,missing;
2595:   PetscInt           bs=A->rmap->bs,am=a->mbs,d;
2596:   const PetscInt     *cols,*rip,*ai=a->i,*aj=a->j;
2597:   PetscInt           reallocs=0,i,*ui;
2598:   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2599:   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
2600:   PetscReal          fill=info->fill,levels=info->levels,ratio_needed;
2601:   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
2602:   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
2603:   PetscBT            lnkbt;

2606:   MatMissingDiagonal(A,&missing,&d);
2607:   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);

2609:   /*  
2610:    This code originally uses Modified Sparse Row (MSR) storage
2611:    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2612:    Then it is rewritten so the factor B takes seqsbaij format. However the associated 
2613:    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1, 
2614:    thus the original code in MSR format is still used for these cases. 
2615:    The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever 
2616:    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2617:   */
2618:   if (bs > 1){
2619:     MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);
2620:     return(0);
2621:   }

2623:   /* check whether perm is the identity mapping */
2624:   ISIdentity(perm,&perm_identity);
2625:   if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2626:   a->permute = PETSC_FALSE;

2628:   /* special case that simply copies fill pattern */
2629:   if (!levels ) {
2630:     /* reuse the column pointers and row offsets for memory savings */
2631:     ui           = a->i;
2632:     uj           = a->j;
2633:     free_ij      = PETSC_FALSE;
2634:     ratio_needed = 1.0;
2635:   } else { /* case: levels>0 */
2636:     ISGetIndices(perm,&rip);

2638:     /* initialization */
2639:     PetscMalloc((am+1)*sizeof(PetscInt),&ui);
2640:     ui[0] = 0;

2642:     /* jl: linked list for storing indices of the pivot rows 
2643:        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2644:     PetscMalloc4(am,PetscInt*,&uj_ptr,am,PetscInt*,&uj_lvl_ptr,am,PetscInt,&il,am,PetscInt,&jl);
2645:     for (i=0; i<am; i++){
2646:       jl[i] = am; il[i] = 0;
2647:     }

2649:     /* create and initialize a linked list for storing column indices of the active row k */
2650:     nlnk = am + 1;
2651:     PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);

2653:     /* initial FreeSpace size is fill*(ai[am]+1) */
2654:     PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);
2655:     current_space = free_space;
2656:     PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);
2657:     current_space_lvl = free_space_lvl;

2659:     for (k=0; k<am; k++){  /* for each active row k */
2660:       /* initialize lnk by the column indices of row rip[k] */
2661:       nzk   = 0;
2662:       ncols = ai[rip[k]+1] - ai[rip[k]];
2663:       cols  = aj+ai[rip[k]];
2664:       PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
2665:       nzk += nlnk;

2667:       /* update lnk by computing fill-in for each pivot row to be merged in */
2668:       prow = jl[k]; /* 1st pivot row */
2669: 
2670:       while (prow < k){
2671:         nextprow = jl[prow];
2672: 
2673:         /* merge prow into k-th row */
2674:         jmin = il[prow] + 1;  /* index of the 2nd nzero entry in U(prow,k:am-1) */
2675:         jmax = ui[prow+1];
2676:         ncols = jmax-jmin;
2677:         i     = jmin - ui[prow];
2678:         cols  = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2679:         j     = *(uj_lvl_ptr[prow] + i - 1);
2680:         cols_lvl = uj_lvl_ptr[prow]+i;
2681:         PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2682:         nzk += nlnk;

2684:         /* update il and jl for prow */
2685:         if (jmin < jmax){
2686:           il[prow] = jmin;
2687:           j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2688:         }
2689:         prow = nextprow;
2690:       }

2692:       /* if free space is not available, make more free space */
2693:       if (current_space->local_remaining<nzk) {
2694:         i = am - k + 1; /* num of unfactored rows */
2695:         i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2696:         PetscFreeSpaceGet(i,&current_space);
2697:         PetscFreeSpaceGet(i,&current_space_lvl);
2698:         reallocs++;
2699:       }

2701:       /* copy data into free_space and free_space_lvl, then initialize lnk */
2702:       PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);

2704:       /* add the k-th row into il and jl */
2705:       if (nzk-1 > 0){
2706:         i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2707:         jl[k] = jl[i]; jl[i] = k;
2708:         il[k] = ui[k] + 1;
2709:       }
2710:       uj_ptr[k]     = current_space->array;
2711:       uj_lvl_ptr[k] = current_space_lvl->array;

2713:       current_space->array           += nzk;
2714:       current_space->local_used      += nzk;
2715:       current_space->local_remaining -= nzk;
2716:       current_space_lvl->array           += nzk;
2717:       current_space_lvl->local_used      += nzk;
2718:       current_space_lvl->local_remaining -= nzk;

2720:       ui[k+1] = ui[k] + nzk;
2721:     }

2723:     ISRestoreIndices(perm,&rip);
2724:     PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);

2726:     /* destroy list of free space and other temporary array(s) */
2727:     PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);
2728:     PetscFreeSpaceContiguous(&free_space,uj);
2729:     PetscIncompleteLLDestroy(lnk,lnkbt);
2730:     PetscFreeSpaceDestroy(free_space_lvl);
2731:     if (ai[am] != 0) {
2732:       ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2733:     } else {
2734:       ratio_needed = 0.0;
2735:     }
2736:   } /* end of case: levels>0 || (levels=0 && !perm_identity) */

2738:   /* put together the new matrix in MATSEQSBAIJ format */
2739:   MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);

2741:   b = (Mat_SeqSBAIJ*)(fact)->data;
2742:   PetscFree2(b->imax,b->ilen);
2743:   b->singlemalloc = PETSC_FALSE;
2744:   b->free_a  = PETSC_TRUE;
2745:   b->free_ij = free_ij;
2746:   PetscMalloc((ui[am]+1)*sizeof(MatScalar),&b->a);
2747:   b->j    = uj;
2748:   b->i    = ui;
2749:   b->diag    = 0;
2750:   b->ilen    = 0;
2751:   b->imax    = 0;
2752:   b->row     = perm;
2753:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2754:   PetscObjectReference((PetscObject)perm);
2755:   b->icol = perm;
2756:   PetscObjectReference((PetscObject)perm);
2757:   PetscMalloc((am+1)*sizeof(PetscScalar),&b->solve_work);
2758:   b->maxnz = b->nz = ui[am];
2759: 
2760:   fact->info.factor_mallocs    = reallocs;
2761:   fact->info.fill_ratio_given  = fill;
2762:   fact->info.fill_ratio_needed = ratio_needed;
2763: #if defined(PETSC_USE_INFO)
2764:     if (ai[am] != 0) {
2765:       PetscReal af = fact->info.fill_ratio_needed;
2766:       PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);
2767:       PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
2768:       PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);
2769:     } else {
2770:       PetscInfo(A,"Empty matrix.\n");
2771:     }
2772: #endif
2773:   if (perm_identity){
2774:     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2775:   } else {
2776:     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2777:   }
2778:   return(0);
2779: }