Actual source code: sbaijfact2.c
petsc-3.3-p7 2013-05-11
2: /*
3: Factorization code for SBAIJ format.
4: */
6: #include <../src/mat/impls/sbaij/seq/sbaij.h>
7: #include <../src/mat/impls/baij/seq/baij.h>
8: #include <../src/mat/blockinvert.h>
12: PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
13: {
14: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
15: IS isrow=a->row;
16: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
17: PetscErrorCode ierr;
18: const PetscInt *r;
19: PetscInt nz,*vj,k,idx,k1;
20: PetscInt bs=A->rmap->bs,bs2 = a->bs2;
21: MatScalar *aa=a->a,*v,*diag;
22: PetscScalar *x,*xk,*xj,*b,*xk_tmp,*t;
25: VecGetArray(bb,&b);
26: VecGetArray(xx,&x);
27: t = a->solve_work;
28: ISGetIndices(isrow,&r);
29: PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);
31: /* solve U^T * D * y = b by forward substitution */
32: xk = t;
33: for (k=0; k<mbs; k++) { /* t <- perm(b) */
34: idx = bs*r[k];
35: for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
36: }
37: for (k=0; k<mbs; k++){
38: v = aa + bs2*ai[k];
39: xk = t + k*bs; /* Dk*xk = k-th block of x */
40: PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
41: nz = ai[k+1] - ai[k];
42: vj = aj + ai[k];
43: xj = t + (*vj)*bs; /* *vj-th block of x, *vj>k */
44: while (nz--) {
45: /* x(:) += U(k,:)^T*(Dk*xk) */
46: PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
47: vj++; xj = t + (*vj)*bs;
48: v += bs2;
49: }
50: /* xk = inv(Dk)*(Dk*xk) */
51: diag = aa+k*bs2; /* ptr to inv(Dk) */
52: PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
53: }
55: /* solve U*x = y by back substitution */
56: for (k=mbs-1; k>=0; k--){
57: v = aa + bs2*ai[k];
58: xk = t + k*bs; /* xk */
59: nz = ai[k+1] - ai[k];
60: vj = aj + ai[k];
61: xj = t + (*vj)*bs;
62: while (nz--) {
63: /* xk += U(k,:)*x(:) */
64: PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
65: vj++;
66: v += bs2; xj = t + (*vj)*bs;
67: }
68: idx = bs*r[k];
69: for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
70: }
72: PetscFree(xk_tmp);
73: ISRestoreIndices(isrow,&r);
74: VecRestoreArray(bb,&b);
75: VecRestoreArray(xx,&x);
76: PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);
77: return(0);
78: }
82: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
83: {
85: SETERRQ(PETSC_COMM_SELF,1,"not implemented yet");
86: return(0);
87: }
91: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
92: {
94: SETERRQ(PETSC_COMM_SELF,1,"not implemented yet");
95: return(0);
96: }
100: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
101: {
103: PetscInt nz,*vj,k;
104: PetscInt bs2 = bs*bs;
105: MatScalar *v,*diag;
106: PetscScalar *xk,*xj,*xk_tmp;
107:
109: PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);
110: for (k=0; k<mbs; k++){
111: v = aa + bs2*ai[k];
112: xk = x + k*bs; /* Dk*xk = k-th block of x */
113: PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
114: nz = ai[k+1] - ai[k];
115: vj = aj + ai[k];
116: xj = x + (*vj)*bs; /* *vj-th block of x, *vj>k */
117: while (nz--) {
118: /* x(:) += U(k,:)^T*(Dk*xk) */
119: PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
120: vj++; xj = x + (*vj)*bs;
121: v += bs2;
122: }
123: /* xk = inv(Dk)*(Dk*xk) */
124: diag = aa+k*bs2; /* ptr to inv(Dk) */
125: PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
126: }
127: PetscFree(xk_tmp);
128: return(0);
129: }
133: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
134: {
135: PetscInt nz,*vj,k;
136: PetscInt bs2 = bs*bs;
137: MatScalar *v;
138: PetscScalar *xk,*xj;
141: for (k=mbs-1; k>=0; k--){
142: v = aa + bs2*ai[k];
143: xk = x + k*bs; /* xk */
144: nz = ai[k+1] - ai[k];
145: vj = aj + ai[k];
146: xj = x + (*vj)*bs;
147: while (nz--) {
148: /* xk += U(k,:)*x(:) */
149: PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
150: vj++;
151: v += bs2; xj = x + (*vj)*bs;
152: }
153: }
154: return(0);
155: }
159: PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
160: {
161: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
163: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
164: PetscInt bs=A->rmap->bs;
165: MatScalar *aa=a->a;
166: PetscScalar *x,*b;
169: VecGetArray(bb,&b);
170: VecGetArray(xx,&x);
172: /* solve U^T * D * y = b by forward substitution */
173: PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar)); /* x <- b */
174: MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
176: /* solve U*x = y by back substitution */
177: MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
179: VecRestoreArray(bb,&b);
180: VecRestoreArray(xx,&x);
181: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
182: return(0);
183: }
187: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
188: {
189: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
191: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
192: PetscInt bs=A->rmap->bs;
193: MatScalar *aa=a->a;
194: PetscScalar *x,*b;
197: VecGetArray(bb,&b);
198: VecGetArray(xx,&x);
199: PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar)); /* x <- b */
200: MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
201: VecRestoreArray(bb,&b);
202: VecRestoreArray(xx,&x);
203: PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
204: return(0);
205: }
209: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
210: {
211: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
213: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
214: PetscInt bs=A->rmap->bs;
215: MatScalar *aa=a->a;
216: PetscScalar *x,*b;
219: VecGetArray(bb,&b);
220: VecGetArray(xx,&x);
221: PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));
222: MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
223: VecRestoreArray(bb,&b);
224: VecRestoreArray(xx,&x);
225: PetscLogFlops(2.0*bs2*(a->nz-mbs));
226: return(0);
227: }
231: PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
232: {
233: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
234: IS isrow=a->row;
235: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2,bs=A->rmap->bs;
237: const PetscInt *r;
238: PetscInt nz,*vj,k,idx;
239: MatScalar *aa=a->a,*v,*d;
240: PetscScalar *x,*b,x0,x1,x2,x3,x4,x5,x6,*t,*tp;
243: VecGetArray(bb,&b);
244: VecGetArray(xx,&x);
245: t = a->solve_work;
246: ISGetIndices(isrow,&r);
248: /* solve U^T * D * y = b by forward substitution */
249: tp = t;
250: for (k=0; k<mbs; k++) { /* t <- perm(b) */
251: idx = 7*r[k];
252: tp[0] = b[idx];
253: tp[1] = b[idx+1];
254: tp[2] = b[idx+2];
255: tp[3] = b[idx+3];
256: tp[4] = b[idx+4];
257: tp[5] = b[idx+5];
258: tp[6] = b[idx+6];
259: tp += 7;
260: }
261:
262: for (k=0; k<mbs; k++){
263: v = aa + 49*ai[k];
264: vj = aj + ai[k];
265: tp = t + k*7;
266: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
267: nz = ai[k+1] - ai[k];
268: tp = t + (*vj)*7;
269: while (nz--) {
270: tp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
271: tp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
272: tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
273: tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
274: tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
275: tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
276: tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
277: vj++; tp = t + (*vj)*7;
278: v += 49;
279: }
281: /* xk = inv(Dk)*(Dk*xk) */
282: d = aa+k*49; /* ptr to inv(Dk) */
283: tp = t + k*7;
284: tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
285: tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
286: tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
287: tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
288: tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
289: tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
290: tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
291: }
293: /* solve U*x = y by back substitution */
294: for (k=mbs-1; k>=0; k--){
295: v = aa + 49*ai[k];
296: vj = aj + ai[k];
297: tp = t + k*7;
298: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6]; /* xk */
299: nz = ai[k+1] - ai[k];
300:
301: tp = t + (*vj)*7;
302: while (nz--) {
303: /* xk += U(k,:)*x(:) */
304: x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
305: x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
306: x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
307: x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
308: x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
309: x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
310: x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
311: vj++; tp = t + (*vj)*7;
312: v += 49;
313: }
314: tp = t + k*7;
315: tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
316: idx = 7*r[k];
317: x[idx] = x0;
318: x[idx+1] = x1;
319: x[idx+2] = x2;
320: x[idx+3] = x3;
321: x[idx+4] = x4;
322: x[idx+5] = x5;
323: x[idx+6] = x6;
324: }
326: ISRestoreIndices(isrow,&r);
327: VecRestoreArray(bb,&b);
328: VecRestoreArray(xx,&x);
329: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
330: return(0);
331: }
335: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
336: {
337: MatScalar *v,*d;
338: PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6;
339: PetscInt nz,*vj,k;
342: for (k=0; k<mbs; k++){
343: v = aa + 49*ai[k];
344: xp = x + k*7;
345: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
346: nz = ai[k+1] - ai[k];
347: vj = aj + ai[k];
348: xp = x + (*vj)*7;
349: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
350: PetscPrefetchBlock(v+49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
351: while (nz--) {
352: /* x(:) += U(k,:)^T*(Dk*xk) */
353: xp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
354: xp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
355: xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
356: xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
357: xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
358: xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
359: xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
360: vj++; xp = x + (*vj)*7;
361: v += 49;
362: }
363: /* xk = inv(Dk)*(Dk*xk) */
364: d = aa+k*49; /* ptr to inv(Dk) */
365: xp = x + k*7;
366: xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
367: xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
368: xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
369: xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
370: xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
371: xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
372: xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
373: }
374: return(0);
375: }
379: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
380: {
381: MatScalar *v;
382: PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6;
383: PetscInt nz,*vj,k;
386: for (k=mbs-1; k>=0; k--){
387: v = aa + 49*ai[k];
388: xp = x + k*7;
389: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
390: nz = ai[k+1] - ai[k];
391: vj = aj + ai[k];
392: xp = x + (*vj)*7;
393: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
394: PetscPrefetchBlock(v-49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
395: while (nz--) {
396: /* xk += U(k,:)*x(:) */
397: x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
398: x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
399: x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
400: x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
401: x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
402: x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
403: x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
404: vj++;
405: v += 49; xp = x + (*vj)*7;
406: }
407: xp = x + k*7;
408: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
409: }
410: return(0);
411: }
415: PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
416: {
417: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
419: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
420: MatScalar *aa=a->a;
421: PetscScalar *x,*b;
424: VecGetArray(bb,&b);
425: VecGetArray(xx,&x);
426:
427: /* solve U^T * D * y = b by forward substitution */
428: PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar)); /* x <- b */
429: MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
431: /* solve U*x = y by back substitution */
432: MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
434: VecRestoreArray(bb,&b);
435: VecRestoreArray(xx,&x);
436: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
437: return(0);
438: }
442: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
443: {
444: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
446: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
447: MatScalar *aa=a->a;
448: PetscScalar *x,*b;
451: VecGetArray(bb,&b);
452: VecGetArray(xx,&x);
453: PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));
454: MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
455: VecRestoreArray(bb,&b);
456: VecRestoreArray(xx,&x);
457: PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
458: return(0);
459: }
463: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
464: {
465: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
467: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
468: MatScalar *aa=a->a;
469: PetscScalar *x,*b;
472: VecGetArray(bb,&b);
473: VecGetArray(xx,&x);
474: PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));
475: MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
476: VecRestoreArray(bb,&b);
477: VecRestoreArray(xx,&x);
478: PetscLogFlops(2.0*bs2*(a->nz-mbs));
479: return(0);
480: }
484: PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
485: {
486: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
487: IS isrow=a->row;
488: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
490: const PetscInt *r;
491: PetscInt nz,*vj,k,idx;
492: MatScalar *aa=a->a,*v,*d;
493: PetscScalar *x,*b,x0,x1,x2,x3,x4,x5,*t,*tp;
496: VecGetArray(bb,&b);
497: VecGetArray(xx,&x);
498: t = a->solve_work;
499: ISGetIndices(isrow,&r);
501: /* solve U^T * D * y = b by forward substitution */
502: tp = t;
503: for (k=0; k<mbs; k++) { /* t <- perm(b) */
504: idx = 6*r[k];
505: tp[0] = b[idx];
506: tp[1] = b[idx+1];
507: tp[2] = b[idx+2];
508: tp[3] = b[idx+3];
509: tp[4] = b[idx+4];
510: tp[5] = b[idx+5];
511: tp += 6;
512: }
513:
514: for (k=0; k<mbs; k++){
515: v = aa + 36*ai[k];
516: vj = aj + ai[k];
517: tp = t + k*6;
518: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
519: nz = ai[k+1] - ai[k];
520: tp = t + (*vj)*6;
521: while (nz--) {
522: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
523: tp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
524: tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
525: tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
526: tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
527: tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
528: vj++; tp = t + (*vj)*6;
529: v += 36;
530: }
532: /* xk = inv(Dk)*(Dk*xk) */
533: d = aa+k*36; /* ptr to inv(Dk) */
534: tp = t + k*6;
535: tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
536: tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
537: tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
538: tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
539: tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
540: tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
541: }
543: /* solve U*x = y by back substitution */
544: for (k=mbs-1; k>=0; k--){
545: v = aa + 36*ai[k];
546: vj = aj + ai[k];
547: tp = t + k*6;
548: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */
549: nz = ai[k+1] - ai[k];
550:
551: tp = t + (*vj)*6;
552: while (nz--) {
553: /* xk += U(k,:)*x(:) */
554: x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
555: x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
556: x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
557: x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
558: x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
559: x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
560: vj++; tp = t + (*vj)*6;
561: v += 36;
562: }
563: tp = t + k*6;
564: tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
565: idx = 6*r[k];
566: x[idx] = x0;
567: x[idx+1] = x1;
568: x[idx+2] = x2;
569: x[idx+3] = x3;
570: x[idx+4] = x4;
571: x[idx+5] = x5;
572: }
574: ISRestoreIndices(isrow,&r);
575: VecRestoreArray(bb,&b);
576: VecRestoreArray(xx,&x);
577: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
578: return(0);
579: }
583: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
584: {
585: MatScalar *v,*d;
586: PetscScalar *xp,x0,x1,x2,x3,x4,x5;
587: PetscInt nz,*vj,k;
590: for (k=0; k<mbs; k++){
591: v = aa + 36*ai[k];
592: xp = x + k*6;
593: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
594: nz = ai[k+1] - ai[k];
595: vj = aj + ai[k];
596: xp = x + (*vj)*6;
597: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
598: PetscPrefetchBlock(v+36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
599: while (nz--) {
600: /* x(:) += U(k,:)^T*(Dk*xk) */
601: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
602: xp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
603: xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
604: xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
605: xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
606: xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
607: vj++; xp = x + (*vj)*6;
608: v += 36;
609: }
610: /* xk = inv(Dk)*(Dk*xk) */
611: d = aa+k*36; /* ptr to inv(Dk) */
612: xp = x + k*6;
613: xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
614: xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
615: xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
616: xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
617: xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
618: xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
619: }
620: return(0);
621: }
624: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
625: {
626: MatScalar *v;
627: PetscScalar *xp,x0,x1,x2,x3,x4,x5;
628: PetscInt nz,*vj,k;
631: for (k=mbs-1; k>=0; k--){
632: v = aa + 36*ai[k];
633: xp = x + k*6;
634: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
635: nz = ai[k+1] - ai[k];
636: vj = aj + ai[k];
637: xp = x + (*vj)*6;
638: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
639: PetscPrefetchBlock(v-36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
640: while (nz--) {
641: /* xk += U(k,:)*x(:) */
642: x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
643: x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
644: x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
645: x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
646: x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
647: x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
648: vj++;
649: v += 36; xp = x + (*vj)*6;
650: }
651: xp = x + k*6;
652: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
653: }
654: return(0);
655: }
660: PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
661: {
662: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
663: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
664: MatScalar *aa=a->a;
665: PetscScalar *x,*b;
669: VecGetArray(bb,&b);
670: VecGetArray(xx,&x);
671:
672: /* solve U^T * D * y = b by forward substitution */
673: PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
674: MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
676: /* solve U*x = y by back substitution */
677: MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
679: VecRestoreArray(bb,&b);
680: VecRestoreArray(xx,&x);
681: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
682: return(0);
683: }
687: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
688: {
689: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
690: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
691: MatScalar *aa=a->a;
692: PetscScalar *x,*b;
696: VecGetArray(bb,&b);
697: VecGetArray(xx,&x);
698: PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
699: MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
700: VecRestoreArray(bb,&b);
701: VecRestoreArray(xx,&x);
702: PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
703: return(0);
704: }
708: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
709: {
710: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
711: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
712: MatScalar *aa=a->a;
713: PetscScalar *x,*b;
717: VecGetArray(bb,&b);
718: VecGetArray(xx,&x);
719: PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
720: MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
721: VecRestoreArray(bb,&b);
722: VecRestoreArray(xx,&x);
723: PetscLogFlops(2.0*bs2*(a->nz - mbs));
724: return(0);
725: }
729: PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
730: {
731: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
732: IS isrow=a->row;
733: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2;
735: const PetscInt *r;
736: PetscInt nz,*vj,k,idx;
737: MatScalar *aa=a->a,*v,*diag;
738: PetscScalar *x,*b,x0,x1,x2,x3,x4,*t,*tp;
741: VecGetArray(bb,&b);
742: VecGetArray(xx,&x);
743: t = a->solve_work;
744: ISGetIndices(isrow,&r);
746: /* solve U^T * D * y = b by forward substitution */
747: tp = t;
748: for (k=0; k<mbs; k++) { /* t <- perm(b) */
749: idx = 5*r[k];
750: tp[0] = b[idx];
751: tp[1] = b[idx+1];
752: tp[2] = b[idx+2];
753: tp[3] = b[idx+3];
754: tp[4] = b[idx+4];
755: tp += 5;
756: }
757:
758: for (k=0; k<mbs; k++){
759: v = aa + 25*ai[k];
760: vj = aj + ai[k];
761: tp = t + k*5;
762: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
763: nz = ai[k+1] - ai[k];
765: tp = t + (*vj)*5;
766: while (nz--) {
767: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
768: tp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
769: tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
770: tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
771: tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
772: vj++; tp = t + (*vj)*5;
773: v += 25;
774: }
776: /* xk = inv(Dk)*(Dk*xk) */
777: diag = aa+k*25; /* ptr to inv(Dk) */
778: tp = t + k*5;
779: tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
780: tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
781: tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
782: tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
783: tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
784: }
786: /* solve U*x = y by back substitution */
787: for (k=mbs-1; k>=0; k--){
788: v = aa + 25*ai[k];
789: vj = aj + ai[k];
790: tp = t + k*5;
791: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];/* xk */
792: nz = ai[k+1] - ai[k];
793:
794: tp = t + (*vj)*5;
795: while (nz--) {
796: /* xk += U(k,:)*x(:) */
797: x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
798: x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
799: x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
800: x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
801: x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
802: vj++; tp = t + (*vj)*5;
803: v += 25;
804: }
805: tp = t + k*5;
806: tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
807: idx = 5*r[k];
808: x[idx] = x0;
809: x[idx+1] = x1;
810: x[idx+2] = x2;
811: x[idx+3] = x3;
812: x[idx+4] = x4;
813: }
815: ISRestoreIndices(isrow,&r);
816: VecRestoreArray(bb,&b);
817: VecRestoreArray(xx,&x);
818: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
819: return(0);
820: }
824: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
825: {
826: MatScalar *v,*diag;
827: PetscScalar *xp,x0,x1,x2,x3,x4;
828: PetscInt nz,*vj,k;
831: for (k=0; k<mbs; k++){
832: v = aa + 25*ai[k];
833: xp = x + k*5;
834: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
835: nz = ai[k+1] - ai[k];
836: vj = aj + ai[k];
837: xp = x + (*vj)*5;
838: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
839: PetscPrefetchBlock(v+25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
840: while (nz--) {
841: /* x(:) += U(k,:)^T*(Dk*xk) */
842: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
843: xp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
844: xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
845: xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
846: xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
847: vj++; xp = x + (*vj)*5;
848: v += 25;
849: }
850: /* xk = inv(Dk)*(Dk*xk) */
851: diag = aa+k*25; /* ptr to inv(Dk) */
852: xp = x + k*5;
853: xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
854: xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
855: xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
856: xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
857: xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
858: }
859: return(0);
860: }
864: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
865: {
866: MatScalar *v;
867: PetscScalar *xp,x0,x1,x2,x3,x4;
868: PetscInt nz,*vj,k;
871: for (k=mbs-1; k>=0; k--){
872: v = aa + 25*ai[k];
873: xp = x + k*5;
874: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* xk */
875: nz = ai[k+1] - ai[k];
876: vj = aj + ai[k];
877: xp = x + (*vj)*5;
878: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
879: PetscPrefetchBlock(v-25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
880: while (nz--) {
881: /* xk += U(k,:)*x(:) */
882: x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
883: x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
884: x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
885: x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
886: x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
887: vj++;
888: v += 25; xp = x + (*vj)*5;
889: }
890: xp = x + k*5;
891: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
892: }
893: return(0);
894: }
898: PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
899: {
900: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
901: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2;
902: MatScalar *aa=a->a;
903: PetscScalar *x,*b;
907: VecGetArray(bb,&b);
908: VecGetArray(xx,&x);
910: /* solve U^T * D * y = b by forward substitution */
911: PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
912: MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
914: /* solve U*x = y by back substitution */
915: MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
917: VecRestoreArray(bb,&b);
918: VecRestoreArray(xx,&x);
919: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
920: return(0);
921: }
925: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
926: {
927: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
928: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
929: MatScalar *aa=a->a;
930: PetscScalar *x,*b;
934: VecGetArray(bb,&b);
935: VecGetArray(xx,&x);
936: PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
937: MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
938: VecRestoreArray(bb,&b);
939: VecRestoreArray(xx,&x);
940: PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
941: return(0);
942: }
946: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
947: {
948: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
949: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
950: MatScalar *aa=a->a;
951: PetscScalar *x,*b;
955: VecGetArray(bb,&b);
956: VecGetArray(xx,&x);
957: PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));
958: MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
959: VecRestoreArray(bb,&b);
960: VecRestoreArray(xx,&x);
961: PetscLogFlops(2.0*bs2*(a->nz-mbs));
962: return(0);
963: }
967: PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
968: {
969: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
970: IS isrow=a->row;
971: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
973: const PetscInt *r;
974: PetscInt nz,*vj,k,idx;
975: MatScalar *aa=a->a,*v,*diag;
976: PetscScalar *x,*b,x0,x1,x2,x3,*t,*tp;
979: VecGetArray(bb,&b);
980: VecGetArray(xx,&x);
981: t = a->solve_work;
982: ISGetIndices(isrow,&r);
984: /* solve U^T * D * y = b by forward substitution */
985: tp = t;
986: for (k=0; k<mbs; k++) { /* t <- perm(b) */
987: idx = 4*r[k];
988: tp[0] = b[idx];
989: tp[1] = b[idx+1];
990: tp[2] = b[idx+2];
991: tp[3] = b[idx+3];
992: tp += 4;
993: }
994:
995: for (k=0; k<mbs; k++){
996: v = aa + 16*ai[k];
997: vj = aj + ai[k];
998: tp = t + k*4;
999: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
1000: nz = ai[k+1] - ai[k];
1002: tp = t + (*vj)*4;
1003: while (nz--) {
1004: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1005: tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1006: tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1007: tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1008: vj++; tp = t + (*vj)*4;
1009: v += 16;
1010: }
1012: /* xk = inv(Dk)*(Dk*xk) */
1013: diag = aa+k*16; /* ptr to inv(Dk) */
1014: tp = t + k*4;
1015: tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1016: tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1017: tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1018: tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1019: }
1021: /* solve U*x = y by back substitution */
1022: for (k=mbs-1; k>=0; k--){
1023: v = aa + 16*ai[k];
1024: vj = aj + ai[k];
1025: tp = t + k*4;
1026: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
1027: nz = ai[k+1] - ai[k];
1028:
1029: tp = t + (*vj)*4;
1030: while (nz--) {
1031: /* xk += U(k,:)*x(:) */
1032: x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
1033: x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
1034: x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
1035: x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
1036: vj++; tp = t + (*vj)*4;
1037: v += 16;
1038: }
1039: tp = t + k*4;
1040: tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
1041: idx = 4*r[k];
1042: x[idx] = x0;
1043: x[idx+1] = x1;
1044: x[idx+2] = x2;
1045: x[idx+3] = x3;
1046: }
1048: ISRestoreIndices(isrow,&r);
1049: VecRestoreArray(bb,&b);
1050: VecRestoreArray(xx,&x);
1051: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1052: return(0);
1053: }
1057: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1058: {
1059: MatScalar *v,*diag;
1060: PetscScalar *xp,x0,x1,x2,x3;
1061: PetscInt nz,*vj,k;
1064: for (k=0; k<mbs; k++){
1065: v = aa + 16*ai[k];
1066: xp = x + k*4;
1067: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
1068: nz = ai[k+1] - ai[k];
1069: vj = aj + ai[k];
1070: xp = x + (*vj)*4;
1071: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1072: PetscPrefetchBlock(v+16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1073: while (nz--) {
1074: /* x(:) += U(k,:)^T*(Dk*xk) */
1075: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1076: xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1077: xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1078: xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1079: vj++; xp = x + (*vj)*4;
1080: v += 16;
1081: }
1082: /* xk = inv(Dk)*(Dk*xk) */
1083: diag = aa+k*16; /* ptr to inv(Dk) */
1084: xp = x + k*4;
1085: xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1086: xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1087: xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1088: xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1089: }
1090: return(0);
1091: }
1095: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1096: {
1097: MatScalar *v;
1098: PetscScalar *xp,x0,x1,x2,x3;
1099: PetscInt nz,*vj,k;
1102: for (k=mbs-1; k>=0; k--){
1103: v = aa + 16*ai[k];
1104: xp = x + k*4;
1105: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
1106: nz = ai[k+1] - ai[k];
1107: vj = aj + ai[k];
1108: xp = x + (*vj)*4;
1109: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1110: PetscPrefetchBlock(v-16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1111: while (nz--) {
1112: /* xk += U(k,:)*x(:) */
1113: x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
1114: x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
1115: x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
1116: x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
1117: vj++;
1118: v += 16; xp = x + (*vj)*4;
1119: }
1120: xp = x + k*4;
1121: xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
1122: }
1123: return(0);
1124: }
1128: PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1129: {
1130: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1131: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1132: MatScalar *aa=a->a;
1133: PetscScalar *x,*b;
1137: VecGetArray(bb,&b);
1138: VecGetArray(xx,&x);
1140: /* solve U^T * D * y = b by forward substitution */
1141: PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
1142: MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1144: /* solve U*x = y by back substitution */
1145: MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1146: VecRestoreArray(bb,&b);
1147: VecRestoreArray(xx,&x);
1148: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1149: return(0);
1150: }
1154: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1155: {
1156: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1157: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1158: MatScalar *aa=a->a;
1159: PetscScalar *x,*b;
1163: VecGetArray(bb,&b);
1164: VecGetArray(xx,&x);
1165: PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
1166: MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1167: VecRestoreArray(bb,&b);
1168: VecRestoreArray(xx,&x);
1169: PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
1170: return(0);
1171: }
1175: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1176: {
1177: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1178: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1179: MatScalar *aa=a->a;
1180: PetscScalar *x,*b;
1184: VecGetArray(bb,&b);
1185: VecGetArray(xx,&x);
1186: PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));
1187: MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1188: VecRestoreArray(bb,&b);
1189: VecRestoreArray(xx,&x);
1190: PetscLogFlops(2.0*bs2*(a->nz-mbs));
1191: return(0);
1192: }
1196: PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1197: {
1198: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1199: IS isrow=a->row;
1200: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1202: const PetscInt *r;
1203: PetscInt nz,*vj,k,idx;
1204: MatScalar *aa=a->a,*v,*diag;
1205: PetscScalar *x,*b,x0,x1,x2,*t,*tp;
1208: VecGetArray(bb,&b);
1209: VecGetArray(xx,&x);
1210: t = a->solve_work;
1211: ISGetIndices(isrow,&r);
1213: /* solve U^T * D * y = b by forward substitution */
1214: tp = t;
1215: for (k=0; k<mbs; k++) { /* t <- perm(b) */
1216: idx = 3*r[k];
1217: tp[0] = b[idx];
1218: tp[1] = b[idx+1];
1219: tp[2] = b[idx+2];
1220: tp += 3;
1221: }
1222:
1223: for (k=0; k<mbs; k++){
1224: v = aa + 9*ai[k];
1225: vj = aj + ai[k];
1226: tp = t + k*3;
1227: x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
1228: nz = ai[k+1] - ai[k];
1230: tp = t + (*vj)*3;
1231: while (nz--) {
1232: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1233: tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1234: tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1235: vj++; tp = t + (*vj)*3;
1236: v += 9;
1237: }
1239: /* xk = inv(Dk)*(Dk*xk) */
1240: diag = aa+k*9; /* ptr to inv(Dk) */
1241: tp = t + k*3;
1242: tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1243: tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1244: tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1245: }
1247: /* solve U*x = y by back substitution */
1248: for (k=mbs-1; k>=0; k--){
1249: v = aa + 9*ai[k];
1250: vj = aj + ai[k];
1251: tp = t + k*3;
1252: x0 = tp[0]; x1 = tp[1]; x2 = tp[2]; /* xk */
1253: nz = ai[k+1] - ai[k];
1254:
1255: tp = t + (*vj)*3;
1256: while (nz--) {
1257: /* xk += U(k,:)*x(:) */
1258: x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
1259: x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
1260: x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
1261: vj++; tp = t + (*vj)*3;
1262: v += 9;
1263: }
1264: tp = t + k*3;
1265: tp[0] = x0; tp[1] = x1; tp[2] = x2;
1266: idx = 3*r[k];
1267: x[idx] = x0;
1268: x[idx+1] = x1;
1269: x[idx+2] = x2;
1270: }
1272: ISRestoreIndices(isrow,&r);
1273: VecRestoreArray(bb,&b);
1274: VecRestoreArray(xx,&x);
1275: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1276: return(0);
1277: }
1281: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1282: {
1283: MatScalar *v,*diag;
1284: PetscScalar *xp,x0,x1,x2;
1285: PetscInt nz,*vj,k;
1288: for (k=0; k<mbs; k++){
1289: v = aa + 9*ai[k];
1290: xp = x + k*3;
1291: x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1292: nz = ai[k+1] - ai[k];
1293: vj = aj + ai[k];
1294: xp = x + (*vj)*3;
1295: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1296: PetscPrefetchBlock(v+9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1297: while (nz--) {
1298: /* x(:) += U(k,:)^T*(Dk*xk) */
1299: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1300: xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1301: xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1302: vj++; xp = x + (*vj)*3;
1303: v += 9;
1304: }
1305: /* xk = inv(Dk)*(Dk*xk) */
1306: diag = aa+k*9; /* ptr to inv(Dk) */
1307: xp = x + k*3;
1308: xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1309: xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1310: xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1311: }
1312: return(0);
1313: }
1317: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1318: {
1319: MatScalar *v;
1320: PetscScalar *xp,x0,x1,x2;
1321: PetscInt nz,*vj,k;
1324: for (k=mbs-1; k>=0; k--){
1325: v = aa + 9*ai[k];
1326: xp = x + k*3;
1327: x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* xk */
1328: nz = ai[k+1] - ai[k];
1329: vj = aj + ai[k];
1330: xp = x + (*vj)*3;
1331: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1332: PetscPrefetchBlock(v-9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1333: while (nz--) {
1334: /* xk += U(k,:)*x(:) */
1335: x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1336: x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1337: x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1338: vj++;
1339: v += 9; xp = x + (*vj)*3;
1340: }
1341: xp = x + k*3;
1342: xp[0] = x0; xp[1] = x1; xp[2] = x2;
1343: }
1344: return(0);
1345: }
1349: PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1350: {
1351: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1352: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1353: MatScalar *aa=a->a;
1354: PetscScalar *x,*b;
1356:
1358: VecGetArray(bb,&b);
1359: VecGetArray(xx,&x);
1361: /* solve U^T * D * y = b by forward substitution */
1362: PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1363: MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1365: /* solve U*x = y by back substitution */
1366: MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1368: VecRestoreArray(bb,&b);
1369: VecRestoreArray(xx,&x);
1370: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1371: return(0);
1372: }
1376: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1377: {
1378: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1379: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1380: MatScalar *aa=a->a;
1381: PetscScalar *x,*b;
1385: VecGetArray(bb,&b);
1386: VecGetArray(xx,&x);
1387: PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1388: MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1389: VecRestoreArray(bb,&b);
1390: VecRestoreArray(xx,&x);
1391: PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
1392: return(0);
1393: }
1397: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1398: {
1399: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1400: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1401: MatScalar *aa=a->a;
1402: PetscScalar *x,*b;
1406: VecGetArray(bb,&b);
1407: VecGetArray(xx,&x);
1408: PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1409: MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1410: VecRestoreArray(bb,&b);
1411: VecRestoreArray(xx,&x);
1412: PetscLogFlops(2.0*bs2*(a->nz-mbs));
1413: return(0);
1414: }
1418: PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1419: {
1420: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ *)A->data;
1421: IS isrow=a->row;
1422: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1424: const PetscInt *r;
1425: PetscInt nz,*vj,k,k2,idx;
1426: MatScalar *aa=a->a,*v,*diag;
1427: PetscScalar *x,*b,x0,x1,*t;
1430: VecGetArray(bb,&b);
1431: VecGetArray(xx,&x);
1432: t = a->solve_work;
1433: ISGetIndices(isrow,&r);
1435: /* solve U^T * D * y = perm(b) by forward substitution */
1436: for (k=0; k<mbs; k++) { /* t <- perm(b) */
1437: idx = 2*r[k];
1438: t[k*2] = b[idx];
1439: t[k*2+1] = b[idx+1];
1440: }
1441: for (k=0; k<mbs; k++){
1442: v = aa + 4*ai[k];
1443: vj = aj + ai[k];
1444: k2 = k*2;
1445: x0 = t[k2]; x1 = t[k2+1];
1446: nz = ai[k+1] - ai[k];
1447: while (nz--) {
1448: t[(*vj)*2] += v[0]*x0 + v[1]*x1;
1449: t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1450: vj++; v += 4;
1451: }
1452: diag = aa+k*4; /* ptr to inv(Dk) */
1453: t[k2] = diag[0]*x0 + diag[2]*x1;
1454: t[k2+1] = diag[1]*x0 + diag[3]*x1;
1455: }
1457: /* solve U*x = y by back substitution */
1458: for (k=mbs-1; k>=0; k--){
1459: v = aa + 4*ai[k];
1460: vj = aj + ai[k];
1461: k2 = k*2;
1462: x0 = t[k2]; x1 = t[k2+1];
1463: nz = ai[k+1] - ai[k];
1464: while (nz--) {
1465: x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1466: x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1467: vj++; v += 4;
1468: }
1469: t[k2] = x0;
1470: t[k2+1] = x1;
1471: idx = 2*r[k];
1472: x[idx] = x0;
1473: x[idx+1] = x1;
1474: }
1476: ISRestoreIndices(isrow,&r);
1477: VecRestoreArray(bb,&b);
1478: VecRestoreArray(xx,&x);
1479: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1480: return(0);
1481: }
1485: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1486: {
1487: MatScalar *v,*diag;
1488: PetscScalar x0,x1;
1489: PetscInt nz,*vj,k,k2;
1492: for (k=0; k<mbs; k++){
1493: v = aa + 4*ai[k];
1494: vj = aj + ai[k];
1495: k2 = k*2;
1496: x0 = x[k2]; x1 = x[k2+1]; /* Dk*xk = k-th block of x */
1497: nz = ai[k+1] - ai[k];
1498: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1499: PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1500: while (nz--) {
1501: /* x(:) += U(k,:)^T*(Dk*xk) */
1502: x[(*vj)*2] += v[0]*x0 + v[1]*x1;
1503: x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1504: vj++; v += 4;
1505: }
1506: /* xk = inv(Dk)*(Dk*xk) */
1507: diag = aa+k*4; /* ptr to inv(Dk) */
1508: x[k2] = diag[0]*x0 + diag[2]*x1;
1509: x[k2+1] = diag[1]*x0 + diag[3]*x1;
1510: }
1511: return(0);
1512: }
1516: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1517: {
1518: MatScalar *v;
1519: PetscScalar x0,x1;
1520: PetscInt nz,*vj,k,k2;
1523: for (k=mbs-1; k>=0; k--){
1524: v = aa + 4*ai[k];
1525: vj = aj + ai[k];
1526: k2 = k*2;
1527: x0 = x[k2]; x1 = x[k2+1]; /* xk */
1528: nz = ai[k+1] - ai[k];
1529: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1530: PetscPrefetchBlock(v-4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1531: while (nz--) {
1532: /* xk += U(k,:)*x(:) */
1533: x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1534: x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1535: vj++; v += 4;
1536: }
1537: x[k2] = x0;
1538: x[k2+1] = x1;
1539: }
1540: return(0);
1541: }
1545: PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1546: {
1547: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1548: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1549: MatScalar *aa=a->a;
1550: PetscScalar *x,*b;
1554: VecGetArray(bb,&b);
1555: VecGetArray(xx,&x);
1557: /* solve U^T * D * y = b by forward substitution */
1558: PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1559: MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1561: /* solve U*x = y by back substitution */
1562: MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1564: VecRestoreArray(bb,&b);
1565: VecRestoreArray(xx,&x);
1566: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1567: return(0);
1568: }
1572: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1573: {
1574: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1575: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1576: MatScalar *aa=a->a;
1577: PetscScalar *x,*b;
1581: VecGetArray(bb,&b);
1582: VecGetArray(xx,&x);
1583: PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1584: MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1585: VecRestoreArray(bb,&b);
1586: VecRestoreArray(xx,&x);
1587: PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
1588: return(0);
1589: }
1593: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1594: {
1595: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1596: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1597: MatScalar *aa=a->a;
1598: PetscScalar *x,*b;
1602: VecGetArray(bb,&b);
1603: VecGetArray(xx,&x);
1604: PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1605: MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1606: VecRestoreArray(bb,&b);
1607: VecRestoreArray(xx,&x);
1608: PetscLogFlops(2.0*bs2*(a->nz - mbs));
1609: return(0);
1610: }
1614: PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1615: {
1616: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1617: IS isrow=a->row;
1618: PetscErrorCode ierr;
1619: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1620: const MatScalar *aa=a->a,*v;
1621: const PetscScalar *b;
1622: PetscScalar *x,xk,*t;
1623: PetscInt nz,k,j;
1626: VecGetArrayRead(bb,&b);
1627: VecGetArray(xx,&x);
1628: t = a->solve_work;
1629: ISGetIndices(isrow,&rp);
1630:
1631: /* solve U^T*D*y = perm(b) by forward substitution */
1632: for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1633: for (k=0; k<mbs; k++){
1634: v = aa + ai[k];
1635: vj = aj + ai[k];
1636: xk = t[k];
1637: nz = ai[k+1] - ai[k] - 1;
1638: for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk;
1639: t[k] = xk*v[nz]; /* v[nz] = 1/D(k) */
1640: }
1642: /* solve U*perm(x) = y by back substitution */
1643: for (k=mbs-1; k>=0; k--){
1644: v = aa + adiag[k] - 1;
1645: vj = aj + adiag[k] - 1;
1646: nz = ai[k+1] - ai[k] - 1;
1647: for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]];
1648: x[rp[k]] = t[k];
1649: }
1651: ISRestoreIndices(isrow,&rp);
1652: VecRestoreArrayRead(bb,&b);
1653: VecRestoreArray(xx,&x);
1654: PetscLogFlops(4.0*a->nz - 3.0*mbs);
1655: return(0);
1656: }
1660: PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1661: {
1662: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1663: IS isrow=a->row;
1664: PetscErrorCode ierr;
1665: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1666: const MatScalar *aa=a->a,*v;
1667: PetscScalar *x,*b,xk,*t;
1668: PetscInt nz,k;
1671: VecGetArray(bb,&b);
1672: VecGetArray(xx,&x);
1673: t = a->solve_work;
1674: ISGetIndices(isrow,&rp);
1675:
1676: /* solve U^T*D*y = perm(b) by forward substitution */
1677: for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1678: for (k=0; k<mbs; k++){
1679: v = aa + ai[k] + 1;
1680: vj = aj + ai[k] + 1;
1681: xk = t[k];
1682: nz = ai[k+1] - ai[k] - 1;
1683: while (nz--) t[*vj++] += (*v++) * xk;
1684: t[k] = xk*aa[ai[k]]; /* aa[k] = 1/D(k) */
1685: }
1687: /* solve U*perm(x) = y by back substitution */
1688: for (k=mbs-1; k>=0; k--){
1689: v = aa + ai[k] + 1;
1690: vj = aj + ai[k] + 1;
1691: nz = ai[k+1] - ai[k] - 1;
1692: while (nz--) t[k] += (*v++) * t[*vj++];
1693: x[rp[k]] = t[k];
1694: }
1696: ISRestoreIndices(isrow,&rp);
1697: VecRestoreArray(bb,&b);
1698: VecRestoreArray(xx,&x);
1699: PetscLogFlops(4.0*a->nz - 3*mbs);
1700: return(0);
1701: }
1705: PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1706: {
1707: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1708: IS isrow=a->row;
1709: PetscErrorCode ierr;
1710: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1711: const MatScalar *aa=a->a,*v;
1712: PetscReal diagk;
1713: PetscScalar *x,*b,xk;
1714: PetscInt nz,k;
1717: /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1718: VecGetArray(bb,&b);
1719: VecGetArray(xx,&x);
1720: ISGetIndices(isrow,&rp);
1721:
1722: for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1723: for (k=0; k<mbs; k++){
1724: v = aa + ai[k];
1725: vj = aj + ai[k];
1726: xk = x[k];
1727: nz = ai[k+1] - ai[k] - 1;
1728: while (nz--) x[*vj++] += (*v++) * xk;
1730: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1731: if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1732: x[k] = xk*PetscSqrtReal(diagk);
1733: }
1734: ISRestoreIndices(isrow,&rp);
1735: VecRestoreArray(bb,&b);
1736: VecRestoreArray(xx,&x);
1737: PetscLogFlops(2.0*a->nz - mbs);
1738: return(0);
1739: }
1743: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1744: {
1745: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1746: IS isrow=a->row;
1747: PetscErrorCode ierr;
1748: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1749: const MatScalar *aa=a->a,*v;
1750: PetscReal diagk;
1751: PetscScalar *x,*b,xk;
1752: PetscInt nz,k;
1755: /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1756: VecGetArray(bb,&b);
1757: VecGetArray(xx,&x);
1758: ISGetIndices(isrow,&rp);
1759:
1760: for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1761: for (k=0; k<mbs; k++){
1762: v = aa + ai[k] + 1;
1763: vj = aj + ai[k] + 1;
1764: xk = x[k];
1765: nz = ai[k+1] - ai[k] - 1;
1766: while (nz--) x[*vj++] += (*v++) * xk;
1768: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1769: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1770: x[k] = xk*PetscSqrtReal(diagk);
1771: }
1772: ISRestoreIndices(isrow,&rp);
1773: VecRestoreArray(bb,&b);
1774: VecRestoreArray(xx,&x);
1775: PetscLogFlops(2.0*a->nz - mbs);
1776: return(0);
1777: }
1781: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1782: {
1783: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1784: IS isrow=a->row;
1785: PetscErrorCode ierr;
1786: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1787: const MatScalar *aa=a->a,*v;
1788: PetscReal diagk;
1789: PetscScalar *x,*b,*t;
1790: PetscInt nz,k;
1793: /* solve D^(1/2)*U*perm(x) = b by back substitution */
1794: VecGetArray(bb,&b);
1795: VecGetArray(xx,&x);
1796: t = a->solve_work;
1797: ISGetIndices(isrow,&rp);
1799: for (k=mbs-1; k>=0; k--){
1800: v = aa + ai[k];
1801: vj = aj + ai[k];
1802: diagk = PetscRealPart(aa[adiag[k]]);
1803: if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1804: t[k] = b[k] * PetscSqrtReal(diagk);
1805: nz = ai[k+1] - ai[k] - 1;
1806: while (nz--) t[k] += (*v++) * t[*vj++];
1807: x[rp[k]] = t[k];
1808: }
1809: ISRestoreIndices(isrow,&rp);
1810: VecRestoreArray(bb,&b);
1811: VecRestoreArray(xx,&x);
1812: PetscLogFlops(2.0*a->nz - mbs);
1813: return(0);
1814: }
1818: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1819: {
1820: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1821: IS isrow=a->row;
1822: PetscErrorCode ierr;
1823: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1824: const MatScalar *aa=a->a,*v;
1825: PetscReal diagk;
1826: PetscScalar *x,*b,*t;
1827: PetscInt nz,k;
1830: /* solve D^(1/2)*U*perm(x) = b by back substitution */
1831: VecGetArray(bb,&b);
1832: VecGetArray(xx,&x);
1833: t = a->solve_work;
1834: ISGetIndices(isrow,&rp);
1836: for (k=mbs-1; k>=0; k--){
1837: v = aa + ai[k] + 1;
1838: vj = aj + ai[k] + 1;
1839: diagk = PetscRealPart(aa[ai[k]]);
1840: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1841: t[k] = b[k] * PetscSqrtReal(diagk);
1842: nz = ai[k+1] - ai[k] - 1;
1843: while (nz--) t[k] += (*v++) * t[*vj++];
1844: x[rp[k]] = t[k];
1845: }
1846: ISRestoreIndices(isrow,&rp);
1847: VecRestoreArray(bb,&b);
1848: VecRestoreArray(xx,&x);
1849: PetscLogFlops(2.0*a->nz - mbs);
1850: return(0);
1851: }
1855: PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1856: {
1857: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1861: if (A->rmap->bs == 1) {
1862: MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);
1863: } else {
1864: IS isrow=a->row;
1865: const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1866: const MatScalar *aa=a->a,*v;
1867: PetscScalar *x,*b,*t;
1868: PetscInt nz,k,n,i,j;
1869: if (bb->n > a->solves_work_n) {
1870: PetscFree(a->solves_work);
1871: PetscMalloc(bb->n*A->rmap->N*sizeof(PetscScalar),&a->solves_work);
1872: a->solves_work_n = bb->n;
1873: }
1874: n = bb->n;
1875: VecGetArray(bb->v,&b);
1876: VecGetArray(xx->v,&x);
1877: t = a->solves_work;
1879: ISGetIndices(isrow,&rp);
1880:
1881: /* solve U^T*D*y = perm(b) by forward substitution */
1882: for (k=0; k<mbs; k++) {for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];} /* values are stored interlaced in t */
1883: for (k=0; k<mbs; k++){
1884: v = aa + ai[k];
1885: vj = aj + ai[k];
1886: nz = ai[k+1] - ai[k] - 1;
1887: for (j=0; j<nz; j++){
1888: for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1889: v++;vj++;
1890: }
1891: for (i=0; i<n; i++) t[n*k+i] *= aa[nz]; /* note: aa[nz] = 1/D(k) */
1892: }
1893:
1894: /* solve U*perm(x) = y by back substitution */
1895: for (k=mbs-1; k>=0; k--){
1896: v = aa + ai[k] - 1;
1897: vj = aj + ai[k] - 1;
1898: nz = ai[k+1] - ai[k] - 1;
1899: for (j=0; j<nz; j++){
1900: for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1901: v++;vj++;
1902: }
1903: for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1904: }
1906: ISRestoreIndices(isrow,&rp);
1907: VecRestoreArray(bb->v,&b);
1908: VecRestoreArray(xx->v,&x);
1909: PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1910: }
1911: return(0);
1912: }
1916: PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx)
1917: {
1918: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1922: if (A->rmap->bs == 1) {
1923: MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);
1924: } else {
1925: IS isrow=a->row;
1926: const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1927: const MatScalar *aa=a->a,*v;
1928: PetscScalar *x,*b,*t;
1929: PetscInt nz,k,n,i;
1930: if (bb->n > a->solves_work_n) {
1931: PetscFree(a->solves_work);
1932: PetscMalloc(bb->n*A->rmap->N*sizeof(PetscScalar),&a->solves_work);
1933: a->solves_work_n = bb->n;
1934: }
1935: n = bb->n;
1936: VecGetArray(bb->v,&b);
1937: VecGetArray(xx->v,&x);
1938: t = a->solves_work;
1940: ISGetIndices(isrow,&rp);
1941:
1942: /* solve U^T*D*y = perm(b) by forward substitution */
1943: for (k=0; k<mbs; k++) {for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];} /* values are stored interlaced in t */
1944: for (k=0; k<mbs; k++){
1945: v = aa + ai[k];
1946: vj = aj + ai[k];
1947: nz = ai[k+1] - ai[k];
1948: while (nz--) {
1949: for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1950: v++;vj++;
1951: }
1952: for (i=0; i<n; i++) t[n*k+i] *= aa[k]; /* note: aa[k] = 1/D(k) */
1953: }
1954:
1955: /* solve U*perm(x) = y by back substitution */
1956: for (k=mbs-1; k>=0; k--){
1957: v = aa + ai[k];
1958: vj = aj + ai[k];
1959: nz = ai[k+1] - ai[k];
1960: while (nz--) {
1961: for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1962: v++;vj++;
1963: }
1964: for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1965: }
1967: ISRestoreIndices(isrow,&rp);
1968: VecRestoreArray(bb->v,&b);
1969: VecRestoreArray(xx->v,&x);
1970: PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1971: }
1972: return(0);
1973: }
1977: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1978: {
1979: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1980: PetscErrorCode ierr;
1981: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag = a->diag;
1982: const MatScalar *aa=a->a,*v;
1983: const PetscScalar *b;
1984: PetscScalar *x,xi;
1985: PetscInt nz,i,j;
1988: VecGetArrayRead(bb,&b);
1989: VecGetArray(xx,&x);
1990:
1991: /* solve U^T*D*y = b by forward substitution */
1992: PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
1993: for (i=0; i<mbs; i++){
1994: v = aa + ai[i];
1995: vj = aj + ai[i];
1996: xi = x[i];
1997: nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
1998: for (j=0; j<nz; j++){
1999: x[vj[j]] += v[j]* xi;
2000: }
2001: x[i] = xi*v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
2002: }
2004: /* solve U*x = y by backward substitution */
2005: for (i=mbs-2; i>=0; i--){
2006: xi = x[i];
2007: v = aa + adiag[i] - 1; /* end of row i, excluding diag */
2008: vj = aj + adiag[i] - 1;
2009: nz = ai[i+1] - ai[i] - 1;
2010: for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
2011: x[i] = xi;
2012: }
2013:
2014: VecRestoreArrayRead(bb,&b);
2015: VecRestoreArray(xx,&x);
2016: PetscLogFlops(4.0*a->nz - 3*mbs);
2017: return(0);
2018: }
2022: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2023: {
2024: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2026: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
2027: MatScalar *aa=a->a,*v;
2028: PetscScalar *x,*b,xk;
2029: PetscInt nz,*vj,k;
2032: VecGetArray(bb,&b);
2033: VecGetArray(xx,&x);
2034:
2035: /* solve U^T*D*y = b by forward substitution */
2036: PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
2037: for (k=0; k<mbs; k++){
2038: v = aa + ai[k] + 1;
2039: vj = aj + ai[k] + 1;
2040: xk = x[k];
2041: nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */
2042: while (nz--) x[*vj++] += (*v++) * xk;
2043: x[k] = xk*aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */
2044: }
2046: /* solve U*x = y by back substitution */
2047: for (k=mbs-2; k>=0; k--){
2048: v = aa + ai[k] + 1;
2049: vj = aj + ai[k] + 1;
2050: xk = x[k];
2051: nz = ai[k+1] - ai[k] - 1;
2052: while (nz--) {
2053: xk += (*v++) * x[*vj++];
2054: }
2055: x[k] = xk;
2056: }
2058: VecRestoreArray(bb,&b);
2059: VecRestoreArray(xx,&x);
2060: PetscLogFlops(4.0*a->nz - 3*mbs);
2061: return(0);
2062: }
2066: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2067: {
2068: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2069: PetscErrorCode ierr;
2070: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag;
2071: const MatScalar *aa=a->a,*v;
2072: PetscReal diagk;
2073: PetscScalar *x,*b;
2074: PetscInt nz,*vj,k;
2077: /* solve U^T*D^(1/2)*x = b by forward substitution */
2078: VecGetArray(bb,&b);
2079: VecGetArray(xx,&x);
2080: PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
2081: for (k=0; k<mbs; k++){
2082: v = aa + ai[k];
2083: vj = aj + ai[k];
2084: nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */
2085: while (nz--) x[*vj++] += (*v++) * x[k];
2086: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2087: if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[adiag[k]]),PetscImaginaryPart(aa[adiag[k]]));
2088: x[k] *= PetscSqrtReal(diagk);
2089: }
2090: VecRestoreArray(bb,&b);
2091: VecRestoreArray(xx,&x);
2092: PetscLogFlops(2.0*a->nz - mbs);
2093: return(0);
2094: }
2098: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2099: {
2100: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2102: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
2103: MatScalar *aa=a->a,*v;
2104: PetscReal diagk;
2105: PetscScalar *x,*b;
2106: PetscInt nz,*vj,k;
2109: /* solve U^T*D^(1/2)*x = b by forward substitution */
2110: VecGetArray(bb,&b);
2111: VecGetArray(xx,&x);
2112: PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
2113: for (k=0; k<mbs; k++){
2114: v = aa + ai[k] + 1;
2115: vj = aj + ai[k] + 1;
2116: nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */
2117: while (nz--) x[*vj++] += (*v++) * x[k];
2118: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2119: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]]));
2120: x[k] *= PetscSqrtReal(diagk);
2121: }
2122: VecRestoreArray(bb,&b);
2123: VecRestoreArray(xx,&x);
2124: PetscLogFlops(2.0*a->nz - mbs);
2125: return(0);
2126: }
2130: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2131: {
2132: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2134: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag;
2135: MatScalar *aa=a->a,*v;
2136: PetscReal diagk;
2137: PetscScalar *x,*b;
2138: PetscInt nz,*vj,k;
2141: /* solve D^(1/2)*U*x = b by back substitution */
2142: VecGetArray(bb,&b);
2143: VecGetArray(xx,&x);
2145: for (k=mbs-1; k>=0; k--){
2146: v = aa + ai[k];
2147: vj = aj + ai[k];
2148: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2149: if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2150: x[k] = PetscSqrtReal(diagk)*b[k];
2151: nz = ai[k+1] - ai[k] - 1;
2152: while (nz--) x[k] += (*v++) * x[*vj++];
2153: }
2154: VecRestoreArray(bb,&b);
2155: VecRestoreArray(xx,&x);
2156: PetscLogFlops(2.0*a->nz - mbs);
2157: return(0);
2158: }
2162: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2163: {
2164: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2166: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
2167: MatScalar *aa=a->a,*v;
2168: PetscReal diagk;
2169: PetscScalar *x,*b;
2170: PetscInt nz,*vj,k;
2173: /* solve D^(1/2)*U*x = b by back substitution */
2174: VecGetArray(bb,&b);
2175: VecGetArray(xx,&x);
2177: for (k=mbs-1; k>=0; k--){
2178: v = aa + ai[k] + 1;
2179: vj = aj + ai[k] + 1;
2180: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2181: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2182: x[k] = PetscSqrtReal(diagk)*b[k];
2183: nz = ai[k+1] - ai[k] - 1;
2184: while (nz--) x[k] += (*v++) * x[*vj++];
2185: }
2186: VecRestoreArray(bb,&b);
2187: VecRestoreArray(xx,&x);
2188: PetscLogFlops(2.0*a->nz - mbs);
2189: return(0);
2190: }
2192: /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2195: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info)
2196: {
2197: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
2199: const PetscInt *rip,mbs = a->mbs,*ai = a->i,*aj = a->j;
2200: PetscInt *jutmp,bs = A->rmap->bs,bs2=a->bs2,i;
2201: PetscInt m,reallocs = 0,*levtmp;
2202: PetscInt *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
2203: PetscInt incrlev,*lev,shift,prow,nz;
2204: PetscReal f = info->fill,levels = info->levels;
2205: PetscBool perm_identity;
2208: /* check whether perm is the identity mapping */
2209: ISIdentity(perm,&perm_identity);
2211: if (perm_identity){
2212: a->permute = PETSC_FALSE;
2213: ai = a->i; aj = a->j;
2214: } else { /* non-trivial permutation */
2215: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2216: a->permute = PETSC_TRUE;
2217: MatReorderingSeqSBAIJ(A, perm);
2218: ai = a->inew; aj = a->jnew;
2219: }
2220:
2221: /* initialization */
2222: ISGetIndices(perm,&rip);
2223: umax = (PetscInt)(f*ai[mbs] + 1);
2224: PetscMalloc(umax*sizeof(PetscInt),&lev);
2225: umax += mbs + 1;
2226: shift = mbs + 1;
2227: PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);
2228: PetscMalloc(umax*sizeof(PetscInt),&ju);
2229: iu[0] = mbs + 1;
2230: juidx = mbs + 1;
2231: /* prowl: linked list for pivot row */
2232: PetscMalloc3(mbs,PetscInt,&prowl,mbs,PetscInt,&q,mbs,PetscInt,&levtmp);
2233: /* q: linked list for col index */
2234:
2235: for (i=0; i<mbs; i++){
2236: prowl[i] = mbs;
2237: q[i] = 0;
2238: }
2240: /* for each row k */
2241: for (k=0; k<mbs; k++){
2242: nzk = 0;
2243: q[k] = mbs;
2244: /* copy current row into linked list */
2245: nz = ai[rip[k]+1] - ai[rip[k]];
2246: j = ai[rip[k]];
2247: while (nz--){
2248: vj = rip[aj[j++]];
2249: if (vj > k){
2250: qm = k;
2251: do {
2252: m = qm; qm = q[m];
2253: } while(qm < vj);
2254: if (qm == vj) {
2255: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n");
2256: }
2257: nzk++;
2258: q[m] = vj;
2259: q[vj] = qm;
2260: levtmp[vj] = 0; /* initialize lev for nonzero element */
2261: }
2262: }
2264: /* modify nonzero structure of k-th row by computing fill-in
2265: for each row prow to be merged in */
2266: prow = k;
2267: prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
2268:
2269: while (prow < k){
2270: /* merge row prow into k-th row */
2271: jmin = iu[prow] + 1;
2272: jmax = iu[prow+1];
2273: qm = k;
2274: for (j=jmin; j<jmax; j++){
2275: incrlev = lev[j-shift] + 1;
2276: if (incrlev > levels) continue;
2277: vj = ju[j];
2278: do {
2279: m = qm; qm = q[m];
2280: } while (qm < vj);
2281: if (qm != vj){ /* a fill */
2282: nzk++; q[m] = vj; q[vj] = qm; qm = vj;
2283: levtmp[vj] = incrlev;
2284: } else {
2285: if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2286: }
2287: }
2288: prow = prowl[prow]; /* next pivot row */
2289: }
2290:
2291: /* add k to row list for first nonzero element in k-th row */
2292: if (nzk > 1){
2293: i = q[k]; /* col value of first nonzero element in k_th row of U */
2294: prowl[k] = prowl[i]; prowl[i] = k;
2295: }
2296: iu[k+1] = iu[k] + nzk;
2298: /* allocate more space to ju and lev if needed */
2299: if (iu[k+1] > umax) {
2300: /* estimate how much additional space we will need */
2301: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2302: /* just double the memory each time */
2303: maxadd = umax;
2304: if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
2305: umax += maxadd;
2307: /* allocate a longer ju */
2308: PetscMalloc(umax*sizeof(PetscInt),&jutmp);
2309: PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));
2310: PetscFree(ju);
2311: ju = jutmp;
2313: PetscMalloc(umax*sizeof(PetscInt),&jutmp);
2314: PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(PetscInt));
2315: PetscFree(lev);
2316: lev = jutmp;
2317: reallocs += 2; /* count how many times we realloc */
2318: }
2320: /* save nonzero structure of k-th row in ju */
2321: i=k;
2322: while (nzk --) {
2323: i = q[i];
2324: ju[juidx] = i;
2325: lev[juidx-shift] = levtmp[i];
2326: juidx++;
2327: }
2328: }
2329:
2330: #if defined(PETSC_USE_INFO)
2331: if (ai[mbs] != 0) {
2332: PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2333: PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);
2334: PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
2335: PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);
2336: PetscInfo(A,"for best performance.\n");
2337: } else {
2338: PetscInfo(A,"Empty matrix.\n");
2339: }
2340: #endif
2342: ISRestoreIndices(perm,&rip);
2343: PetscFree3(prowl,q,levtmp);
2344: PetscFree(lev);
2346: /* put together the new matrix */
2347: MatSeqSBAIJSetPreallocation_SeqSBAIJ(B,bs,0,PETSC_NULL);
2349: /* PetscLogObjectParent(B,iperm); */
2350: b = (Mat_SeqSBAIJ*)(B)->data;
2351: PetscFree2(b->imax,b->ilen);
2352: b->singlemalloc = PETSC_FALSE;
2353: b->free_a = PETSC_TRUE;
2354: b->free_ij = PETSC_TRUE;
2355: /* the next line frees the default space generated by the Create() */
2356: PetscFree3(b->a,b->j,b->i);
2357: PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);
2358: b->j = ju;
2359: b->i = iu;
2360: b->diag = 0;
2361: b->ilen = 0;
2362: b->imax = 0;
2363:
2364: ISDestroy(&b->row);
2365: ISDestroy(&b->icol);
2366: b->row = perm;
2367: b->icol = perm;
2368: PetscObjectReference((PetscObject)perm);
2369: PetscObjectReference((PetscObject)perm);
2370: PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);
2371: /* In b structure: Free imax, ilen, old a, old j.
2372: Allocate idnew, solve_work, new a, new j */
2373: PetscLogObjectMemory(B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
2374: b->maxnz = b->nz = iu[mbs];
2375:
2376: (B)->info.factor_mallocs = reallocs;
2377: (B)->info.fill_ratio_given = f;
2378: if (ai[mbs] != 0) {
2379: (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2380: } else {
2381: (B)->info.fill_ratio_needed = 0.0;
2382: }
2383: MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);
2384: return(0);
2385: }
2387: /*
2388: See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2389: */
2390: #include <petscbt.h>
2391: #include <../src/mat/utils/freespace.h>
2394: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2395: {
2396: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
2397: PetscErrorCode ierr;
2398: PetscBool perm_identity,free_ij = PETSC_TRUE,missing;
2399: PetscInt bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j;
2400: const PetscInt *rip;
2401: PetscInt reallocs=0,i,*ui,*udiag,*cols;
2402: PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2403: PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr;
2404: PetscReal fill=info->fill,levels=info->levels;
2405: PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
2406: PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
2407: PetscBT lnkbt;
2410: if (bs > 1){
2411: MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);
2412: return(0);
2413: }
2414: if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
2415: MatMissingDiagonal(A,&missing,&d);
2416: if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
2418: /* check whether perm is the identity mapping */
2419: ISIdentity(perm,&perm_identity);
2420: if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2421: a->permute = PETSC_FALSE;
2423: PetscMalloc((am+1)*sizeof(PetscInt),&ui);
2424: PetscMalloc((am+1)*sizeof(PetscInt),&udiag);
2425: ui[0] = 0;
2426:
2427: /* ICC(0) without matrix ordering: simply rearrange column indices */
2428: if (!levels){
2429: /* reuse the column pointers and row offsets for memory savings */
2430: for (i=0; i<am; i++) {
2431: ncols = ai[i+1] - ai[i];
2432: ui[i+1] = ui[i] + ncols;
2433: udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */
2434: }
2435: PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);
2436: cols = uj;
2437: for (i=0; i<am; i++) {
2438: aj = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2439: ncols = ai[i+1] - ai[i] -1;
2440: for (j=0; j<ncols; j++) *cols++ = aj[j];
2441: *cols++ = i; /* diagoanl is located as the last entry of U(i,:) */
2442: }
2443: } else { /* case: levels>0 */
2444: ISGetIndices(perm,&rip);
2446: /* initialization */
2447: /* jl: linked list for storing indices of the pivot rows
2448: il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2449: PetscMalloc4(am,PetscInt*,&uj_ptr,am,PetscInt*,&uj_lvl_ptr,am,PetscInt,&il,am,PetscInt,&jl);
2450: for (i=0; i<am; i++){
2451: jl[i] = am; il[i] = 0;
2452: }
2454: /* create and initialize a linked list for storing column indices of the active row k */
2455: nlnk = am + 1;
2456: PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);
2458: /* initial FreeSpace size is fill*(ai[am]+1) */
2459: PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);
2460: current_space = free_space;
2461: PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);
2462: current_space_lvl = free_space_lvl;
2464: for (k=0; k<am; k++){ /* for each active row k */
2465: /* initialize lnk by the column indices of row k */
2466: nzk = 0;
2467: ncols = ai[k+1] - ai[k];
2468: if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k);
2469: cols = aj+ai[k];
2470: PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
2471: nzk += nlnk;
2473: /* update lnk by computing fill-in for each pivot row to be merged in */
2474: prow = jl[k]; /* 1st pivot row */
2475:
2476: while (prow < k){
2477: nextprow = jl[prow];
2478:
2479: /* merge prow into k-th row */
2480: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2481: jmax = ui[prow+1];
2482: ncols = jmax-jmin;
2483: i = jmin - ui[prow];
2485: cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2486: uj = uj_lvl_ptr[prow] + i; /* levels of cols */
2487: j = *(uj - 1);
2488: PetscICCLLAddSorted(ncols,cols,levels,uj,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2489: nzk += nlnk;
2491: /* update il and jl for prow */
2492: if (jmin < jmax){
2493: il[prow] = jmin;
2494: j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2495: }
2496: prow = nextprow;
2497: }
2499: /* if free space is not available, make more free space */
2500: if (current_space->local_remaining<nzk) {
2501: i = am - k + 1; /* num of unfactored rows */
2502: i *= PetscMin(nzk, i-1); /* i*nzk, i*(i-1): estimated and max additional space needed */
2503: PetscFreeSpaceGet(i,¤t_space);
2504: PetscFreeSpaceGet(i,¤t_space_lvl);
2505: reallocs++;
2506: }
2508: /* copy data into free_space and free_space_lvl, then initialize lnk */
2509: if (nzk == 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Empty row %D in ICC matrix factor",k);
2510: PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);
2512: /* add the k-th row into il and jl */
2513: if (nzk > 1){
2514: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2515: jl[k] = jl[i]; jl[i] = k;
2516: il[k] = ui[k] + 1;
2517: }
2518: uj_ptr[k] = current_space->array;
2519: uj_lvl_ptr[k] = current_space_lvl->array;
2521: current_space->array += nzk;
2522: current_space->local_used += nzk;
2523: current_space->local_remaining -= nzk;
2524: current_space_lvl->array += nzk;
2525: current_space_lvl->local_used += nzk;
2526: current_space_lvl->local_remaining -= nzk;
2528: ui[k+1] = ui[k] + nzk;
2529: }
2531: ISRestoreIndices(perm,&rip);
2532: PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);
2534: /* destroy list of free space and other temporary array(s) */
2535: PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);
2536: PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag); /* store matrix factor */
2537: PetscIncompleteLLDestroy(lnk,lnkbt);
2538: PetscFreeSpaceDestroy(free_space_lvl);
2540: } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2542: /* put together the new matrix in MATSEQSBAIJ format */
2543: MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);
2545: b = (Mat_SeqSBAIJ*)(fact)->data;
2546: PetscFree2(b->imax,b->ilen);
2547: b->singlemalloc = PETSC_FALSE;
2548: b->free_a = PETSC_TRUE;
2549: b->free_ij = free_ij;
2550: PetscMalloc((ui[am]+1)*sizeof(MatScalar),&b->a);
2551: b->j = uj;
2552: b->i = ui;
2553: b->diag = udiag;
2554: b->free_diag = PETSC_TRUE;
2555: b->ilen = 0;
2556: b->imax = 0;
2557: b->row = perm;
2558: b->col = perm;
2559: PetscObjectReference((PetscObject)perm);
2560: PetscObjectReference((PetscObject)perm);
2561: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2562: PetscMalloc((am+1)*sizeof(PetscScalar),&b->solve_work);
2563: PetscLogObjectMemory(fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));
2564: b->maxnz = b->nz = ui[am];
2565:
2566: fact->info.factor_mallocs = reallocs;
2567: fact->info.fill_ratio_given = fill;
2568: if (ai[am] != 0) {
2569: fact->info.fill_ratio_needed = ((PetscReal)ui[am])/ai[am];
2570: } else {
2571: fact->info.fill_ratio_needed = 0.0;
2572: }
2573: #if defined(PETSC_USE_INFO)
2574: if (ai[am] != 0) {
2575: PetscReal af = fact->info.fill_ratio_needed;
2576: PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);
2577: PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
2578: PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);
2579: } else {
2580: PetscInfo(A,"Empty matrix.\n");
2581: }
2582: #endif
2583: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2584: return(0);
2585: }
2589: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2590: {
2591: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2592: Mat_SeqSBAIJ *b;
2593: PetscErrorCode ierr;
2594: PetscBool perm_identity,free_ij = PETSC_TRUE,missing;
2595: PetscInt bs=A->rmap->bs,am=a->mbs,d;
2596: const PetscInt *cols,*rip,*ai=a->i,*aj=a->j;
2597: PetscInt reallocs=0,i,*ui;
2598: PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2599: PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
2600: PetscReal fill=info->fill,levels=info->levels,ratio_needed;
2601: PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
2602: PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
2603: PetscBT lnkbt;
2606: MatMissingDiagonal(A,&missing,&d);
2607: if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
2609: /*
2610: This code originally uses Modified Sparse Row (MSR) storage
2611: (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2612: Then it is rewritten so the factor B takes seqsbaij format. However the associated
2613: MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2614: thus the original code in MSR format is still used for these cases.
2615: The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2616: MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2617: */
2618: if (bs > 1){
2619: MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);
2620: return(0);
2621: }
2623: /* check whether perm is the identity mapping */
2624: ISIdentity(perm,&perm_identity);
2625: if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2626: a->permute = PETSC_FALSE;
2628: /* special case that simply copies fill pattern */
2629: if (!levels ) {
2630: /* reuse the column pointers and row offsets for memory savings */
2631: ui = a->i;
2632: uj = a->j;
2633: free_ij = PETSC_FALSE;
2634: ratio_needed = 1.0;
2635: } else { /* case: levels>0 */
2636: ISGetIndices(perm,&rip);
2638: /* initialization */
2639: PetscMalloc((am+1)*sizeof(PetscInt),&ui);
2640: ui[0] = 0;
2642: /* jl: linked list for storing indices of the pivot rows
2643: il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2644: PetscMalloc4(am,PetscInt*,&uj_ptr,am,PetscInt*,&uj_lvl_ptr,am,PetscInt,&il,am,PetscInt,&jl);
2645: for (i=0; i<am; i++){
2646: jl[i] = am; il[i] = 0;
2647: }
2649: /* create and initialize a linked list for storing column indices of the active row k */
2650: nlnk = am + 1;
2651: PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);
2653: /* initial FreeSpace size is fill*(ai[am]+1) */
2654: PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);
2655: current_space = free_space;
2656: PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);
2657: current_space_lvl = free_space_lvl;
2659: for (k=0; k<am; k++){ /* for each active row k */
2660: /* initialize lnk by the column indices of row rip[k] */
2661: nzk = 0;
2662: ncols = ai[rip[k]+1] - ai[rip[k]];
2663: cols = aj+ai[rip[k]];
2664: PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
2665: nzk += nlnk;
2667: /* update lnk by computing fill-in for each pivot row to be merged in */
2668: prow = jl[k]; /* 1st pivot row */
2669:
2670: while (prow < k){
2671: nextprow = jl[prow];
2672:
2673: /* merge prow into k-th row */
2674: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2675: jmax = ui[prow+1];
2676: ncols = jmax-jmin;
2677: i = jmin - ui[prow];
2678: cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2679: j = *(uj_lvl_ptr[prow] + i - 1);
2680: cols_lvl = uj_lvl_ptr[prow]+i;
2681: PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2682: nzk += nlnk;
2684: /* update il and jl for prow */
2685: if (jmin < jmax){
2686: il[prow] = jmin;
2687: j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2688: }
2689: prow = nextprow;
2690: }
2692: /* if free space is not available, make more free space */
2693: if (current_space->local_remaining<nzk) {
2694: i = am - k + 1; /* num of unfactored rows */
2695: i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2696: PetscFreeSpaceGet(i,¤t_space);
2697: PetscFreeSpaceGet(i,¤t_space_lvl);
2698: reallocs++;
2699: }
2701: /* copy data into free_space and free_space_lvl, then initialize lnk */
2702: PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);
2704: /* add the k-th row into il and jl */
2705: if (nzk-1 > 0){
2706: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2707: jl[k] = jl[i]; jl[i] = k;
2708: il[k] = ui[k] + 1;
2709: }
2710: uj_ptr[k] = current_space->array;
2711: uj_lvl_ptr[k] = current_space_lvl->array;
2713: current_space->array += nzk;
2714: current_space->local_used += nzk;
2715: current_space->local_remaining -= nzk;
2716: current_space_lvl->array += nzk;
2717: current_space_lvl->local_used += nzk;
2718: current_space_lvl->local_remaining -= nzk;
2720: ui[k+1] = ui[k] + nzk;
2721: }
2723: ISRestoreIndices(perm,&rip);
2724: PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);
2726: /* destroy list of free space and other temporary array(s) */
2727: PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);
2728: PetscFreeSpaceContiguous(&free_space,uj);
2729: PetscIncompleteLLDestroy(lnk,lnkbt);
2730: PetscFreeSpaceDestroy(free_space_lvl);
2731: if (ai[am] != 0) {
2732: ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2733: } else {
2734: ratio_needed = 0.0;
2735: }
2736: } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2738: /* put together the new matrix in MATSEQSBAIJ format */
2739: MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);
2741: b = (Mat_SeqSBAIJ*)(fact)->data;
2742: PetscFree2(b->imax,b->ilen);
2743: b->singlemalloc = PETSC_FALSE;
2744: b->free_a = PETSC_TRUE;
2745: b->free_ij = free_ij;
2746: PetscMalloc((ui[am]+1)*sizeof(MatScalar),&b->a);
2747: b->j = uj;
2748: b->i = ui;
2749: b->diag = 0;
2750: b->ilen = 0;
2751: b->imax = 0;
2752: b->row = perm;
2753: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2754: PetscObjectReference((PetscObject)perm);
2755: b->icol = perm;
2756: PetscObjectReference((PetscObject)perm);
2757: PetscMalloc((am+1)*sizeof(PetscScalar),&b->solve_work);
2758: b->maxnz = b->nz = ui[am];
2759:
2760: fact->info.factor_mallocs = reallocs;
2761: fact->info.fill_ratio_given = fill;
2762: fact->info.fill_ratio_needed = ratio_needed;
2763: #if defined(PETSC_USE_INFO)
2764: if (ai[am] != 0) {
2765: PetscReal af = fact->info.fill_ratio_needed;
2766: PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);
2767: PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
2768: PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);
2769: } else {
2770: PetscInfo(A,"Empty matrix.\n");
2771: }
2772: #endif
2773: if (perm_identity){
2774: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2775: } else {
2776: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2777: }
2778: return(0);
2779: }