Actual source code: sbaijfact2.c
petsc-3.9.4 2018-09-11
2: /*
3: Factorization code for SBAIJ format.
4: */
6: #include <../src/mat/impls/sbaij/seq/sbaij.h>
7: #include <../src/mat/impls/baij/seq/baij.h>
8: #include <petsc/private/kernels/blockinvert.h>
10: PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
11: {
12: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
13: IS isrow=a->row;
14: PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j;
15: PetscErrorCode ierr;
16: const PetscInt *r;
17: PetscInt nz,*vj,k,idx,k1;
18: PetscInt bs =A->rmap->bs,bs2 = a->bs2;
19: const MatScalar *aa=a->a,*v,*diag;
20: PetscScalar *x,*xk,*xj,*xk_tmp,*t;
21: const PetscScalar *b;
24: VecGetArrayRead(bb,&b);
25: VecGetArray(xx,&x);
26: t = a->solve_work;
27: ISGetIndices(isrow,&r);
28: PetscMalloc1(bs,&xk_tmp);
30: /* solve U^T * D * y = b by forward substitution */
31: xk = t;
32: for (k=0; k<mbs; k++) { /* t <- perm(b) */
33: idx = bs*r[k];
34: for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
35: }
36: for (k=0; k<mbs; k++) {
37: v = aa + bs2*ai[k];
38: xk = t + k*bs; /* Dk*xk = k-th block of x */
39: PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
40: nz = ai[k+1] - ai[k];
41: vj = aj + ai[k];
42: xj = t + (*vj)*bs; /* *vj-th block of x, *vj>k */
43: while (nz--) {
44: /* x(:) += U(k,:)^T*(Dk*xk) */
45: PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
46: vj++; xj = t + (*vj)*bs;
47: v += bs2;
48: }
49: /* xk = inv(Dk)*(Dk*xk) */
50: diag = aa+k*bs2; /* ptr to inv(Dk) */
51: PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
52: }
54: /* solve U*x = y by back substitution */
55: for (k=mbs-1; k>=0; k--) {
56: v = aa + bs2*ai[k];
57: xk = t + k*bs; /* xk */
58: nz = ai[k+1] - ai[k];
59: vj = aj + ai[k];
60: xj = t + (*vj)*bs;
61: while (nz--) {
62: /* xk += U(k,:)*x(:) */
63: PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
64: vj++;
65: v += bs2; xj = t + (*vj)*bs;
66: }
67: idx = bs*r[k];
68: for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
69: }
71: PetscFree(xk_tmp);
72: ISRestoreIndices(isrow,&r);
73: VecRestoreArrayRead(bb,&b);
74: VecRestoreArray(xx,&x);
75: PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);
76: return(0);
77: }
79: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
80: {
82: SETERRQ(PETSC_COMM_SELF,1,"not implemented yet");
83: return(0);
84: }
86: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
87: {
89: SETERRQ(PETSC_COMM_SELF,1,"not implemented yet");
90: return(0);
91: }
93: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
94: {
95: PetscErrorCode ierr;
96: PetscInt nz,k;
97: const PetscInt *vj,bs2 = bs*bs;
98: const MatScalar *v,*diag;
99: PetscScalar *xk,*xj,*xk_tmp;
102: PetscMalloc1(bs,&xk_tmp);
103: for (k=0; k<mbs; k++) {
104: v = aa + bs2*ai[k];
105: xk = x + k*bs; /* Dk*xk = k-th block of x */
106: PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
107: nz = ai[k+1] - ai[k];
108: vj = aj + ai[k];
109: xj = x + (*vj)*bs; /* *vj-th block of x, *vj>k */
110: while (nz--) {
111: /* x(:) += U(k,:)^T*(Dk*xk) */
112: PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
113: vj++; xj = x + (*vj)*bs;
114: v += bs2;
115: }
116: /* xk = inv(Dk)*(Dk*xk) */
117: diag = aa+k*bs2; /* ptr to inv(Dk) */
118: PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
119: }
120: PetscFree(xk_tmp);
121: return(0);
122: }
124: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
125: {
126: PetscInt nz,k;
127: const PetscInt *vj,bs2 = bs*bs;
128: const MatScalar *v;
129: PetscScalar *xk,*xj;
132: for (k=mbs-1; k>=0; k--) {
133: v = aa + bs2*ai[k];
134: xk = x + k*bs; /* xk */
135: nz = ai[k+1] - ai[k];
136: vj = aj + ai[k];
137: xj = x + (*vj)*bs;
138: while (nz--) {
139: /* xk += U(k,:)*x(:) */
140: PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
141: vj++;
142: v += bs2; xj = x + (*vj)*bs;
143: }
144: }
145: return(0);
146: }
148: PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
149: {
150: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
151: PetscErrorCode ierr;
152: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
153: PetscInt bs =A->rmap->bs;
154: const MatScalar *aa=a->a;
155: PetscScalar *x;
156: const PetscScalar *b;
159: VecGetArrayRead(bb,&b);
160: VecGetArray(xx,&x);
162: /* solve U^T * D * y = b by forward substitution */
163: PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar)); /* x <- b */
164: MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
166: /* solve U*x = y by back substitution */
167: MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
169: VecRestoreArrayRead(bb,&b);
170: VecRestoreArray(xx,&x);
171: PetscLogFlops(4.0*a->bs2*a->nz - (bs+2.0*a->bs2)*mbs);
172: return(0);
173: }
175: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
176: {
177: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
178: PetscErrorCode ierr;
179: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
180: PetscInt bs =A->rmap->bs;
181: const MatScalar *aa=a->a;
182: const PetscScalar *b;
183: PetscScalar *x;
186: VecGetArrayRead(bb,&b);
187: VecGetArray(xx,&x);
188: PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar)); /* x <- b */
189: MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
190: VecRestoreArrayRead(bb,&b);
191: VecRestoreArray(xx,&x);
192: PetscLogFlops(2.0*a->bs2*a->nz - bs*mbs);
193: return(0);
194: }
196: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
197: {
198: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
199: PetscErrorCode ierr;
200: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
201: PetscInt bs =A->rmap->bs;
202: const MatScalar *aa=a->a;
203: const PetscScalar *b;
204: PetscScalar *x;
207: VecGetArrayRead(bb,&b);
208: VecGetArray(xx,&x);
209: PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));
210: MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
211: VecRestoreArrayRead(bb,&b);
212: VecRestoreArray(xx,&x);
213: PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
214: return(0);
215: }
217: PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
218: {
219: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
220: IS isrow=a->row;
221: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,*r,*vj;
222: PetscErrorCode ierr;
223: PetscInt nz,k,idx;
224: const MatScalar *aa=a->a,*v,*d;
225: PetscScalar *x,x0,x1,x2,x3,x4,x5,x6,*t,*tp;
226: const PetscScalar *b;
229: VecGetArrayRead(bb,&b);
230: VecGetArray(xx,&x);
231: t = a->solve_work;
232: ISGetIndices(isrow,&r);
234: /* solve U^T * D * y = b by forward substitution */
235: tp = t;
236: for (k=0; k<mbs; k++) { /* t <- perm(b) */
237: idx = 7*r[k];
238: tp[0] = b[idx];
239: tp[1] = b[idx+1];
240: tp[2] = b[idx+2];
241: tp[3] = b[idx+3];
242: tp[4] = b[idx+4];
243: tp[5] = b[idx+5];
244: tp[6] = b[idx+6];
245: tp += 7;
246: }
248: for (k=0; k<mbs; k++) {
249: v = aa + 49*ai[k];
250: vj = aj + ai[k];
251: tp = t + k*7;
252: x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
253: nz = ai[k+1] - ai[k];
254: tp = t + (*vj)*7;
255: while (nz--) {
256: tp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
257: tp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
258: tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
259: tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
260: tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
261: tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
262: tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
263: vj++;
264: tp = t + (*vj)*7;
265: v += 49;
266: }
268: /* xk = inv(Dk)*(Dk*xk) */
269: d = aa+k*49; /* ptr to inv(Dk) */
270: tp = t + k*7;
271: tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
272: tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
273: tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
274: tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
275: tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
276: tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
277: tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
278: }
280: /* solve U*x = y by back substitution */
281: for (k=mbs-1; k>=0; k--) {
282: v = aa + 49*ai[k];
283: vj = aj + ai[k];
284: tp = t + k*7;
285: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6]; /* xk */
286: nz = ai[k+1] - ai[k];
288: tp = t + (*vj)*7;
289: while (nz--) {
290: /* xk += U(k,:)*x(:) */
291: x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
292: x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
293: x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
294: x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
295: x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
296: x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
297: x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
298: vj++;
299: tp = t + (*vj)*7;
300: v += 49;
301: }
302: tp = t + k*7;
303: tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
304: idx = 7*r[k];
305: x[idx] = x0;
306: x[idx+1] = x1;
307: x[idx+2] = x2;
308: x[idx+3] = x3;
309: x[idx+4] = x4;
310: x[idx+5] = x5;
311: x[idx+6] = x6;
312: }
314: ISRestoreIndices(isrow,&r);
315: VecRestoreArrayRead(bb,&b);
316: VecRestoreArray(xx,&x);
317: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
318: return(0);
319: }
321: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
322: {
323: const MatScalar *v,*d;
324: PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6;
325: PetscInt nz,k;
326: const PetscInt *vj;
329: for (k=0; k<mbs; k++) {
330: v = aa + 49*ai[k];
331: xp = x + k*7;
332: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
333: nz = ai[k+1] - ai[k];
334: vj = aj + ai[k];
335: xp = x + (*vj)*7;
336: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
337: PetscPrefetchBlock(v+49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
338: while (nz--) {
339: /* x(:) += U(k,:)^T*(Dk*xk) */
340: xp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
341: xp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
342: xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
343: xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
344: xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
345: xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
346: xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
347: vj++;
348: xp = x + (*vj)*7;
349: v += 49;
350: }
351: /* xk = inv(Dk)*(Dk*xk) */
352: d = aa+k*49; /* ptr to inv(Dk) */
353: xp = x + k*7;
354: xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
355: xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
356: xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
357: xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
358: xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
359: xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
360: xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
361: }
362: return(0);
363: }
365: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
366: {
367: const MatScalar *v;
368: PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6;
369: PetscInt nz,k;
370: const PetscInt *vj;
373: for (k=mbs-1; k>=0; k--) {
374: v = aa + 49*ai[k];
375: xp = x + k*7;
376: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
377: nz = ai[k+1] - ai[k];
378: vj = aj + ai[k];
379: xp = x + (*vj)*7;
380: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
381: PetscPrefetchBlock(v-49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
382: while (nz--) {
383: /* xk += U(k,:)*x(:) */
384: x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
385: x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
386: x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
387: x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
388: x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
389: x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
390: x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
391: vj++;
392: v += 49; xp = x + (*vj)*7;
393: }
394: xp = x + k*7;
395: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
396: }
397: return(0);
398: }
400: PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
401: {
402: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
403: PetscErrorCode ierr;
404: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
405: const MatScalar *aa=a->a;
406: PetscScalar *x;
407: const PetscScalar *b;
410: VecGetArrayRead(bb,&b);
411: VecGetArray(xx,&x);
413: /* solve U^T * D * y = b by forward substitution */
414: PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar)); /* x <- b */
415: MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
417: /* solve U*x = y by back substitution */
418: MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
420: VecRestoreArrayRead(bb,&b);
421: VecRestoreArray(xx,&x);
422: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
423: return(0);
424: }
426: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
427: {
428: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
429: PetscErrorCode ierr;
430: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
431: const MatScalar *aa=a->a;
432: PetscScalar *x;
433: const PetscScalar *b;
436: VecGetArrayRead(bb,&b);
437: VecGetArray(xx,&x);
438: PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));
439: MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
440: VecRestoreArrayRead(bb,&b);
441: VecRestoreArray(xx,&x);
442: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
443: return(0);
444: }
446: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
447: {
448: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
449: PetscErrorCode ierr;
450: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
451: const MatScalar *aa=a->a;
452: PetscScalar *x;
453: const PetscScalar *b;
456: VecGetArrayRead(bb,&b);
457: VecGetArray(xx,&x);
458: PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));
459: MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
460: VecRestoreArrayRead(bb,&b);
461: VecRestoreArray(xx,&x);
462: PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
463: return(0);
464: }
466: PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
467: {
468: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
469: IS isrow=a->row;
470: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,*r,*vj;
471: PetscErrorCode ierr;
472: PetscInt nz,k,idx;
473: const MatScalar *aa=a->a,*v,*d;
474: PetscScalar *x,x0,x1,x2,x3,x4,x5,*t,*tp;
475: const PetscScalar *b;
478: VecGetArrayRead(bb,&b);
479: VecGetArray(xx,&x);
480: t = a->solve_work;
481: ISGetIndices(isrow,&r);
483: /* solve U^T * D * y = b by forward substitution */
484: tp = t;
485: for (k=0; k<mbs; k++) { /* t <- perm(b) */
486: idx = 6*r[k];
487: tp[0] = b[idx];
488: tp[1] = b[idx+1];
489: tp[2] = b[idx+2];
490: tp[3] = b[idx+3];
491: tp[4] = b[idx+4];
492: tp[5] = b[idx+5];
493: tp += 6;
494: }
496: for (k=0; k<mbs; k++) {
497: v = aa + 36*ai[k];
498: vj = aj + ai[k];
499: tp = t + k*6;
500: x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
501: nz = ai[k+1] - ai[k];
502: tp = t + (*vj)*6;
503: while (nz--) {
504: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
505: tp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
506: tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
507: tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
508: tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
509: tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
510: vj++;
511: tp = t + (*vj)*6;
512: v += 36;
513: }
515: /* xk = inv(Dk)*(Dk*xk) */
516: d = aa+k*36; /* ptr to inv(Dk) */
517: tp = t + k*6;
518: tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
519: tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
520: tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
521: tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
522: tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
523: tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
524: }
526: /* solve U*x = y by back substitution */
527: for (k=mbs-1; k>=0; k--) {
528: v = aa + 36*ai[k];
529: vj = aj + ai[k];
530: tp = t + k*6;
531: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */
532: nz = ai[k+1] - ai[k];
534: tp = t + (*vj)*6;
535: while (nz--) {
536: /* xk += U(k,:)*x(:) */
537: x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
538: x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
539: x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
540: x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
541: x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
542: x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
543: vj++;
544: tp = t + (*vj)*6;
545: v += 36;
546: }
547: tp = t + k*6;
548: tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
549: idx = 6*r[k];
550: x[idx] = x0;
551: x[idx+1] = x1;
552: x[idx+2] = x2;
553: x[idx+3] = x3;
554: x[idx+4] = x4;
555: x[idx+5] = x5;
556: }
558: ISRestoreIndices(isrow,&r);
559: VecRestoreArrayRead(bb,&b);
560: VecRestoreArray(xx,&x);
561: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
562: return(0);
563: }
565: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
566: {
567: const MatScalar *v,*d;
568: PetscScalar *xp,x0,x1,x2,x3,x4,x5;
569: PetscInt nz,k;
570: const PetscInt *vj;
573: for (k=0; k<mbs; k++) {
574: v = aa + 36*ai[k];
575: xp = x + k*6;
576: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
577: nz = ai[k+1] - ai[k];
578: vj = aj + ai[k];
579: xp = x + (*vj)*6;
580: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
581: PetscPrefetchBlock(v+36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
582: while (nz--) {
583: /* x(:) += U(k,:)^T*(Dk*xk) */
584: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
585: xp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
586: xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
587: xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
588: xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
589: xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
590: vj++;
591: xp = x + (*vj)*6;
592: v += 36;
593: }
594: /* xk = inv(Dk)*(Dk*xk) */
595: d = aa+k*36; /* ptr to inv(Dk) */
596: xp = x + k*6;
597: xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
598: xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
599: xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
600: xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
601: xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
602: xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
603: }
604: return(0);
605: }
606: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
607: {
608: const MatScalar *v;
609: PetscScalar *xp,x0,x1,x2,x3,x4,x5;
610: PetscInt nz,k;
611: const PetscInt *vj;
614: for (k=mbs-1; k>=0; k--) {
615: v = aa + 36*ai[k];
616: xp = x + k*6;
617: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
618: nz = ai[k+1] - ai[k];
619: vj = aj + ai[k];
620: xp = x + (*vj)*6;
621: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
622: PetscPrefetchBlock(v-36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
623: while (nz--) {
624: /* xk += U(k,:)*x(:) */
625: x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
626: x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
627: x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
628: x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
629: x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
630: x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
631: vj++;
632: v += 36; xp = x + (*vj)*6;
633: }
634: xp = x + k*6;
635: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
636: }
637: return(0);
638: }
641: PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
642: {
643: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
644: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
645: const MatScalar *aa=a->a;
646: PetscScalar *x;
647: const PetscScalar *b;
648: PetscErrorCode ierr;
651: VecGetArrayRead(bb,&b);
652: VecGetArray(xx,&x);
654: /* solve U^T * D * y = b by forward substitution */
655: PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
656: MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
658: /* solve U*x = y by back substitution */
659: MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
661: VecRestoreArrayRead(bb,&b);
662: VecRestoreArray(xx,&x);
663: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
664: return(0);
665: }
667: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
668: {
669: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
670: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
671: const MatScalar *aa=a->a;
672: PetscScalar *x;
673: const PetscScalar *b;
674: PetscErrorCode ierr;
677: VecGetArrayRead(bb,&b);
678: VecGetArray(xx,&x);
679: PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
680: MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
681: VecRestoreArrayRead(bb,&b);
682: VecRestoreArray(xx,&x);
683: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
684: return(0);
685: }
687: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
688: {
689: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
690: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
691: const MatScalar *aa=a->a;
692: PetscScalar *x;
693: const PetscScalar *b;
694: PetscErrorCode ierr;
697: VecGetArrayRead(bb,&b);
698: VecGetArray(xx,&x);
699: PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
700: MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
701: VecRestoreArrayRead(bb,&b);
702: VecRestoreArray(xx,&x);
703: PetscLogFlops(2.0*a->bs2*(a->nz - mbs));
704: return(0);
705: }
707: PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
708: {
709: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
710: IS isrow=a->row;
711: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j;
712: PetscErrorCode ierr;
713: const PetscInt *r,*vj;
714: PetscInt nz,k,idx;
715: const MatScalar *aa=a->a,*v,*diag;
716: PetscScalar *x,x0,x1,x2,x3,x4,*t,*tp;
717: const PetscScalar *b;
720: VecGetArrayRead(bb,&b);
721: VecGetArray(xx,&x);
722: t = a->solve_work;
723: ISGetIndices(isrow,&r);
725: /* solve U^T * D * y = b by forward substitution */
726: tp = t;
727: for (k=0; k<mbs; k++) { /* t <- perm(b) */
728: idx = 5*r[k];
729: tp[0] = b[idx];
730: tp[1] = b[idx+1];
731: tp[2] = b[idx+2];
732: tp[3] = b[idx+3];
733: tp[4] = b[idx+4];
734: tp += 5;
735: }
737: for (k=0; k<mbs; k++) {
738: v = aa + 25*ai[k];
739: vj = aj + ai[k];
740: tp = t + k*5;
741: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
742: nz = ai[k+1] - ai[k];
744: tp = t + (*vj)*5;
745: while (nz--) {
746: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
747: tp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
748: tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
749: tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
750: tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
751: vj++;
752: tp = t + (*vj)*5;
753: v += 25;
754: }
756: /* xk = inv(Dk)*(Dk*xk) */
757: diag = aa+k*25; /* ptr to inv(Dk) */
758: tp = t + k*5;
759: tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
760: tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
761: tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
762: tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
763: tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
764: }
766: /* solve U*x = y by back substitution */
767: for (k=mbs-1; k>=0; k--) {
768: v = aa + 25*ai[k];
769: vj = aj + ai[k];
770: tp = t + k*5;
771: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; /* xk */
772: nz = ai[k+1] - ai[k];
774: tp = t + (*vj)*5;
775: while (nz--) {
776: /* xk += U(k,:)*x(:) */
777: x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
778: x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
779: x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
780: x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
781: x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
782: vj++;
783: tp = t + (*vj)*5;
784: v += 25;
785: }
786: tp = t + k*5;
787: tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
788: idx = 5*r[k];
789: x[idx] = x0;
790: x[idx+1] = x1;
791: x[idx+2] = x2;
792: x[idx+3] = x3;
793: x[idx+4] = x4;
794: }
796: ISRestoreIndices(isrow,&r);
797: VecRestoreArrayRead(bb,&b);
798: VecRestoreArray(xx,&x);
799: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
800: return(0);
801: }
803: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
804: {
805: const MatScalar *v,*diag;
806: PetscScalar *xp,x0,x1,x2,x3,x4;
807: PetscInt nz,k;
808: const PetscInt *vj;
811: for (k=0; k<mbs; k++) {
812: v = aa + 25*ai[k];
813: xp = x + k*5;
814: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
815: nz = ai[k+1] - ai[k];
816: vj = aj + ai[k];
817: xp = x + (*vj)*5;
818: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
819: PetscPrefetchBlock(v+25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
820: while (nz--) {
821: /* x(:) += U(k,:)^T*(Dk*xk) */
822: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
823: xp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
824: xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
825: xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
826: xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
827: vj++;
828: xp = x + (*vj)*5;
829: v += 25;
830: }
831: /* xk = inv(Dk)*(Dk*xk) */
832: diag = aa+k*25; /* ptr to inv(Dk) */
833: xp = x + k*5;
834: xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
835: xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
836: xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
837: xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
838: xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
839: }
840: return(0);
841: }
843: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
844: {
845: const MatScalar *v;
846: PetscScalar *xp,x0,x1,x2,x3,x4;
847: PetscInt nz,k;
848: const PetscInt *vj;
851: for (k=mbs-1; k>=0; k--) {
852: v = aa + 25*ai[k];
853: xp = x + k*5;
854: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; /* xk */
855: nz = ai[k+1] - ai[k];
856: vj = aj + ai[k];
857: xp = x + (*vj)*5;
858: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
859: PetscPrefetchBlock(v-25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
860: while (nz--) {
861: /* xk += U(k,:)*x(:) */
862: x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
863: x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
864: x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
865: x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
866: x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
867: vj++;
868: v += 25; xp = x + (*vj)*5;
869: }
870: xp = x + k*5;
871: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
872: }
873: return(0);
874: }
876: PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
877: {
878: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
879: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
880: const MatScalar *aa=a->a;
881: PetscScalar *x;
882: const PetscScalar *b;
883: PetscErrorCode ierr;
886: VecGetArrayRead(bb,&b);
887: VecGetArray(xx,&x);
889: /* solve U^T * D * y = b by forward substitution */
890: PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
891: MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
893: /* solve U*x = y by back substitution */
894: MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
896: VecRestoreArrayRead(bb,&b);
897: VecRestoreArray(xx,&x);
898: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
899: return(0);
900: }
902: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
903: {
904: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
905: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
906: const MatScalar *aa=a->a;
907: PetscScalar *x;
908: const PetscScalar *b;
909: PetscErrorCode ierr;
912: VecGetArrayRead(bb,&b);
913: VecGetArray(xx,&x);
914: PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
915: MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
916: VecRestoreArrayRead(bb,&b);
917: VecRestoreArray(xx,&x);
918: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
919: return(0);
920: }
922: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
923: {
924: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
925: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
926: const MatScalar *aa=a->a;
927: PetscScalar *x;
928: const PetscScalar *b;
929: PetscErrorCode ierr;
932: VecGetArrayRead(bb,&b);
933: VecGetArray(xx,&x);
934: PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));
935: MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
936: VecRestoreArrayRead(bb,&b);
937: VecRestoreArray(xx,&x);
938: PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
939: return(0);
940: }
942: PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
943: {
944: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
945: IS isrow=a->row;
946: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j;
947: PetscErrorCode ierr;
948: const PetscInt *r,*vj;
949: PetscInt nz,k,idx;
950: const MatScalar *aa=a->a,*v,*diag;
951: PetscScalar *x,x0,x1,x2,x3,*t,*tp;
952: const PetscScalar *b;
955: VecGetArrayRead(bb,&b);
956: VecGetArray(xx,&x);
957: t = a->solve_work;
958: ISGetIndices(isrow,&r);
960: /* solve U^T * D * y = b by forward substitution */
961: tp = t;
962: for (k=0; k<mbs; k++) { /* t <- perm(b) */
963: idx = 4*r[k];
964: tp[0] = b[idx];
965: tp[1] = b[idx+1];
966: tp[2] = b[idx+2];
967: tp[3] = b[idx+3];
968: tp += 4;
969: }
971: for (k=0; k<mbs; k++) {
972: v = aa + 16*ai[k];
973: vj = aj + ai[k];
974: tp = t + k*4;
975: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
976: nz = ai[k+1] - ai[k];
978: tp = t + (*vj)*4;
979: while (nz--) {
980: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
981: tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
982: tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
983: tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
984: vj++;
985: tp = t + (*vj)*4;
986: v += 16;
987: }
989: /* xk = inv(Dk)*(Dk*xk) */
990: diag = aa+k*16; /* ptr to inv(Dk) */
991: tp = t + k*4;
992: tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
993: tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
994: tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
995: tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
996: }
998: /* solve U*x = y by back substitution */
999: for (k=mbs-1; k>=0; k--) {
1000: v = aa + 16*ai[k];
1001: vj = aj + ai[k];
1002: tp = t + k*4;
1003: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
1004: nz = ai[k+1] - ai[k];
1006: tp = t + (*vj)*4;
1007: while (nz--) {
1008: /* xk += U(k,:)*x(:) */
1009: x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
1010: x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
1011: x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
1012: x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
1013: vj++;
1014: tp = t + (*vj)*4;
1015: v += 16;
1016: }
1017: tp = t + k*4;
1018: tp[0] =x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
1019: idx = 4*r[k];
1020: x[idx] = x0;
1021: x[idx+1] = x1;
1022: x[idx+2] = x2;
1023: x[idx+3] = x3;
1024: }
1026: ISRestoreIndices(isrow,&r);
1027: VecRestoreArrayRead(bb,&b);
1028: VecRestoreArray(xx,&x);
1029: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1030: return(0);
1031: }
1033: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1034: {
1035: const MatScalar *v,*diag;
1036: PetscScalar *xp,x0,x1,x2,x3;
1037: PetscInt nz,k;
1038: const PetscInt *vj;
1041: for (k=0; k<mbs; k++) {
1042: v = aa + 16*ai[k];
1043: xp = x + k*4;
1044: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
1045: nz = ai[k+1] - ai[k];
1046: vj = aj + ai[k];
1047: xp = x + (*vj)*4;
1048: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1049: PetscPrefetchBlock(v+16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1050: while (nz--) {
1051: /* x(:) += U(k,:)^T*(Dk*xk) */
1052: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1053: xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1054: xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1055: xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1056: vj++;
1057: xp = x + (*vj)*4;
1058: v += 16;
1059: }
1060: /* xk = inv(Dk)*(Dk*xk) */
1061: diag = aa+k*16; /* ptr to inv(Dk) */
1062: xp = x + k*4;
1063: xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1064: xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1065: xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1066: xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1067: }
1068: return(0);
1069: }
1071: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1072: {
1073: const MatScalar *v;
1074: PetscScalar *xp,x0,x1,x2,x3;
1075: PetscInt nz,k;
1076: const PetscInt *vj;
1079: for (k=mbs-1; k>=0; k--) {
1080: v = aa + 16*ai[k];
1081: xp = x + k*4;
1082: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
1083: nz = ai[k+1] - ai[k];
1084: vj = aj + ai[k];
1085: xp = x + (*vj)*4;
1086: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1087: PetscPrefetchBlock(v-16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1088: while (nz--) {
1089: /* xk += U(k,:)*x(:) */
1090: x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
1091: x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
1092: x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
1093: x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
1094: vj++;
1095: v += 16; xp = x + (*vj)*4;
1096: }
1097: xp = x + k*4;
1098: xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
1099: }
1100: return(0);
1101: }
1103: PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1104: {
1105: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1106: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1107: const MatScalar *aa=a->a;
1108: PetscScalar *x;
1109: const PetscScalar *b;
1110: PetscErrorCode ierr;
1113: VecGetArrayRead(bb,&b);
1114: VecGetArray(xx,&x);
1116: /* solve U^T * D * y = b by forward substitution */
1117: PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
1118: MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1120: /* solve U*x = y by back substitution */
1121: MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1122: VecRestoreArrayRead(bb,&b);
1123: VecRestoreArray(xx,&x);
1124: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1125: return(0);
1126: }
1128: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1129: {
1130: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1131: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1132: const MatScalar *aa=a->a;
1133: PetscScalar *x;
1134: const PetscScalar *b;
1135: PetscErrorCode ierr;
1138: VecGetArrayRead(bb,&b);
1139: VecGetArray(xx,&x);
1140: PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
1141: MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1142: VecRestoreArrayRead(bb,&b);
1143: VecRestoreArray(xx,&x);
1144: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
1145: return(0);
1146: }
1148: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1149: {
1150: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1151: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1152: const MatScalar *aa=a->a;
1153: PetscScalar *x;
1154: const PetscScalar *b;
1155: PetscErrorCode ierr;
1158: VecGetArrayRead(bb,&b);
1159: VecGetArray(xx,&x);
1160: PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));
1161: MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1162: VecRestoreArrayRead(bb,&b);
1163: VecRestoreArray(xx,&x);
1164: PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
1165: return(0);
1166: }
1168: PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1169: {
1170: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1171: IS isrow=a->row;
1172: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j;
1173: PetscErrorCode ierr;
1174: const PetscInt *r;
1175: PetscInt nz,k,idx;
1176: const PetscInt *vj;
1177: const MatScalar *aa=a->a,*v,*diag;
1178: PetscScalar *x,x0,x1,x2,*t,*tp;
1179: const PetscScalar *b;
1182: VecGetArrayRead(bb,&b);
1183: VecGetArray(xx,&x);
1184: t = a->solve_work;
1185: ISGetIndices(isrow,&r);
1187: /* solve U^T * D * y = b by forward substitution */
1188: tp = t;
1189: for (k=0; k<mbs; k++) { /* t <- perm(b) */
1190: idx = 3*r[k];
1191: tp[0] = b[idx];
1192: tp[1] = b[idx+1];
1193: tp[2] = b[idx+2];
1194: tp += 3;
1195: }
1197: for (k=0; k<mbs; k++) {
1198: v = aa + 9*ai[k];
1199: vj = aj + ai[k];
1200: tp = t + k*3;
1201: x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
1202: nz = ai[k+1] - ai[k];
1204: tp = t + (*vj)*3;
1205: while (nz--) {
1206: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1207: tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1208: tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1209: vj++;
1210: tp = t + (*vj)*3;
1211: v += 9;
1212: }
1214: /* xk = inv(Dk)*(Dk*xk) */
1215: diag = aa+k*9; /* ptr to inv(Dk) */
1216: tp = t + k*3;
1217: tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1218: tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1219: tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1220: }
1222: /* solve U*x = y by back substitution */
1223: for (k=mbs-1; k>=0; k--) {
1224: v = aa + 9*ai[k];
1225: vj = aj + ai[k];
1226: tp = t + k*3;
1227: x0 = tp[0]; x1 = tp[1]; x2 = tp[2]; /* xk */
1228: nz = ai[k+1] - ai[k];
1230: tp = t + (*vj)*3;
1231: while (nz--) {
1232: /* xk += U(k,:)*x(:) */
1233: x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
1234: x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
1235: x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
1236: vj++;
1237: tp = t + (*vj)*3;
1238: v += 9;
1239: }
1240: tp = t + k*3;
1241: tp[0] = x0; tp[1] = x1; tp[2] = x2;
1242: idx = 3*r[k];
1243: x[idx] = x0;
1244: x[idx+1] = x1;
1245: x[idx+2] = x2;
1246: }
1248: ISRestoreIndices(isrow,&r);
1249: VecRestoreArrayRead(bb,&b);
1250: VecRestoreArray(xx,&x);
1251: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1252: return(0);
1253: }
1255: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1256: {
1257: const MatScalar *v,*diag;
1258: PetscScalar *xp,x0,x1,x2;
1259: PetscInt nz,k;
1260: const PetscInt *vj;
1263: for (k=0; k<mbs; k++) {
1264: v = aa + 9*ai[k];
1265: xp = x + k*3;
1266: x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1267: nz = ai[k+1] - ai[k];
1268: vj = aj + ai[k];
1269: xp = x + (*vj)*3;
1270: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1271: PetscPrefetchBlock(v+9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1272: while (nz--) {
1273: /* x(:) += U(k,:)^T*(Dk*xk) */
1274: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1275: xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1276: xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1277: vj++;
1278: xp = x + (*vj)*3;
1279: v += 9;
1280: }
1281: /* xk = inv(Dk)*(Dk*xk) */
1282: diag = aa+k*9; /* ptr to inv(Dk) */
1283: xp = x + k*3;
1284: xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1285: xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1286: xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1287: }
1288: return(0);
1289: }
1291: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1292: {
1293: const MatScalar *v;
1294: PetscScalar *xp,x0,x1,x2;
1295: PetscInt nz,k;
1296: const PetscInt *vj;
1299: for (k=mbs-1; k>=0; k--) {
1300: v = aa + 9*ai[k];
1301: xp = x + k*3;
1302: x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* xk */
1303: nz = ai[k+1] - ai[k];
1304: vj = aj + ai[k];
1305: xp = x + (*vj)*3;
1306: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1307: PetscPrefetchBlock(v-9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1308: while (nz--) {
1309: /* xk += U(k,:)*x(:) */
1310: x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1311: x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1312: x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1313: vj++;
1314: v += 9; xp = x + (*vj)*3;
1315: }
1316: xp = x + k*3;
1317: xp[0] = x0; xp[1] = x1; xp[2] = x2;
1318: }
1319: return(0);
1320: }
1322: PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1323: {
1324: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1325: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1326: const MatScalar *aa=a->a;
1327: PetscScalar *x;
1328: const PetscScalar *b;
1329: PetscErrorCode ierr;
1332: VecGetArrayRead(bb,&b);
1333: VecGetArray(xx,&x);
1335: /* solve U^T * D * y = b by forward substitution */
1336: PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1337: MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1339: /* solve U*x = y by back substitution */
1340: MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1342: VecRestoreArrayRead(bb,&b);
1343: VecRestoreArray(xx,&x);
1344: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1345: return(0);
1346: }
1348: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1349: {
1350: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1351: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1352: const MatScalar *aa=a->a;
1353: PetscScalar *x;
1354: const PetscScalar *b;
1355: PetscErrorCode ierr;
1358: VecGetArrayRead(bb,&b);
1359: VecGetArray(xx,&x);
1360: PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1361: MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1362: VecRestoreArrayRead(bb,&b);
1363: VecRestoreArray(xx,&x);
1364: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
1365: return(0);
1366: }
1368: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1369: {
1370: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1371: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1372: const MatScalar *aa=a->a;
1373: PetscScalar *x;
1374: const PetscScalar *b;
1375: PetscErrorCode ierr;
1378: VecGetArrayRead(bb,&b);
1379: VecGetArray(xx,&x);
1380: PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1381: MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1382: VecRestoreArrayRead(bb,&b);
1383: VecRestoreArray(xx,&x);
1384: PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
1385: return(0);
1386: }
1388: PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1389: {
1390: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1391: IS isrow=a->row;
1392: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j;
1393: PetscErrorCode ierr;
1394: const PetscInt *r,*vj;
1395: PetscInt nz,k,k2,idx;
1396: const MatScalar *aa=a->a,*v,*diag;
1397: PetscScalar *x,x0,x1,*t;
1398: const PetscScalar *b;
1401: VecGetArrayRead(bb,&b);
1402: VecGetArray(xx,&x);
1403: t = a->solve_work;
1404: ISGetIndices(isrow,&r);
1406: /* solve U^T * D * y = perm(b) by forward substitution */
1407: for (k=0; k<mbs; k++) { /* t <- perm(b) */
1408: idx = 2*r[k];
1409: t[k*2] = b[idx];
1410: t[k*2+1] = b[idx+1];
1411: }
1412: for (k=0; k<mbs; k++) {
1413: v = aa + 4*ai[k];
1414: vj = aj + ai[k];
1415: k2 = k*2;
1416: x0 = t[k2]; x1 = t[k2+1];
1417: nz = ai[k+1] - ai[k];
1418: while (nz--) {
1419: t[(*vj)*2] += v[0]*x0 + v[1]*x1;
1420: t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1421: vj++; v += 4;
1422: }
1423: diag = aa+k*4; /* ptr to inv(Dk) */
1424: t[k2] = diag[0]*x0 + diag[2]*x1;
1425: t[k2+1] = diag[1]*x0 + diag[3]*x1;
1426: }
1428: /* solve U*x = y by back substitution */
1429: for (k=mbs-1; k>=0; k--) {
1430: v = aa + 4*ai[k];
1431: vj = aj + ai[k];
1432: k2 = k*2;
1433: x0 = t[k2]; x1 = t[k2+1];
1434: nz = ai[k+1] - ai[k];
1435: while (nz--) {
1436: x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1437: x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1438: vj++;
1439: v += 4;
1440: }
1441: t[k2] = x0;
1442: t[k2+1] = x1;
1443: idx = 2*r[k];
1444: x[idx] = x0;
1445: x[idx+1] = x1;
1446: }
1448: ISRestoreIndices(isrow,&r);
1449: VecRestoreArrayRead(bb,&b);
1450: VecRestoreArray(xx,&x);
1451: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1452: return(0);
1453: }
1455: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1456: {
1457: const MatScalar *v,*diag;
1458: PetscScalar x0,x1;
1459: PetscInt nz,k,k2;
1460: const PetscInt *vj;
1461:
1463: for (k=0; k<mbs; k++) {
1464: v = aa + 4*ai[k];
1465: vj = aj + ai[k];
1466: k2 = k*2;
1467: x0 = x[k2]; x1 = x[k2+1]; /* Dk*xk = k-th block of x */
1468: nz = ai[k+1] - ai[k];
1469: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1470: PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1471: while (nz--) {
1472: /* x(:) += U(k,:)^T*(Dk*xk) */
1473: x[(*vj)*2] += v[0]*x0 + v[1]*x1;
1474: x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1475: vj++; v += 4;
1476: }
1477: /* xk = inv(Dk)*(Dk*xk) */
1478: diag = aa+k*4; /* ptr to inv(Dk) */
1479: x[k2] = diag[0]*x0 + diag[2]*x1;
1480: x[k2+1] = diag[1]*x0 + diag[3]*x1;
1481: }
1482: return(0);
1483: }
1485: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1486: {
1487: const MatScalar *v;
1488: PetscScalar x0,x1;
1489: PetscInt nz,k,k2;
1490: const PetscInt *vj;
1493: for (k=mbs-1; k>=0; k--) {
1494: v = aa + 4*ai[k];
1495: vj = aj + ai[k];
1496: k2 = k*2;
1497: x0 = x[k2]; x1 = x[k2+1]; /* xk */
1498: nz = ai[k+1] - ai[k];
1499: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1500: PetscPrefetchBlock(v-4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1501: while (nz--) {
1502: /* xk += U(k,:)*x(:) */
1503: x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1504: x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1505: vj++;
1506: v += 4;
1507: }
1508: x[k2] = x0;
1509: x[k2+1] = x1;
1510: }
1511: return(0);
1512: }
1514: PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1515: {
1516: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1517: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1518: const MatScalar *aa=a->a;
1519: PetscScalar *x;
1520: const PetscScalar *b;
1521: PetscErrorCode ierr;
1524: VecGetArrayRead(bb,&b);
1525: VecGetArray(xx,&x);
1527: /* solve U^T * D * y = b by forward substitution */
1528: PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1529: MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1531: /* solve U*x = y by back substitution */
1532: MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1534: VecRestoreArrayRead(bb,&b);
1535: VecRestoreArray(xx,&x);
1536: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1537: return(0);
1538: }
1540: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1541: {
1542: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1543: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1544: const MatScalar *aa=a->a;
1545: PetscScalar *x;
1546: const PetscScalar *b;
1547: PetscErrorCode ierr;
1550: VecGetArrayRead(bb,&b);
1551: VecGetArray(xx,&x);
1552: PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1553: MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1554: VecRestoreArrayRead(bb,&b);
1555: VecRestoreArray(xx,&x);
1556: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
1557: return(0);
1558: }
1560: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1561: {
1562: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1563: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1564: const MatScalar *aa=a->a;
1565: PetscScalar *x;
1566: const PetscScalar *b;
1567: PetscErrorCode ierr;
1570: VecGetArrayRead(bb,&b);
1571: VecGetArray(xx,&x);
1572: PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1573: MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1574: VecRestoreArrayRead(bb,&b);
1575: VecRestoreArray(xx,&x);
1576: PetscLogFlops(2.0*a->bs2*(a->nz - mbs));
1577: return(0);
1578: }
1580: PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1581: {
1582: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1583: IS isrow=a->row;
1584: PetscErrorCode ierr;
1585: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1586: const MatScalar *aa=a->a,*v;
1587: const PetscScalar *b;
1588: PetscScalar *x,xk,*t;
1589: PetscInt nz,k,j;
1592: VecGetArrayRead(bb,&b);
1593: VecGetArray(xx,&x);
1594: t = a->solve_work;
1595: ISGetIndices(isrow,&rp);
1597: /* solve U^T*D*y = perm(b) by forward substitution */
1598: for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1599: for (k=0; k<mbs; k++) {
1600: v = aa + ai[k];
1601: vj = aj + ai[k];
1602: xk = t[k];
1603: nz = ai[k+1] - ai[k] - 1;
1604: for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk;
1605: t[k] = xk*v[nz]; /* v[nz] = 1/D(k) */
1606: }
1608: /* solve U*perm(x) = y by back substitution */
1609: for (k=mbs-1; k>=0; k--) {
1610: v = aa + adiag[k] - 1;
1611: vj = aj + adiag[k] - 1;
1612: nz = ai[k+1] - ai[k] - 1;
1613: for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]];
1614: x[rp[k]] = t[k];
1615: }
1617: ISRestoreIndices(isrow,&rp);
1618: VecRestoreArrayRead(bb,&b);
1619: VecRestoreArray(xx,&x);
1620: PetscLogFlops(4.0*a->nz - 3.0*mbs);
1621: return(0);
1622: }
1624: PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1625: {
1626: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1627: IS isrow=a->row;
1628: PetscErrorCode ierr;
1629: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1630: const MatScalar *aa=a->a,*v;
1631: PetscScalar *x,xk,*t;
1632: const PetscScalar *b;
1633: PetscInt nz,k;
1636: VecGetArrayRead(bb,&b);
1637: VecGetArray(xx,&x);
1638: t = a->solve_work;
1639: ISGetIndices(isrow,&rp);
1641: /* solve U^T*D*y = perm(b) by forward substitution */
1642: for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1643: for (k=0; k<mbs; k++) {
1644: v = aa + ai[k] + 1;
1645: vj = aj + ai[k] + 1;
1646: xk = t[k];
1647: nz = ai[k+1] - ai[k] - 1;
1648: while (nz--) t[*vj++] += (*v++) * xk;
1649: t[k] = xk*aa[ai[k]]; /* aa[k] = 1/D(k) */
1650: }
1652: /* solve U*perm(x) = y by back substitution */
1653: for (k=mbs-1; k>=0; k--) {
1654: v = aa + ai[k] + 1;
1655: vj = aj + ai[k] + 1;
1656: nz = ai[k+1] - ai[k] - 1;
1657: while (nz--) t[k] += (*v++) * t[*vj++];
1658: x[rp[k]] = t[k];
1659: }
1661: ISRestoreIndices(isrow,&rp);
1662: VecRestoreArrayRead(bb,&b);
1663: VecRestoreArray(xx,&x);
1664: PetscLogFlops(4.0*a->nz - 3*mbs);
1665: return(0);
1666: }
1668: PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1669: {
1670: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1671: IS isrow=a->row;
1672: PetscErrorCode ierr;
1673: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1674: const MatScalar *aa=a->a,*v;
1675: PetscReal diagk;
1676: PetscScalar *x,xk;
1677: const PetscScalar *b;
1678: PetscInt nz,k;
1681: /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1682: VecGetArrayRead(bb,&b);
1683: VecGetArray(xx,&x);
1684: ISGetIndices(isrow,&rp);
1686: for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1687: for (k=0; k<mbs; k++) {
1688: v = aa + ai[k];
1689: vj = aj + ai[k];
1690: xk = x[k];
1691: nz = ai[k+1] - ai[k] - 1;
1692: while (nz--) x[*vj++] += (*v++) * xk;
1694: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1695: if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1696: x[k] = xk*PetscSqrtReal(diagk);
1697: }
1698: ISRestoreIndices(isrow,&rp);
1699: VecRestoreArrayRead(bb,&b);
1700: VecRestoreArray(xx,&x);
1701: PetscLogFlops(2.0*a->nz - mbs);
1702: return(0);
1703: }
1705: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1706: {
1707: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1708: IS isrow=a->row;
1709: PetscErrorCode ierr;
1710: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1711: const MatScalar *aa=a->a,*v;
1712: PetscReal diagk;
1713: PetscScalar *x,xk;
1714: const PetscScalar *b;
1715: PetscInt nz,k;
1718: /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1719: VecGetArrayRead(bb,&b);
1720: VecGetArray(xx,&x);
1721: ISGetIndices(isrow,&rp);
1723: for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1724: for (k=0; k<mbs; k++) {
1725: v = aa + ai[k] + 1;
1726: vj = aj + ai[k] + 1;
1727: xk = x[k];
1728: nz = ai[k+1] - ai[k] - 1;
1729: while (nz--) x[*vj++] += (*v++) * xk;
1731: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1732: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1733: x[k] = xk*PetscSqrtReal(diagk);
1734: }
1735: ISRestoreIndices(isrow,&rp);
1736: VecRestoreArrayRead(bb,&b);
1737: VecRestoreArray(xx,&x);
1738: PetscLogFlops(2.0*a->nz - mbs);
1739: return(0);
1740: }
1742: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1743: {
1744: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1745: IS isrow=a->row;
1746: PetscErrorCode ierr;
1747: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1748: const MatScalar *aa=a->a,*v;
1749: PetscReal diagk;
1750: PetscScalar *x,*t;
1751: const PetscScalar *b;
1752: PetscInt nz,k;
1755: /* solve D^(1/2)*U*perm(x) = b by back substitution */
1756: VecGetArrayRead(bb,&b);
1757: VecGetArray(xx,&x);
1758: t = a->solve_work;
1759: ISGetIndices(isrow,&rp);
1761: for (k=mbs-1; k>=0; k--) {
1762: v = aa + ai[k];
1763: vj = aj + ai[k];
1764: diagk = PetscRealPart(aa[adiag[k]]);
1765: if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1766: t[k] = b[k] * PetscSqrtReal(diagk);
1767: nz = ai[k+1] - ai[k] - 1;
1768: while (nz--) t[k] += (*v++) * t[*vj++];
1769: x[rp[k]] = t[k];
1770: }
1771: ISRestoreIndices(isrow,&rp);
1772: VecRestoreArrayRead(bb,&b);
1773: VecRestoreArray(xx,&x);
1774: PetscLogFlops(2.0*a->nz - mbs);
1775: return(0);
1776: }
1778: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1779: {
1780: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1781: IS isrow=a->row;
1782: PetscErrorCode ierr;
1783: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1784: const MatScalar *aa=a->a,*v;
1785: PetscReal diagk;
1786: PetscScalar *x,*t;
1787: const PetscScalar *b;
1788: PetscInt nz,k;
1791: /* solve D^(1/2)*U*perm(x) = b by back substitution */
1792: VecGetArrayRead(bb,&b);
1793: VecGetArray(xx,&x);
1794: t = a->solve_work;
1795: ISGetIndices(isrow,&rp);
1797: for (k=mbs-1; k>=0; k--) {
1798: v = aa + ai[k] + 1;
1799: vj = aj + ai[k] + 1;
1800: diagk = PetscRealPart(aa[ai[k]]);
1801: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1802: t[k] = b[k] * PetscSqrtReal(diagk);
1803: nz = ai[k+1] - ai[k] - 1;
1804: while (nz--) t[k] += (*v++) * t[*vj++];
1805: x[rp[k]] = t[k];
1806: }
1807: ISRestoreIndices(isrow,&rp);
1808: VecRestoreArrayRead(bb,&b);
1809: VecRestoreArray(xx,&x);
1810: PetscLogFlops(2.0*a->nz - mbs);
1811: return(0);
1812: }
1814: PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1815: {
1816: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1820: if (A->rmap->bs == 1) {
1821: MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);
1822: } else {
1823: IS isrow=a->row;
1824: const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1825: const MatScalar *aa=a->a,*v;
1826: PetscScalar *x,*t;
1827: const PetscScalar *b;
1828: PetscInt nz,k,n,i,j;
1830: if (bb->n > a->solves_work_n) {
1831: PetscFree(a->solves_work);
1832: PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);
1833: a->solves_work_n = bb->n;
1834: }
1835: n = bb->n;
1836: VecGetArrayRead(bb->v,&b);
1837: VecGetArray(xx->v,&x);
1838: t = a->solves_work;
1840: ISGetIndices(isrow,&rp);
1842: /* solve U^T*D*y = perm(b) by forward substitution */
1843: for (k=0; k<mbs; k++) {
1844: for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */
1845: }
1846: for (k=0; k<mbs; k++) {
1847: v = aa + ai[k];
1848: vj = aj + ai[k];
1849: nz = ai[k+1] - ai[k] - 1;
1850: for (j=0; j<nz; j++) {
1851: for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1852: v++;vj++;
1853: }
1854: for (i=0; i<n; i++) t[n*k+i] *= aa[nz]; /* note: aa[nz] = 1/D(k) */
1855: }
1857: /* solve U*perm(x) = y by back substitution */
1858: for (k=mbs-1; k>=0; k--) {
1859: v = aa + ai[k] - 1;
1860: vj = aj + ai[k] - 1;
1861: nz = ai[k+1] - ai[k] - 1;
1862: for (j=0; j<nz; j++) {
1863: for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1864: v++;vj++;
1865: }
1866: for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1867: }
1869: ISRestoreIndices(isrow,&rp);
1870: VecRestoreArrayRead(bb->v,&b);
1871: VecRestoreArray(xx->v,&x);
1872: PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1873: }
1874: return(0);
1875: }
1877: PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx)
1878: {
1879: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1883: if (A->rmap->bs == 1) {
1884: MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);
1885: } else {
1886: IS isrow=a->row;
1887: const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1888: const MatScalar *aa=a->a,*v;
1889: PetscScalar *x,*t;
1890: const PetscScalar *b;
1891: PetscInt nz,k,n,i;
1893: if (bb->n > a->solves_work_n) {
1894: PetscFree(a->solves_work);
1895: PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);
1896: a->solves_work_n = bb->n;
1897: }
1898: n = bb->n;
1899: VecGetArrayRead(bb->v,&b);
1900: VecGetArray(xx->v,&x);
1901: t = a->solves_work;
1903: ISGetIndices(isrow,&rp);
1905: /* solve U^T*D*y = perm(b) by forward substitution */
1906: for (k=0; k<mbs; k++) {
1907: for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */
1908: }
1909: for (k=0; k<mbs; k++) {
1910: v = aa + ai[k];
1911: vj = aj + ai[k];
1912: nz = ai[k+1] - ai[k];
1913: while (nz--) {
1914: for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1915: v++;vj++;
1916: }
1917: for (i=0; i<n; i++) t[n*k+i] *= aa[k]; /* note: aa[k] = 1/D(k) */
1918: }
1920: /* solve U*perm(x) = y by back substitution */
1921: for (k=mbs-1; k>=0; k--) {
1922: v = aa + ai[k];
1923: vj = aj + ai[k];
1924: nz = ai[k+1] - ai[k];
1925: while (nz--) {
1926: for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1927: v++;vj++;
1928: }
1929: for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1930: }
1932: ISRestoreIndices(isrow,&rp);
1933: VecRestoreArrayRead(bb->v,&b);
1934: VecRestoreArray(xx->v,&x);
1935: PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1936: }
1937: return(0);
1938: }
1940: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1941: {
1942: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1943: PetscErrorCode ierr;
1944: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag = a->diag;
1945: const MatScalar *aa=a->a,*v;
1946: const PetscScalar *b;
1947: PetscScalar *x,xi;
1948: PetscInt nz,i,j;
1951: VecGetArrayRead(bb,&b);
1952: VecGetArray(xx,&x);
1954: /* solve U^T*D*y = b by forward substitution */
1955: PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
1956: for (i=0; i<mbs; i++) {
1957: v = aa + ai[i];
1958: vj = aj + ai[i];
1959: xi = x[i];
1960: nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
1961: for (j=0; j<nz; j++) x[vj[j]] += v[j]* xi;
1962: x[i] = xi*v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
1963: }
1965: /* solve U*x = y by backward substitution */
1966: for (i=mbs-2; i>=0; i--) {
1967: xi = x[i];
1968: v = aa + adiag[i] - 1; /* end of row i, excluding diag */
1969: vj = aj + adiag[i] - 1;
1970: nz = ai[i+1] - ai[i] - 1;
1971: for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
1972: x[i] = xi;
1973: }
1975: VecRestoreArrayRead(bb,&b);
1976: VecRestoreArray(xx,&x);
1977: PetscLogFlops(4.0*a->nz - 3*mbs);
1978: return(0);
1979: }
1981: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1982: {
1983: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1984: PetscErrorCode ierr;
1985: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
1986: const MatScalar *aa=a->a,*v;
1987: PetscScalar *x,xk;
1988: const PetscScalar *b;
1989: PetscInt nz,k;
1992: VecGetArrayRead(bb,&b);
1993: VecGetArray(xx,&x);
1995: /* solve U^T*D*y = b by forward substitution */
1996: PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
1997: for (k=0; k<mbs; k++) {
1998: v = aa + ai[k] + 1;
1999: vj = aj + ai[k] + 1;
2000: xk = x[k];
2001: nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */
2002: while (nz--) x[*vj++] += (*v++) * xk;
2003: x[k] = xk*aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */
2004: }
2006: /* solve U*x = y by back substitution */
2007: for (k=mbs-2; k>=0; k--) {
2008: v = aa + ai[k] + 1;
2009: vj = aj + ai[k] + 1;
2010: xk = x[k];
2011: nz = ai[k+1] - ai[k] - 1;
2012: while (nz--) {
2013: xk += (*v++) * x[*vj++];
2014: }
2015: x[k] = xk;
2016: }
2018: VecRestoreArrayRead(bb,&b);
2019: VecRestoreArray(xx,&x);
2020: PetscLogFlops(4.0*a->nz - 3*mbs);
2021: return(0);
2022: }
2024: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2025: {
2026: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2027: PetscErrorCode ierr;
2028: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag,*vj;
2029: const MatScalar *aa=a->a,*v;
2030: PetscReal diagk;
2031: PetscScalar *x;
2032: const PetscScalar *b;
2033: PetscInt nz,k;
2036: /* solve U^T*D^(1/2)*x = b by forward substitution */
2037: VecGetArrayRead(bb,&b);
2038: VecGetArray(xx,&x);
2039: PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
2040: for (k=0; k<mbs; k++) {
2041: v = aa + ai[k];
2042: vj = aj + ai[k];
2043: nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */
2044: while (nz--) x[*vj++] += (*v++) * x[k];
2045: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2046: if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[adiag[k]]),PetscImaginaryPart(aa[adiag[k]]));
2047: x[k] *= PetscSqrtReal(diagk);
2048: }
2049: VecRestoreArrayRead(bb,&b);
2050: VecRestoreArray(xx,&x);
2051: PetscLogFlops(2.0*a->nz - mbs);
2052: return(0);
2053: }
2055: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2056: {
2057: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2058: PetscErrorCode ierr;
2059: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2060: const MatScalar *aa=a->a,*v;
2061: PetscReal diagk;
2062: PetscScalar *x;
2063: const PetscScalar *b;
2064: PetscInt nz,k;
2067: /* solve U^T*D^(1/2)*x = b by forward substitution */
2068: VecGetArrayRead(bb,&b);
2069: VecGetArray(xx,&x);
2070: PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
2071: for (k=0; k<mbs; k++) {
2072: v = aa + ai[k] + 1;
2073: vj = aj + ai[k] + 1;
2074: nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */
2075: while (nz--) x[*vj++] += (*v++) * x[k];
2076: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2077: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]]));
2078: x[k] *= PetscSqrtReal(diagk);
2079: }
2080: VecRestoreArrayRead(bb,&b);
2081: VecRestoreArray(xx,&x);
2082: PetscLogFlops(2.0*a->nz - mbs);
2083: return(0);
2084: }
2086: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2087: {
2088: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2089: PetscErrorCode ierr;
2090: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag,*vj;
2091: const MatScalar *aa=a->a,*v;
2092: PetscReal diagk;
2093: PetscScalar *x;
2094: const PetscScalar *b;
2095: PetscInt nz,k;
2098: /* solve D^(1/2)*U*x = b by back substitution */
2099: VecGetArrayRead(bb,&b);
2100: VecGetArray(xx,&x);
2102: for (k=mbs-1; k>=0; k--) {
2103: v = aa + ai[k];
2104: vj = aj + ai[k];
2105: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2106: if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2107: x[k] = PetscSqrtReal(diagk)*b[k];
2108: nz = ai[k+1] - ai[k] - 1;
2109: while (nz--) x[k] += (*v++) * x[*vj++];
2110: }
2111: VecRestoreArrayRead(bb,&b);
2112: VecRestoreArray(xx,&x);
2113: PetscLogFlops(2.0*a->nz - mbs);
2114: return(0);
2115: }
2117: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2118: {
2119: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2120: PetscErrorCode ierr;
2121: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2122: const MatScalar *aa=a->a,*v;
2123: PetscReal diagk;
2124: PetscScalar *x;
2125: const PetscScalar *b;
2126: PetscInt nz,k;
2129: /* solve D^(1/2)*U*x = b by back substitution */
2130: VecGetArrayRead(bb,&b);
2131: VecGetArray(xx,&x);
2133: for (k=mbs-1; k>=0; k--) {
2134: v = aa + ai[k] + 1;
2135: vj = aj + ai[k] + 1;
2136: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2137: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2138: x[k] = PetscSqrtReal(diagk)*b[k];
2139: nz = ai[k+1] - ai[k] - 1;
2140: while (nz--) x[k] += (*v++) * x[*vj++];
2141: }
2142: VecRestoreArrayRead(bb,&b);
2143: VecRestoreArray(xx,&x);
2144: PetscLogFlops(2.0*a->nz - mbs);
2145: return(0);
2146: }
2148: /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2149: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info)
2150: {
2151: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
2153: const PetscInt *rip,mbs = a->mbs,*ai,*aj;
2154: PetscInt *jutmp,bs = A->rmap->bs,i;
2155: PetscInt m,reallocs = 0,*levtmp;
2156: PetscInt *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
2157: PetscInt incrlev,*lev,shift,prow,nz;
2158: PetscReal f = info->fill,levels = info->levels;
2159: PetscBool perm_identity;
2162: /* check whether perm is the identity mapping */
2163: ISIdentity(perm,&perm_identity);
2165: if (perm_identity) {
2166: a->permute = PETSC_FALSE;
2167: ai = a->i; aj = a->j;
2168: } else { /* non-trivial permutation */
2169: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2170: a->permute = PETSC_TRUE;
2171: MatReorderingSeqSBAIJ(A, perm);
2172: ai = a->inew; aj = a->jnew;
2173: }
2175: /* initialization */
2176: ISGetIndices(perm,&rip);
2177: umax = (PetscInt)(f*ai[mbs] + 1);
2178: PetscMalloc1(umax,&lev);
2179: umax += mbs + 1;
2180: shift = mbs + 1;
2181: PetscMalloc1(mbs+1,&iu);
2182: PetscMalloc1(umax,&ju);
2183: iu[0] = mbs + 1;
2184: juidx = mbs + 1;
2185: /* prowl: linked list for pivot row */
2186: PetscMalloc3(mbs,&prowl,mbs,&q,mbs,&levtmp);
2187: /* q: linked list for col index */
2189: for (i=0; i<mbs; i++) {
2190: prowl[i] = mbs;
2191: q[i] = 0;
2192: }
2194: /* for each row k */
2195: for (k=0; k<mbs; k++) {
2196: nzk = 0;
2197: q[k] = mbs;
2198: /* copy current row into linked list */
2199: nz = ai[rip[k]+1] - ai[rip[k]];
2200: j = ai[rip[k]];
2201: while (nz--) {
2202: vj = rip[aj[j++]];
2203: if (vj > k) {
2204: qm = k;
2205: do {
2206: m = qm; qm = q[m];
2207: } while (qm < vj);
2208: if (qm == vj) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n");
2209: nzk++;
2210: q[m] = vj;
2211: q[vj] = qm;
2212: levtmp[vj] = 0; /* initialize lev for nonzero element */
2213: }
2214: }
2216: /* modify nonzero structure of k-th row by computing fill-in
2217: for each row prow to be merged in */
2218: prow = k;
2219: prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
2221: while (prow < k) {
2222: /* merge row prow into k-th row */
2223: jmin = iu[prow] + 1;
2224: jmax = iu[prow+1];
2225: qm = k;
2226: for (j=jmin; j<jmax; j++) {
2227: incrlev = lev[j-shift] + 1;
2228: if (incrlev > levels) continue;
2229: vj = ju[j];
2230: do {
2231: m = qm; qm = q[m];
2232: } while (qm < vj);
2233: if (qm != vj) { /* a fill */
2234: nzk++; q[m] = vj; q[vj] = qm; qm = vj;
2235: levtmp[vj] = incrlev;
2236: } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2237: }
2238: prow = prowl[prow]; /* next pivot row */
2239: }
2241: /* add k to row list for first nonzero element in k-th row */
2242: if (nzk > 1) {
2243: i = q[k]; /* col value of first nonzero element in k_th row of U */
2244: prowl[k] = prowl[i]; prowl[i] = k;
2245: }
2246: iu[k+1] = iu[k] + nzk;
2248: /* allocate more space to ju and lev if needed */
2249: if (iu[k+1] > umax) {
2250: /* estimate how much additional space we will need */
2251: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2252: /* just double the memory each time */
2253: maxadd = umax;
2254: if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
2255: umax += maxadd;
2257: /* allocate a longer ju */
2258: PetscMalloc1(umax,&jutmp);
2259: PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));
2260: PetscFree(ju);
2261: ju = jutmp;
2263: PetscMalloc1(umax,&jutmp);
2264: PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(PetscInt));
2265: PetscFree(lev);
2266: lev = jutmp;
2267: reallocs += 2; /* count how many times we realloc */
2268: }
2270: /* save nonzero structure of k-th row in ju */
2271: i=k;
2272: while (nzk--) {
2273: i = q[i];
2274: ju[juidx] = i;
2275: lev[juidx-shift] = levtmp[i];
2276: juidx++;
2277: }
2278: }
2280: #if defined(PETSC_USE_INFO)
2281: if (ai[mbs] != 0) {
2282: PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2283: PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)f,(double)af);
2284: PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2285: PetscInfo1(A,"PCFactorSetFill(pc,%g);\n",(double)af);
2286: PetscInfo(A,"for best performance.\n");
2287: } else {
2288: PetscInfo(A,"Empty matrix.\n");
2289: }
2290: #endif
2292: ISRestoreIndices(perm,&rip);
2293: PetscFree3(prowl,q,levtmp);
2294: PetscFree(lev);
2296: /* put together the new matrix */
2297: MatSeqSBAIJSetPreallocation(B,bs,0,NULL);
2299: /* PetscLogObjectParent((PetscObject)B,(PetscObject)iperm); */
2300: b = (Mat_SeqSBAIJ*)(B)->data;
2301: PetscFree2(b->imax,b->ilen);
2303: b->singlemalloc = PETSC_FALSE;
2304: b->free_a = PETSC_TRUE;
2305: b->free_ij = PETSC_TRUE;
2306: /* the next line frees the default space generated by the Create() */
2307: PetscFree3(b->a,b->j,b->i);
2308: PetscMalloc1((iu[mbs]+1)*a->bs2,&b->a);
2309: b->j = ju;
2310: b->i = iu;
2311: b->diag = 0;
2312: b->ilen = 0;
2313: b->imax = 0;
2315: ISDestroy(&b->row);
2316: ISDestroy(&b->icol);
2317: b->row = perm;
2318: b->icol = perm;
2319: PetscObjectReference((PetscObject)perm);
2320: PetscObjectReference((PetscObject)perm);
2321: PetscMalloc1(bs*mbs+bs,&b->solve_work);
2322: /* In b structure: Free imax, ilen, old a, old j.
2323: Allocate idnew, solve_work, new a, new j */
2324: PetscLogObjectMemory((PetscObject)B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
2325: b->maxnz = b->nz = iu[mbs];
2327: (B)->info.factor_mallocs = reallocs;
2328: (B)->info.fill_ratio_given = f;
2329: if (ai[mbs] != 0) {
2330: (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2331: } else {
2332: (B)->info.fill_ratio_needed = 0.0;
2333: }
2334: MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);
2335: return(0);
2336: }
2338: /*
2339: See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2340: */
2341: #include <petscbt.h>
2342: #include <../src/mat/utils/freespace.h>
2343: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2344: {
2345: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
2346: PetscErrorCode ierr;
2347: PetscBool perm_identity,free_ij = PETSC_TRUE,missing;
2348: PetscInt bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j;
2349: const PetscInt *rip;
2350: PetscInt reallocs=0,i,*ui,*udiag,*cols;
2351: PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2352: PetscInt nlnk,*lnk,*lnk_lvl=NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr;
2353: PetscReal fill =info->fill,levels=info->levels;
2354: PetscFreeSpaceList free_space =NULL,current_space=NULL;
2355: PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2356: PetscBT lnkbt;
2359: if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
2360: MatMissingDiagonal(A,&missing,&d);
2361: if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
2362: if (bs > 1) {
2363: MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);
2364: return(0);
2365: }
2367: /* check whether perm is the identity mapping */
2368: ISIdentity(perm,&perm_identity);
2369: if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2370: a->permute = PETSC_FALSE;
2372: PetscMalloc1(am+1,&ui);
2373: PetscMalloc1(am+1,&udiag);
2374: ui[0] = 0;
2376: /* ICC(0) without matrix ordering: simply rearrange column indices */
2377: if (!levels) {
2378: /* reuse the column pointers and row offsets for memory savings */
2379: for (i=0; i<am; i++) {
2380: ncols = ai[i+1] - ai[i];
2381: ui[i+1] = ui[i] + ncols;
2382: udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */
2383: }
2384: PetscMalloc1(ui[am]+1,&uj);
2385: cols = uj;
2386: for (i=0; i<am; i++) {
2387: aj = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2388: ncols = ai[i+1] - ai[i] -1;
2389: for (j=0; j<ncols; j++) *cols++ = aj[j];
2390: *cols++ = i; /* diagoanl is located as the last entry of U(i,:) */
2391: }
2392: } else { /* case: levels>0 */
2393: ISGetIndices(perm,&rip);
2395: /* initialization */
2396: /* jl: linked list for storing indices of the pivot rows
2397: il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2398: PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);
2399: for (i=0; i<am; i++) {
2400: jl[i] = am; il[i] = 0;
2401: }
2403: /* create and initialize a linked list for storing column indices of the active row k */
2404: nlnk = am + 1;
2405: PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);
2407: /* initial FreeSpace size is fill*(ai[am]+1) */
2408: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);
2410: current_space = free_space;
2412: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);
2414: current_space_lvl = free_space_lvl;
2416: for (k=0; k<am; k++) { /* for each active row k */
2417: /* initialize lnk by the column indices of row k */
2418: nzk = 0;
2419: ncols = ai[k+1] - ai[k];
2420: if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k);
2421: cols = aj+ai[k];
2422: PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
2423: nzk += nlnk;
2425: /* update lnk by computing fill-in for each pivot row to be merged in */
2426: prow = jl[k]; /* 1st pivot row */
2428: while (prow < k) {
2429: nextprow = jl[prow];
2431: /* merge prow into k-th row */
2432: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2433: jmax = ui[prow+1];
2434: ncols = jmax-jmin;
2435: i = jmin - ui[prow];
2437: cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2438: uj = uj_lvl_ptr[prow] + i; /* levels of cols */
2439: j = *(uj - 1);
2440: PetscICCLLAddSorted(ncols,cols,levels,uj,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2441: nzk += nlnk;
2443: /* update il and jl for prow */
2444: if (jmin < jmax) {
2445: il[prow] = jmin;
2447: j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2448: }
2449: prow = nextprow;
2450: }
2452: /* if free space is not available, make more free space */
2453: if (current_space->local_remaining<nzk) {
2454: i = am - k + 1; /* num of unfactored rows */
2455: i = PetscIntMultTruncate(i,PetscMin(nzk, i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2456: PetscFreeSpaceGet(i,¤t_space);
2457: PetscFreeSpaceGet(i,¤t_space_lvl);
2458: reallocs++;
2459: }
2461: /* copy data into free_space and free_space_lvl, then initialize lnk */
2462: if (nzk == 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Empty row %D in ICC matrix factor",k);
2463: PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);
2465: /* add the k-th row into il and jl */
2466: if (nzk > 1) {
2467: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2468: jl[k] = jl[i]; jl[i] = k;
2469: il[k] = ui[k] + 1;
2470: }
2471: uj_ptr[k] = current_space->array;
2472: uj_lvl_ptr[k] = current_space_lvl->array;
2474: current_space->array += nzk;
2475: current_space->local_used += nzk;
2476: current_space->local_remaining -= nzk;
2477: current_space_lvl->array += nzk;
2478: current_space_lvl->local_used += nzk;
2479: current_space_lvl->local_remaining -= nzk;
2481: ui[k+1] = ui[k] + nzk;
2482: }
2484: ISRestoreIndices(perm,&rip);
2485: PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);
2487: /* destroy list of free space and other temporary array(s) */
2488: PetscMalloc1(ui[am]+1,&uj);
2489: PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag); /* store matrix factor */
2490: PetscIncompleteLLDestroy(lnk,lnkbt);
2491: PetscFreeSpaceDestroy(free_space_lvl);
2493: } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2495: /* put together the new matrix in MATSEQSBAIJ format */
2496: MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);
2498: b = (Mat_SeqSBAIJ*)(fact)->data;
2499: PetscFree2(b->imax,b->ilen);
2501: b->singlemalloc = PETSC_FALSE;
2502: b->free_a = PETSC_TRUE;
2503: b->free_ij = free_ij;
2505: PetscMalloc1(ui[am]+1,&b->a);
2507: b->j = uj;
2508: b->i = ui;
2509: b->diag = udiag;
2510: b->free_diag = PETSC_TRUE;
2511: b->ilen = 0;
2512: b->imax = 0;
2513: b->row = perm;
2514: b->col = perm;
2516: PetscObjectReference((PetscObject)perm);
2517: PetscObjectReference((PetscObject)perm);
2519: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2521: PetscMalloc1(am+1,&b->solve_work);
2522: PetscLogObjectMemory((PetscObject)fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));
2524: b->maxnz = b->nz = ui[am];
2526: fact->info.factor_mallocs = reallocs;
2527: fact->info.fill_ratio_given = fill;
2528: if (ai[am] != 0) {
2529: fact->info.fill_ratio_needed = ((PetscReal)ui[am])/ai[am];
2530: } else {
2531: fact->info.fill_ratio_needed = 0.0;
2532: }
2533: #if defined(PETSC_USE_INFO)
2534: if (ai[am] != 0) {
2535: PetscReal af = fact->info.fill_ratio_needed;
2536: PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);
2537: PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2538: PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);
2539: } else {
2540: PetscInfo(A,"Empty matrix.\n");
2541: }
2542: #endif
2543: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2544: return(0);
2545: }
2547: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2548: {
2549: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2550: Mat_SeqSBAIJ *b;
2551: PetscErrorCode ierr;
2552: PetscBool perm_identity,free_ij = PETSC_TRUE;
2553: PetscInt bs=A->rmap->bs,am=a->mbs;
2554: const PetscInt *cols,*rip,*ai=a->i,*aj=a->j;
2555: PetscInt reallocs=0,i,*ui;
2556: PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2557: PetscInt nlnk,*lnk,*lnk_lvl=NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
2558: PetscReal fill =info->fill,levels=info->levels,ratio_needed;
2559: PetscFreeSpaceList free_space =NULL,current_space=NULL;
2560: PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2561: PetscBT lnkbt;
2564: /*
2565: This code originally uses Modified Sparse Row (MSR) storage
2566: (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2567: Then it is rewritten so the factor B takes seqsbaij format. However the associated
2568: MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2569: thus the original code in MSR format is still used for these cases.
2570: The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2571: MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2572: */
2573: if (bs > 1) {
2574: MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);
2575: return(0);
2576: }
2578: /* check whether perm is the identity mapping */
2579: ISIdentity(perm,&perm_identity);
2580: if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2581: a->permute = PETSC_FALSE;
2583: /* special case that simply copies fill pattern */
2584: if (!levels) {
2585: /* reuse the column pointers and row offsets for memory savings */
2586: ui = a->i;
2587: uj = a->j;
2588: free_ij = PETSC_FALSE;
2589: ratio_needed = 1.0;
2590: } else { /* case: levels>0 */
2591: ISGetIndices(perm,&rip);
2593: /* initialization */
2594: PetscMalloc1(am+1,&ui);
2595: ui[0] = 0;
2597: /* jl: linked list for storing indices of the pivot rows
2598: il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2599: PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);
2600: for (i=0; i<am; i++) {
2601: jl[i] = am; il[i] = 0;
2602: }
2604: /* create and initialize a linked list for storing column indices of the active row k */
2605: nlnk = am + 1;
2606: PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);
2608: /* initial FreeSpace size is fill*(ai[am]+1) */
2609: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);
2611: current_space = free_space;
2613: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);
2615: current_space_lvl = free_space_lvl;
2617: for (k=0; k<am; k++) { /* for each active row k */
2618: /* initialize lnk by the column indices of row rip[k] */
2619: nzk = 0;
2620: ncols = ai[rip[k]+1] - ai[rip[k]];
2621: cols = aj+ai[rip[k]];
2622: PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
2623: nzk += nlnk;
2625: /* update lnk by computing fill-in for each pivot row to be merged in */
2626: prow = jl[k]; /* 1st pivot row */
2628: while (prow < k) {
2629: nextprow = jl[prow];
2631: /* merge prow into k-th row */
2632: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2633: jmax = ui[prow+1];
2634: ncols = jmax-jmin;
2635: i = jmin - ui[prow];
2636: cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2637: j = *(uj_lvl_ptr[prow] + i - 1);
2638: cols_lvl = uj_lvl_ptr[prow]+i;
2639: PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2640: nzk += nlnk;
2642: /* update il and jl for prow */
2643: if (jmin < jmax) {
2644: il[prow] = jmin;
2645: j = *cols;
2646: jl[prow] = jl[j];
2647: jl[j] = prow;
2648: }
2649: prow = nextprow;
2650: }
2652: /* if free space is not available, make more free space */
2653: if (current_space->local_remaining<nzk) {
2654: i = am - k + 1; /* num of unfactored rows */
2655: i = PetscMin(PetscIntMultTruncate(i,nzk), PetscIntMultTruncate(i,i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2656: PetscFreeSpaceGet(i,¤t_space);
2657: PetscFreeSpaceGet(i,¤t_space_lvl);
2658: reallocs++;
2659: }
2661: /* copy data into free_space and free_space_lvl, then initialize lnk */
2662: PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);
2664: /* add the k-th row into il and jl */
2665: if (nzk-1 > 0) {
2666: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2667: jl[k] = jl[i]; jl[i] = k;
2668: il[k] = ui[k] + 1;
2669: }
2670: uj_ptr[k] = current_space->array;
2671: uj_lvl_ptr[k] = current_space_lvl->array;
2673: current_space->array += nzk;
2674: current_space->local_used += nzk;
2675: current_space->local_remaining -= nzk;
2676: current_space_lvl->array += nzk;
2677: current_space_lvl->local_used += nzk;
2678: current_space_lvl->local_remaining -= nzk;
2680: ui[k+1] = ui[k] + nzk;
2681: }
2683: ISRestoreIndices(perm,&rip);
2684: PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);
2686: /* destroy list of free space and other temporary array(s) */
2687: PetscMalloc1(ui[am]+1,&uj);
2688: PetscFreeSpaceContiguous(&free_space,uj);
2689: PetscIncompleteLLDestroy(lnk,lnkbt);
2690: PetscFreeSpaceDestroy(free_space_lvl);
2691: if (ai[am] != 0) {
2692: ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2693: } else {
2694: ratio_needed = 0.0;
2695: }
2696: } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2698: /* put together the new matrix in MATSEQSBAIJ format */
2699: MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);
2701: b = (Mat_SeqSBAIJ*)(fact)->data;
2703: PetscFree2(b->imax,b->ilen);
2705: b->singlemalloc = PETSC_FALSE;
2706: b->free_a = PETSC_TRUE;
2707: b->free_ij = free_ij;
2709: PetscMalloc1(ui[am]+1,&b->a);
2711: b->j = uj;
2712: b->i = ui;
2713: b->diag = 0;
2714: b->ilen = 0;
2715: b->imax = 0;
2716: b->row = perm;
2717: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2719: PetscObjectReference((PetscObject)perm);
2720: b->icol = perm;
2721: PetscObjectReference((PetscObject)perm);
2722: PetscMalloc1(am+1,&b->solve_work);
2724: b->maxnz = b->nz = ui[am];
2726: fact->info.factor_mallocs = reallocs;
2727: fact->info.fill_ratio_given = fill;
2728: fact->info.fill_ratio_needed = ratio_needed;
2729: #if defined(PETSC_USE_INFO)
2730: if (ai[am] != 0) {
2731: PetscReal af = fact->info.fill_ratio_needed;
2732: PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);
2733: PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2734: PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);
2735: } else {
2736: PetscInfo(A,"Empty matrix.\n");
2737: }
2738: #endif
2739: if (perm_identity) {
2740: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2741: } else {
2742: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2743: }
2744: return(0);
2745: }