Actual source code: sbaijfact2.c
petsc-3.14.6 2021-03-30
2: /*
3: Factorization code for SBAIJ format.
4: */
6: #include <../src/mat/impls/sbaij/seq/sbaij.h>
7: #include <../src/mat/impls/baij/seq/baij.h>
8: #include <petsc/private/kernels/blockinvert.h>
10: PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
11: {
12: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
13: IS isrow=a->row;
14: PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j;
15: PetscErrorCode ierr;
16: const PetscInt *r;
17: PetscInt nz,*vj,k,idx,k1;
18: PetscInt bs =A->rmap->bs,bs2 = a->bs2;
19: const MatScalar *aa=a->a,*v,*diag;
20: PetscScalar *x,*xk,*xj,*xk_tmp,*t;
21: const PetscScalar *b;
24: VecGetArrayRead(bb,&b);
25: VecGetArray(xx,&x);
26: t = a->solve_work;
27: ISGetIndices(isrow,&r);
28: PetscMalloc1(bs,&xk_tmp);
30: /* solve U^T * D * y = b by forward substitution */
31: xk = t;
32: for (k=0; k<mbs; k++) { /* t <- perm(b) */
33: idx = bs*r[k];
34: for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
35: }
36: for (k=0; k<mbs; k++) {
37: v = aa + bs2*ai[k];
38: xk = t + k*bs; /* Dk*xk = k-th block of x */
39: PetscArraycpy(xk_tmp,xk,bs); /* xk_tmp <- xk */
40: nz = ai[k+1] - ai[k];
41: vj = aj + ai[k];
42: xj = t + (*vj)*bs; /* *vj-th block of x, *vj>k */
43: while (nz--) {
44: /* x(:) += U(k,:)^T*(Dk*xk) */
45: PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
46: vj++; xj = t + (*vj)*bs;
47: v += bs2;
48: }
49: /* xk = inv(Dk)*(Dk*xk) */
50: diag = aa+k*bs2; /* ptr to inv(Dk) */
51: PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
52: }
54: /* solve U*x = y by back substitution */
55: for (k=mbs-1; k>=0; k--) {
56: v = aa + bs2*ai[k];
57: xk = t + k*bs; /* xk */
58: nz = ai[k+1] - ai[k];
59: vj = aj + ai[k];
60: xj = t + (*vj)*bs;
61: while (nz--) {
62: /* xk += U(k,:)*x(:) */
63: PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
64: vj++;
65: v += bs2; xj = t + (*vj)*bs;
66: }
67: idx = bs*r[k];
68: for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
69: }
71: PetscFree(xk_tmp);
72: ISRestoreIndices(isrow,&r);
73: VecRestoreArrayRead(bb,&b);
74: VecRestoreArray(xx,&x);
75: PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);
76: return(0);
77: }
79: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
80: {
82: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"not implemented yet");
83: }
85: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
86: {
88: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not yet implemented");
89: }
91: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
92: {
93: PetscErrorCode ierr;
94: PetscInt nz,k;
95: const PetscInt *vj,bs2 = bs*bs;
96: const MatScalar *v,*diag;
97: PetscScalar *xk,*xj,*xk_tmp;
100: PetscMalloc1(bs,&xk_tmp);
101: for (k=0; k<mbs; k++) {
102: v = aa + bs2*ai[k];
103: xk = x + k*bs; /* Dk*xk = k-th block of x */
104: PetscArraycpy(xk_tmp,xk,bs); /* xk_tmp <- xk */
105: nz = ai[k+1] - ai[k];
106: vj = aj + ai[k];
107: xj = x + (*vj)*bs; /* *vj-th block of x, *vj>k */
108: while (nz--) {
109: /* x(:) += U(k,:)^T*(Dk*xk) */
110: PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
111: vj++; xj = x + (*vj)*bs;
112: v += bs2;
113: }
114: /* xk = inv(Dk)*(Dk*xk) */
115: diag = aa+k*bs2; /* ptr to inv(Dk) */
116: PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
117: }
118: PetscFree(xk_tmp);
119: return(0);
120: }
122: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
123: {
124: PetscInt nz,k;
125: const PetscInt *vj,bs2 = bs*bs;
126: const MatScalar *v;
127: PetscScalar *xk,*xj;
130: for (k=mbs-1; k>=0; k--) {
131: v = aa + bs2*ai[k];
132: xk = x + k*bs; /* xk */
133: nz = ai[k+1] - ai[k];
134: vj = aj + ai[k];
135: xj = x + (*vj)*bs;
136: while (nz--) {
137: /* xk += U(k,:)*x(:) */
138: PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
139: vj++;
140: v += bs2; xj = x + (*vj)*bs;
141: }
142: }
143: return(0);
144: }
146: PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
147: {
148: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
149: PetscErrorCode ierr;
150: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
151: PetscInt bs =A->rmap->bs;
152: const MatScalar *aa=a->a;
153: PetscScalar *x;
154: const PetscScalar *b;
157: VecGetArrayRead(bb,&b);
158: VecGetArray(xx,&x);
160: /* solve U^T * D * y = b by forward substitution */
161: PetscArraycpy(x,b,bs*mbs); /* x <- b */
162: MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
164: /* solve U*x = y by back substitution */
165: MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
167: VecRestoreArrayRead(bb,&b);
168: VecRestoreArray(xx,&x);
169: PetscLogFlops(4.0*a->bs2*a->nz - (bs+2.0*a->bs2)*mbs);
170: return(0);
171: }
173: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
174: {
175: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
176: PetscErrorCode ierr;
177: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
178: PetscInt bs =A->rmap->bs;
179: const MatScalar *aa=a->a;
180: const PetscScalar *b;
181: PetscScalar *x;
184: VecGetArrayRead(bb,&b);
185: VecGetArray(xx,&x);
186: PetscArraycpy(x,b,bs*mbs); /* x <- b */
187: MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
188: VecRestoreArrayRead(bb,&b);
189: VecRestoreArray(xx,&x);
190: PetscLogFlops(2.0*a->bs2*a->nz - bs*mbs);
191: return(0);
192: }
194: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
195: {
196: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
197: PetscErrorCode ierr;
198: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
199: PetscInt bs =A->rmap->bs;
200: const MatScalar *aa=a->a;
201: const PetscScalar *b;
202: PetscScalar *x;
205: VecGetArrayRead(bb,&b);
206: VecGetArray(xx,&x);
207: PetscArraycpy(x,b,bs*mbs);
208: MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
209: VecRestoreArrayRead(bb,&b);
210: VecRestoreArray(xx,&x);
211: PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
212: return(0);
213: }
215: PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
216: {
217: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
218: IS isrow=a->row;
219: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,*r,*vj;
220: PetscErrorCode ierr;
221: PetscInt nz,k,idx;
222: const MatScalar *aa=a->a,*v,*d;
223: PetscScalar *x,x0,x1,x2,x3,x4,x5,x6,*t,*tp;
224: const PetscScalar *b;
227: VecGetArrayRead(bb,&b);
228: VecGetArray(xx,&x);
229: t = a->solve_work;
230: ISGetIndices(isrow,&r);
232: /* solve U^T * D * y = b by forward substitution */
233: tp = t;
234: for (k=0; k<mbs; k++) { /* t <- perm(b) */
235: idx = 7*r[k];
236: tp[0] = b[idx];
237: tp[1] = b[idx+1];
238: tp[2] = b[idx+2];
239: tp[3] = b[idx+3];
240: tp[4] = b[idx+4];
241: tp[5] = b[idx+5];
242: tp[6] = b[idx+6];
243: tp += 7;
244: }
246: for (k=0; k<mbs; k++) {
247: v = aa + 49*ai[k];
248: vj = aj + ai[k];
249: tp = t + k*7;
250: x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
251: nz = ai[k+1] - ai[k];
252: tp = t + (*vj)*7;
253: while (nz--) {
254: tp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
255: tp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
256: tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
257: tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
258: tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
259: tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
260: tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
261: vj++;
262: tp = t + (*vj)*7;
263: v += 49;
264: }
266: /* xk = inv(Dk)*(Dk*xk) */
267: d = aa+k*49; /* ptr to inv(Dk) */
268: tp = t + k*7;
269: tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
270: tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
271: tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
272: tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
273: tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
274: tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
275: tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
276: }
278: /* solve U*x = y by back substitution */
279: for (k=mbs-1; k>=0; k--) {
280: v = aa + 49*ai[k];
281: vj = aj + ai[k];
282: tp = t + k*7;
283: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6]; /* xk */
284: nz = ai[k+1] - ai[k];
286: tp = t + (*vj)*7;
287: while (nz--) {
288: /* xk += U(k,:)*x(:) */
289: x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
290: x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
291: x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
292: x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
293: x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
294: x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
295: x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
296: vj++;
297: tp = t + (*vj)*7;
298: v += 49;
299: }
300: tp = t + k*7;
301: tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
302: idx = 7*r[k];
303: x[idx] = x0;
304: x[idx+1] = x1;
305: x[idx+2] = x2;
306: x[idx+3] = x3;
307: x[idx+4] = x4;
308: x[idx+5] = x5;
309: x[idx+6] = x6;
310: }
312: ISRestoreIndices(isrow,&r);
313: VecRestoreArrayRead(bb,&b);
314: VecRestoreArray(xx,&x);
315: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
316: return(0);
317: }
319: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
320: {
321: const MatScalar *v,*d;
322: PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6;
323: PetscInt nz,k;
324: const PetscInt *vj;
327: for (k=0; k<mbs; k++) {
328: v = aa + 49*ai[k];
329: xp = x + k*7;
330: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
331: nz = ai[k+1] - ai[k];
332: vj = aj + ai[k];
333: xp = x + (*vj)*7;
334: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
335: PetscPrefetchBlock(v+49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
336: while (nz--) {
337: /* x(:) += U(k,:)^T*(Dk*xk) */
338: xp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
339: xp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
340: xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
341: xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
342: xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
343: xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
344: xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
345: vj++;
346: xp = x + (*vj)*7;
347: v += 49;
348: }
349: /* xk = inv(Dk)*(Dk*xk) */
350: d = aa+k*49; /* ptr to inv(Dk) */
351: xp = x + k*7;
352: xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
353: xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
354: xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
355: xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
356: xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
357: xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
358: xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
359: }
360: return(0);
361: }
363: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
364: {
365: const MatScalar *v;
366: PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6;
367: PetscInt nz,k;
368: const PetscInt *vj;
371: for (k=mbs-1; k>=0; k--) {
372: v = aa + 49*ai[k];
373: xp = x + k*7;
374: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
375: nz = ai[k+1] - ai[k];
376: vj = aj + ai[k];
377: xp = x + (*vj)*7;
378: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
379: PetscPrefetchBlock(v-49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
380: while (nz--) {
381: /* xk += U(k,:)*x(:) */
382: x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
383: x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
384: x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
385: x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
386: x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
387: x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
388: x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
389: vj++;
390: v += 49; xp = x + (*vj)*7;
391: }
392: xp = x + k*7;
393: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
394: }
395: return(0);
396: }
398: PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
399: {
400: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
401: PetscErrorCode ierr;
402: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
403: const MatScalar *aa=a->a;
404: PetscScalar *x;
405: const PetscScalar *b;
408: VecGetArrayRead(bb,&b);
409: VecGetArray(xx,&x);
411: /* solve U^T * D * y = b by forward substitution */
412: PetscArraycpy(x,b,7*mbs); /* x <- b */
413: MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
415: /* solve U*x = y by back substitution */
416: MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
418: VecRestoreArrayRead(bb,&b);
419: VecRestoreArray(xx,&x);
420: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
421: return(0);
422: }
424: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
425: {
426: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
427: PetscErrorCode ierr;
428: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
429: const MatScalar *aa=a->a;
430: PetscScalar *x;
431: const PetscScalar *b;
434: VecGetArrayRead(bb,&b);
435: VecGetArray(xx,&x);
436: PetscArraycpy(x,b,7*mbs);
437: MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
438: VecRestoreArrayRead(bb,&b);
439: VecRestoreArray(xx,&x);
440: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
441: return(0);
442: }
444: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
445: {
446: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
447: PetscErrorCode ierr;
448: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
449: const MatScalar *aa=a->a;
450: PetscScalar *x;
451: const PetscScalar *b;
454: VecGetArrayRead(bb,&b);
455: VecGetArray(xx,&x);
456: PetscArraycpy(x,b,7*mbs);
457: MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
458: VecRestoreArrayRead(bb,&b);
459: VecRestoreArray(xx,&x);
460: PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
461: return(0);
462: }
464: PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
465: {
466: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
467: IS isrow=a->row;
468: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,*r,*vj;
469: PetscErrorCode ierr;
470: PetscInt nz,k,idx;
471: const MatScalar *aa=a->a,*v,*d;
472: PetscScalar *x,x0,x1,x2,x3,x4,x5,*t,*tp;
473: const PetscScalar *b;
476: VecGetArrayRead(bb,&b);
477: VecGetArray(xx,&x);
478: t = a->solve_work;
479: ISGetIndices(isrow,&r);
481: /* solve U^T * D * y = b by forward substitution */
482: tp = t;
483: for (k=0; k<mbs; k++) { /* t <- perm(b) */
484: idx = 6*r[k];
485: tp[0] = b[idx];
486: tp[1] = b[idx+1];
487: tp[2] = b[idx+2];
488: tp[3] = b[idx+3];
489: tp[4] = b[idx+4];
490: tp[5] = b[idx+5];
491: tp += 6;
492: }
494: for (k=0; k<mbs; k++) {
495: v = aa + 36*ai[k];
496: vj = aj + ai[k];
497: tp = t + k*6;
498: x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
499: nz = ai[k+1] - ai[k];
500: tp = t + (*vj)*6;
501: while (nz--) {
502: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
503: tp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
504: tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
505: tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
506: tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
507: tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
508: vj++;
509: tp = t + (*vj)*6;
510: v += 36;
511: }
513: /* xk = inv(Dk)*(Dk*xk) */
514: d = aa+k*36; /* ptr to inv(Dk) */
515: tp = t + k*6;
516: tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
517: tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
518: tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
519: tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
520: tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
521: tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
522: }
524: /* solve U*x = y by back substitution */
525: for (k=mbs-1; k>=0; k--) {
526: v = aa + 36*ai[k];
527: vj = aj + ai[k];
528: tp = t + k*6;
529: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */
530: nz = ai[k+1] - ai[k];
532: tp = t + (*vj)*6;
533: while (nz--) {
534: /* xk += U(k,:)*x(:) */
535: x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
536: x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
537: x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
538: x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
539: x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
540: x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
541: vj++;
542: tp = t + (*vj)*6;
543: v += 36;
544: }
545: tp = t + k*6;
546: tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
547: idx = 6*r[k];
548: x[idx] = x0;
549: x[idx+1] = x1;
550: x[idx+2] = x2;
551: x[idx+3] = x3;
552: x[idx+4] = x4;
553: x[idx+5] = x5;
554: }
556: ISRestoreIndices(isrow,&r);
557: VecRestoreArrayRead(bb,&b);
558: VecRestoreArray(xx,&x);
559: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
560: return(0);
561: }
563: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
564: {
565: const MatScalar *v,*d;
566: PetscScalar *xp,x0,x1,x2,x3,x4,x5;
567: PetscInt nz,k;
568: const PetscInt *vj;
571: for (k=0; k<mbs; k++) {
572: v = aa + 36*ai[k];
573: xp = x + k*6;
574: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
575: nz = ai[k+1] - ai[k];
576: vj = aj + ai[k];
577: xp = x + (*vj)*6;
578: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
579: PetscPrefetchBlock(v+36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
580: while (nz--) {
581: /* x(:) += U(k,:)^T*(Dk*xk) */
582: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
583: xp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
584: xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
585: xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
586: xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
587: xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
588: vj++;
589: xp = x + (*vj)*6;
590: v += 36;
591: }
592: /* xk = inv(Dk)*(Dk*xk) */
593: d = aa+k*36; /* ptr to inv(Dk) */
594: xp = x + k*6;
595: xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
596: xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
597: xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
598: xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
599: xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
600: xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
601: }
602: return(0);
603: }
604: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
605: {
606: const MatScalar *v;
607: PetscScalar *xp,x0,x1,x2,x3,x4,x5;
608: PetscInt nz,k;
609: const PetscInt *vj;
612: for (k=mbs-1; k>=0; k--) {
613: v = aa + 36*ai[k];
614: xp = x + k*6;
615: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
616: nz = ai[k+1] - ai[k];
617: vj = aj + ai[k];
618: xp = x + (*vj)*6;
619: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
620: PetscPrefetchBlock(v-36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
621: while (nz--) {
622: /* xk += U(k,:)*x(:) */
623: x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
624: x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
625: x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
626: x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
627: x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
628: x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
629: vj++;
630: v += 36; xp = x + (*vj)*6;
631: }
632: xp = x + k*6;
633: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
634: }
635: return(0);
636: }
639: PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
640: {
641: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
642: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
643: const MatScalar *aa=a->a;
644: PetscScalar *x;
645: const PetscScalar *b;
646: PetscErrorCode ierr;
649: VecGetArrayRead(bb,&b);
650: VecGetArray(xx,&x);
652: /* solve U^T * D * y = b by forward substitution */
653: PetscArraycpy(x,b,6*mbs); /* x <- b */
654: MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
656: /* solve U*x = y by back substitution */
657: MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
659: VecRestoreArrayRead(bb,&b);
660: VecRestoreArray(xx,&x);
661: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
662: return(0);
663: }
665: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
666: {
667: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
668: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
669: const MatScalar *aa=a->a;
670: PetscScalar *x;
671: const PetscScalar *b;
672: PetscErrorCode ierr;
675: VecGetArrayRead(bb,&b);
676: VecGetArray(xx,&x);
677: PetscArraycpy(x,b,6*mbs); /* x <- b */
678: MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
679: VecRestoreArrayRead(bb,&b);
680: VecRestoreArray(xx,&x);
681: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
682: return(0);
683: }
685: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
686: {
687: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
688: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
689: const MatScalar *aa=a->a;
690: PetscScalar *x;
691: const PetscScalar *b;
692: PetscErrorCode ierr;
695: VecGetArrayRead(bb,&b);
696: VecGetArray(xx,&x);
697: PetscArraycpy(x,b,6*mbs); /* x <- b */
698: MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
699: VecRestoreArrayRead(bb,&b);
700: VecRestoreArray(xx,&x);
701: PetscLogFlops(2.0*a->bs2*(a->nz - mbs));
702: return(0);
703: }
705: PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
706: {
707: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
708: IS isrow=a->row;
709: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j;
710: PetscErrorCode ierr;
711: const PetscInt *r,*vj;
712: PetscInt nz,k,idx;
713: const MatScalar *aa=a->a,*v,*diag;
714: PetscScalar *x,x0,x1,x2,x3,x4,*t,*tp;
715: const PetscScalar *b;
718: VecGetArrayRead(bb,&b);
719: VecGetArray(xx,&x);
720: t = a->solve_work;
721: ISGetIndices(isrow,&r);
723: /* solve U^T * D * y = b by forward substitution */
724: tp = t;
725: for (k=0; k<mbs; k++) { /* t <- perm(b) */
726: idx = 5*r[k];
727: tp[0] = b[idx];
728: tp[1] = b[idx+1];
729: tp[2] = b[idx+2];
730: tp[3] = b[idx+3];
731: tp[4] = b[idx+4];
732: tp += 5;
733: }
735: for (k=0; k<mbs; k++) {
736: v = aa + 25*ai[k];
737: vj = aj + ai[k];
738: tp = t + k*5;
739: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
740: nz = ai[k+1] - ai[k];
742: tp = t + (*vj)*5;
743: while (nz--) {
744: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
745: tp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
746: tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
747: tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
748: tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
749: vj++;
750: tp = t + (*vj)*5;
751: v += 25;
752: }
754: /* xk = inv(Dk)*(Dk*xk) */
755: diag = aa+k*25; /* ptr to inv(Dk) */
756: tp = t + k*5;
757: tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
758: tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
759: tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
760: tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
761: tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
762: }
764: /* solve U*x = y by back substitution */
765: for (k=mbs-1; k>=0; k--) {
766: v = aa + 25*ai[k];
767: vj = aj + ai[k];
768: tp = t + k*5;
769: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; /* xk */
770: nz = ai[k+1] - ai[k];
772: tp = t + (*vj)*5;
773: while (nz--) {
774: /* xk += U(k,:)*x(:) */
775: x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
776: x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
777: x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
778: x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
779: x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
780: vj++;
781: tp = t + (*vj)*5;
782: v += 25;
783: }
784: tp = t + k*5;
785: tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
786: idx = 5*r[k];
787: x[idx] = x0;
788: x[idx+1] = x1;
789: x[idx+2] = x2;
790: x[idx+3] = x3;
791: x[idx+4] = x4;
792: }
794: ISRestoreIndices(isrow,&r);
795: VecRestoreArrayRead(bb,&b);
796: VecRestoreArray(xx,&x);
797: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
798: return(0);
799: }
801: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
802: {
803: const MatScalar *v,*diag;
804: PetscScalar *xp,x0,x1,x2,x3,x4;
805: PetscInt nz,k;
806: const PetscInt *vj;
809: for (k=0; k<mbs; k++) {
810: v = aa + 25*ai[k];
811: xp = x + k*5;
812: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
813: nz = ai[k+1] - ai[k];
814: vj = aj + ai[k];
815: xp = x + (*vj)*5;
816: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
817: PetscPrefetchBlock(v+25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
818: while (nz--) {
819: /* x(:) += U(k,:)^T*(Dk*xk) */
820: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
821: xp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
822: xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
823: xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
824: xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
825: vj++;
826: xp = x + (*vj)*5;
827: v += 25;
828: }
829: /* xk = inv(Dk)*(Dk*xk) */
830: diag = aa+k*25; /* ptr to inv(Dk) */
831: xp = x + k*5;
832: xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
833: xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
834: xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
835: xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
836: xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
837: }
838: return(0);
839: }
841: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
842: {
843: const MatScalar *v;
844: PetscScalar *xp,x0,x1,x2,x3,x4;
845: PetscInt nz,k;
846: const PetscInt *vj;
849: for (k=mbs-1; k>=0; k--) {
850: v = aa + 25*ai[k];
851: xp = x + k*5;
852: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; /* xk */
853: nz = ai[k+1] - ai[k];
854: vj = aj + ai[k];
855: xp = x + (*vj)*5;
856: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
857: PetscPrefetchBlock(v-25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
858: while (nz--) {
859: /* xk += U(k,:)*x(:) */
860: x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
861: x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
862: x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
863: x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
864: x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
865: vj++;
866: v += 25; xp = x + (*vj)*5;
867: }
868: xp = x + k*5;
869: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
870: }
871: return(0);
872: }
874: PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
875: {
876: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
877: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
878: const MatScalar *aa=a->a;
879: PetscScalar *x;
880: const PetscScalar *b;
881: PetscErrorCode ierr;
884: VecGetArrayRead(bb,&b);
885: VecGetArray(xx,&x);
887: /* solve U^T * D * y = b by forward substitution */
888: PetscArraycpy(x,b,5*mbs); /* x <- b */
889: MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
891: /* solve U*x = y by back substitution */
892: MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
894: VecRestoreArrayRead(bb,&b);
895: VecRestoreArray(xx,&x);
896: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
897: return(0);
898: }
900: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
901: {
902: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
903: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
904: const MatScalar *aa=a->a;
905: PetscScalar *x;
906: const PetscScalar *b;
907: PetscErrorCode ierr;
910: VecGetArrayRead(bb,&b);
911: VecGetArray(xx,&x);
912: PetscArraycpy(x,b,5*mbs); /* x <- b */
913: MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
914: VecRestoreArrayRead(bb,&b);
915: VecRestoreArray(xx,&x);
916: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
917: return(0);
918: }
920: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
921: {
922: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
923: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
924: const MatScalar *aa=a->a;
925: PetscScalar *x;
926: const PetscScalar *b;
927: PetscErrorCode ierr;
930: VecGetArrayRead(bb,&b);
931: VecGetArray(xx,&x);
932: PetscArraycpy(x,b,5*mbs);
933: MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
934: VecRestoreArrayRead(bb,&b);
935: VecRestoreArray(xx,&x);
936: PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
937: return(0);
938: }
940: PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
941: {
942: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
943: IS isrow=a->row;
944: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j;
945: PetscErrorCode ierr;
946: const PetscInt *r,*vj;
947: PetscInt nz,k,idx;
948: const MatScalar *aa=a->a,*v,*diag;
949: PetscScalar *x,x0,x1,x2,x3,*t,*tp;
950: const PetscScalar *b;
953: VecGetArrayRead(bb,&b);
954: VecGetArray(xx,&x);
955: t = a->solve_work;
956: ISGetIndices(isrow,&r);
958: /* solve U^T * D * y = b by forward substitution */
959: tp = t;
960: for (k=0; k<mbs; k++) { /* t <- perm(b) */
961: idx = 4*r[k];
962: tp[0] = b[idx];
963: tp[1] = b[idx+1];
964: tp[2] = b[idx+2];
965: tp[3] = b[idx+3];
966: tp += 4;
967: }
969: for (k=0; k<mbs; k++) {
970: v = aa + 16*ai[k];
971: vj = aj + ai[k];
972: tp = t + k*4;
973: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
974: nz = ai[k+1] - ai[k];
976: tp = t + (*vj)*4;
977: while (nz--) {
978: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
979: tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
980: tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
981: tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
982: vj++;
983: tp = t + (*vj)*4;
984: v += 16;
985: }
987: /* xk = inv(Dk)*(Dk*xk) */
988: diag = aa+k*16; /* ptr to inv(Dk) */
989: tp = t + k*4;
990: tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
991: tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
992: tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
993: tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
994: }
996: /* solve U*x = y by back substitution */
997: for (k=mbs-1; k>=0; k--) {
998: v = aa + 16*ai[k];
999: vj = aj + ai[k];
1000: tp = t + k*4;
1001: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
1002: nz = ai[k+1] - ai[k];
1004: tp = t + (*vj)*4;
1005: while (nz--) {
1006: /* xk += U(k,:)*x(:) */
1007: x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
1008: x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
1009: x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
1010: x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
1011: vj++;
1012: tp = t + (*vj)*4;
1013: v += 16;
1014: }
1015: tp = t + k*4;
1016: tp[0] =x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
1017: idx = 4*r[k];
1018: x[idx] = x0;
1019: x[idx+1] = x1;
1020: x[idx+2] = x2;
1021: x[idx+3] = x3;
1022: }
1024: ISRestoreIndices(isrow,&r);
1025: VecRestoreArrayRead(bb,&b);
1026: VecRestoreArray(xx,&x);
1027: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1028: return(0);
1029: }
1031: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1032: {
1033: const MatScalar *v,*diag;
1034: PetscScalar *xp,x0,x1,x2,x3;
1035: PetscInt nz,k;
1036: const PetscInt *vj;
1039: for (k=0; k<mbs; k++) {
1040: v = aa + 16*ai[k];
1041: xp = x + k*4;
1042: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
1043: nz = ai[k+1] - ai[k];
1044: vj = aj + ai[k];
1045: xp = x + (*vj)*4;
1046: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1047: PetscPrefetchBlock(v+16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1048: while (nz--) {
1049: /* x(:) += U(k,:)^T*(Dk*xk) */
1050: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1051: xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1052: xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1053: xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1054: vj++;
1055: xp = x + (*vj)*4;
1056: v += 16;
1057: }
1058: /* xk = inv(Dk)*(Dk*xk) */
1059: diag = aa+k*16; /* ptr to inv(Dk) */
1060: xp = x + k*4;
1061: xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1062: xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1063: xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1064: xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1065: }
1066: return(0);
1067: }
1069: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1070: {
1071: const MatScalar *v;
1072: PetscScalar *xp,x0,x1,x2,x3;
1073: PetscInt nz,k;
1074: const PetscInt *vj;
1077: for (k=mbs-1; k>=0; k--) {
1078: v = aa + 16*ai[k];
1079: xp = x + k*4;
1080: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
1081: nz = ai[k+1] - ai[k];
1082: vj = aj + ai[k];
1083: xp = x + (*vj)*4;
1084: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1085: PetscPrefetchBlock(v-16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1086: while (nz--) {
1087: /* xk += U(k,:)*x(:) */
1088: x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
1089: x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
1090: x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
1091: x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
1092: vj++;
1093: v += 16; xp = x + (*vj)*4;
1094: }
1095: xp = x + k*4;
1096: xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
1097: }
1098: return(0);
1099: }
1101: PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1102: {
1103: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1104: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1105: const MatScalar *aa=a->a;
1106: PetscScalar *x;
1107: const PetscScalar *b;
1108: PetscErrorCode ierr;
1111: VecGetArrayRead(bb,&b);
1112: VecGetArray(xx,&x);
1114: /* solve U^T * D * y = b by forward substitution */
1115: PetscArraycpy(x,b,4*mbs); /* x <- b */
1116: MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1118: /* solve U*x = y by back substitution */
1119: MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1120: VecRestoreArrayRead(bb,&b);
1121: VecRestoreArray(xx,&x);
1122: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1123: return(0);
1124: }
1126: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1127: {
1128: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1129: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1130: const MatScalar *aa=a->a;
1131: PetscScalar *x;
1132: const PetscScalar *b;
1133: PetscErrorCode ierr;
1136: VecGetArrayRead(bb,&b);
1137: VecGetArray(xx,&x);
1138: PetscArraycpy(x,b,4*mbs); /* x <- b */
1139: MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1140: VecRestoreArrayRead(bb,&b);
1141: VecRestoreArray(xx,&x);
1142: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
1143: return(0);
1144: }
1146: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1147: {
1148: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1149: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1150: const MatScalar *aa=a->a;
1151: PetscScalar *x;
1152: const PetscScalar *b;
1153: PetscErrorCode ierr;
1156: VecGetArrayRead(bb,&b);
1157: VecGetArray(xx,&x);
1158: PetscArraycpy(x,b,4*mbs);
1159: MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1160: VecRestoreArrayRead(bb,&b);
1161: VecRestoreArray(xx,&x);
1162: PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
1163: return(0);
1164: }
1166: PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1167: {
1168: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1169: IS isrow=a->row;
1170: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j;
1171: PetscErrorCode ierr;
1172: const PetscInt *r;
1173: PetscInt nz,k,idx;
1174: const PetscInt *vj;
1175: const MatScalar *aa=a->a,*v,*diag;
1176: PetscScalar *x,x0,x1,x2,*t,*tp;
1177: const PetscScalar *b;
1180: VecGetArrayRead(bb,&b);
1181: VecGetArray(xx,&x);
1182: t = a->solve_work;
1183: ISGetIndices(isrow,&r);
1185: /* solve U^T * D * y = b by forward substitution */
1186: tp = t;
1187: for (k=0; k<mbs; k++) { /* t <- perm(b) */
1188: idx = 3*r[k];
1189: tp[0] = b[idx];
1190: tp[1] = b[idx+1];
1191: tp[2] = b[idx+2];
1192: tp += 3;
1193: }
1195: for (k=0; k<mbs; k++) {
1196: v = aa + 9*ai[k];
1197: vj = aj + ai[k];
1198: tp = t + k*3;
1199: x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
1200: nz = ai[k+1] - ai[k];
1202: tp = t + (*vj)*3;
1203: while (nz--) {
1204: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1205: tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1206: tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1207: vj++;
1208: tp = t + (*vj)*3;
1209: v += 9;
1210: }
1212: /* xk = inv(Dk)*(Dk*xk) */
1213: diag = aa+k*9; /* ptr to inv(Dk) */
1214: tp = t + k*3;
1215: tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1216: tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1217: tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1218: }
1220: /* solve U*x = y by back substitution */
1221: for (k=mbs-1; k>=0; k--) {
1222: v = aa + 9*ai[k];
1223: vj = aj + ai[k];
1224: tp = t + k*3;
1225: x0 = tp[0]; x1 = tp[1]; x2 = tp[2]; /* xk */
1226: nz = ai[k+1] - ai[k];
1228: tp = t + (*vj)*3;
1229: while (nz--) {
1230: /* xk += U(k,:)*x(:) */
1231: x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
1232: x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
1233: x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
1234: vj++;
1235: tp = t + (*vj)*3;
1236: v += 9;
1237: }
1238: tp = t + k*3;
1239: tp[0] = x0; tp[1] = x1; tp[2] = x2;
1240: idx = 3*r[k];
1241: x[idx] = x0;
1242: x[idx+1] = x1;
1243: x[idx+2] = x2;
1244: }
1246: ISRestoreIndices(isrow,&r);
1247: VecRestoreArrayRead(bb,&b);
1248: VecRestoreArray(xx,&x);
1249: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1250: return(0);
1251: }
1253: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1254: {
1255: const MatScalar *v,*diag;
1256: PetscScalar *xp,x0,x1,x2;
1257: PetscInt nz,k;
1258: const PetscInt *vj;
1261: for (k=0; k<mbs; k++) {
1262: v = aa + 9*ai[k];
1263: xp = x + k*3;
1264: x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1265: nz = ai[k+1] - ai[k];
1266: vj = aj + ai[k];
1267: xp = x + (*vj)*3;
1268: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1269: PetscPrefetchBlock(v+9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1270: while (nz--) {
1271: /* x(:) += U(k,:)^T*(Dk*xk) */
1272: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1273: xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1274: xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1275: vj++;
1276: xp = x + (*vj)*3;
1277: v += 9;
1278: }
1279: /* xk = inv(Dk)*(Dk*xk) */
1280: diag = aa+k*9; /* ptr to inv(Dk) */
1281: xp = x + k*3;
1282: xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1283: xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1284: xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1285: }
1286: return(0);
1287: }
1289: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1290: {
1291: const MatScalar *v;
1292: PetscScalar *xp,x0,x1,x2;
1293: PetscInt nz,k;
1294: const PetscInt *vj;
1297: for (k=mbs-1; k>=0; k--) {
1298: v = aa + 9*ai[k];
1299: xp = x + k*3;
1300: x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* xk */
1301: nz = ai[k+1] - ai[k];
1302: vj = aj + ai[k];
1303: xp = x + (*vj)*3;
1304: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1305: PetscPrefetchBlock(v-9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1306: while (nz--) {
1307: /* xk += U(k,:)*x(:) */
1308: x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1309: x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1310: x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1311: vj++;
1312: v += 9; xp = x + (*vj)*3;
1313: }
1314: xp = x + k*3;
1315: xp[0] = x0; xp[1] = x1; xp[2] = x2;
1316: }
1317: return(0);
1318: }
1320: PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1321: {
1322: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1323: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1324: const MatScalar *aa=a->a;
1325: PetscScalar *x;
1326: const PetscScalar *b;
1327: PetscErrorCode ierr;
1330: VecGetArrayRead(bb,&b);
1331: VecGetArray(xx,&x);
1333: /* solve U^T * D * y = b by forward substitution */
1334: PetscArraycpy(x,b,3*mbs);
1335: MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1337: /* solve U*x = y by back substitution */
1338: MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1340: VecRestoreArrayRead(bb,&b);
1341: VecRestoreArray(xx,&x);
1342: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1343: return(0);
1344: }
1346: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1347: {
1348: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1349: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1350: const MatScalar *aa=a->a;
1351: PetscScalar *x;
1352: const PetscScalar *b;
1353: PetscErrorCode ierr;
1356: VecGetArrayRead(bb,&b);
1357: VecGetArray(xx,&x);
1358: PetscArraycpy(x,b,3*mbs);
1359: MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1360: VecRestoreArrayRead(bb,&b);
1361: VecRestoreArray(xx,&x);
1362: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
1363: return(0);
1364: }
1366: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1367: {
1368: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1369: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1370: const MatScalar *aa=a->a;
1371: PetscScalar *x;
1372: const PetscScalar *b;
1373: PetscErrorCode ierr;
1376: VecGetArrayRead(bb,&b);
1377: VecGetArray(xx,&x);
1378: PetscArraycpy(x,b,3*mbs);
1379: MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1380: VecRestoreArrayRead(bb,&b);
1381: VecRestoreArray(xx,&x);
1382: PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
1383: return(0);
1384: }
1386: PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1387: {
1388: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1389: IS isrow=a->row;
1390: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j;
1391: PetscErrorCode ierr;
1392: const PetscInt *r,*vj;
1393: PetscInt nz,k,k2,idx;
1394: const MatScalar *aa=a->a,*v,*diag;
1395: PetscScalar *x,x0,x1,*t;
1396: const PetscScalar *b;
1399: VecGetArrayRead(bb,&b);
1400: VecGetArray(xx,&x);
1401: t = a->solve_work;
1402: ISGetIndices(isrow,&r);
1404: /* solve U^T * D * y = perm(b) by forward substitution */
1405: for (k=0; k<mbs; k++) { /* t <- perm(b) */
1406: idx = 2*r[k];
1407: t[k*2] = b[idx];
1408: t[k*2+1] = b[idx+1];
1409: }
1410: for (k=0; k<mbs; k++) {
1411: v = aa + 4*ai[k];
1412: vj = aj + ai[k];
1413: k2 = k*2;
1414: x0 = t[k2]; x1 = t[k2+1];
1415: nz = ai[k+1] - ai[k];
1416: while (nz--) {
1417: t[(*vj)*2] += v[0]*x0 + v[1]*x1;
1418: t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1419: vj++; v += 4;
1420: }
1421: diag = aa+k*4; /* ptr to inv(Dk) */
1422: t[k2] = diag[0]*x0 + diag[2]*x1;
1423: t[k2+1] = diag[1]*x0 + diag[3]*x1;
1424: }
1426: /* solve U*x = y by back substitution */
1427: for (k=mbs-1; k>=0; k--) {
1428: v = aa + 4*ai[k];
1429: vj = aj + ai[k];
1430: k2 = k*2;
1431: x0 = t[k2]; x1 = t[k2+1];
1432: nz = ai[k+1] - ai[k];
1433: while (nz--) {
1434: x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1435: x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1436: vj++;
1437: v += 4;
1438: }
1439: t[k2] = x0;
1440: t[k2+1] = x1;
1441: idx = 2*r[k];
1442: x[idx] = x0;
1443: x[idx+1] = x1;
1444: }
1446: ISRestoreIndices(isrow,&r);
1447: VecRestoreArrayRead(bb,&b);
1448: VecRestoreArray(xx,&x);
1449: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1450: return(0);
1451: }
1453: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1454: {
1455: const MatScalar *v,*diag;
1456: PetscScalar x0,x1;
1457: PetscInt nz,k,k2;
1458: const PetscInt *vj;
1461: for (k=0; k<mbs; k++) {
1462: v = aa + 4*ai[k];
1463: vj = aj + ai[k];
1464: k2 = k*2;
1465: x0 = x[k2]; x1 = x[k2+1]; /* Dk*xk = k-th block of x */
1466: nz = ai[k+1] - ai[k];
1467: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1468: PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1469: while (nz--) {
1470: /* x(:) += U(k,:)^T*(Dk*xk) */
1471: x[(*vj)*2] += v[0]*x0 + v[1]*x1;
1472: x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1473: vj++; v += 4;
1474: }
1475: /* xk = inv(Dk)*(Dk*xk) */
1476: diag = aa+k*4; /* ptr to inv(Dk) */
1477: x[k2] = diag[0]*x0 + diag[2]*x1;
1478: x[k2+1] = diag[1]*x0 + diag[3]*x1;
1479: }
1480: return(0);
1481: }
1483: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1484: {
1485: const MatScalar *v;
1486: PetscScalar x0,x1;
1487: PetscInt nz,k,k2;
1488: const PetscInt *vj;
1491: for (k=mbs-1; k>=0; k--) {
1492: v = aa + 4*ai[k];
1493: vj = aj + ai[k];
1494: k2 = k*2;
1495: x0 = x[k2]; x1 = x[k2+1]; /* xk */
1496: nz = ai[k+1] - ai[k];
1497: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1498: PetscPrefetchBlock(v-4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1499: while (nz--) {
1500: /* xk += U(k,:)*x(:) */
1501: x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1502: x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1503: vj++;
1504: v += 4;
1505: }
1506: x[k2] = x0;
1507: x[k2+1] = x1;
1508: }
1509: return(0);
1510: }
1512: PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1513: {
1514: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1515: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1516: const MatScalar *aa=a->a;
1517: PetscScalar *x;
1518: const PetscScalar *b;
1519: PetscErrorCode ierr;
1522: VecGetArrayRead(bb,&b);
1523: VecGetArray(xx,&x);
1525: /* solve U^T * D * y = b by forward substitution */
1526: PetscArraycpy(x,b,2*mbs);
1527: MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1529: /* solve U*x = y by back substitution */
1530: MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1532: VecRestoreArrayRead(bb,&b);
1533: VecRestoreArray(xx,&x);
1534: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1535: return(0);
1536: }
1538: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1539: {
1540: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1541: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1542: const MatScalar *aa=a->a;
1543: PetscScalar *x;
1544: const PetscScalar *b;
1545: PetscErrorCode ierr;
1548: VecGetArrayRead(bb,&b);
1549: VecGetArray(xx,&x);
1550: PetscArraycpy(x,b,2*mbs);
1551: MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1552: VecRestoreArrayRead(bb,&b);
1553: VecRestoreArray(xx,&x);
1554: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
1555: return(0);
1556: }
1558: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1559: {
1560: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1561: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1562: const MatScalar *aa=a->a;
1563: PetscScalar *x;
1564: const PetscScalar *b;
1565: PetscErrorCode ierr;
1568: VecGetArrayRead(bb,&b);
1569: VecGetArray(xx,&x);
1570: PetscArraycpy(x,b,2*mbs);
1571: MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1572: VecRestoreArrayRead(bb,&b);
1573: VecRestoreArray(xx,&x);
1574: PetscLogFlops(2.0*a->bs2*(a->nz - mbs));
1575: return(0);
1576: }
1578: PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1579: {
1580: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1581: IS isrow=a->row;
1582: PetscErrorCode ierr;
1583: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1584: const MatScalar *aa=a->a,*v;
1585: const PetscScalar *b;
1586: PetscScalar *x,xk,*t;
1587: PetscInt nz,k,j;
1590: VecGetArrayRead(bb,&b);
1591: VecGetArray(xx,&x);
1592: t = a->solve_work;
1593: ISGetIndices(isrow,&rp);
1595: /* solve U^T*D*y = perm(b) by forward substitution */
1596: for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1597: for (k=0; k<mbs; k++) {
1598: v = aa + ai[k];
1599: vj = aj + ai[k];
1600: xk = t[k];
1601: nz = ai[k+1] - ai[k] - 1;
1602: for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk;
1603: t[k] = xk*v[nz]; /* v[nz] = 1/D(k) */
1604: }
1606: /* solve U*perm(x) = y by back substitution */
1607: for (k=mbs-1; k>=0; k--) {
1608: v = aa + adiag[k] - 1;
1609: vj = aj + adiag[k] - 1;
1610: nz = ai[k+1] - ai[k] - 1;
1611: for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]];
1612: x[rp[k]] = t[k];
1613: }
1615: ISRestoreIndices(isrow,&rp);
1616: VecRestoreArrayRead(bb,&b);
1617: VecRestoreArray(xx,&x);
1618: PetscLogFlops(4.0*a->nz - 3.0*mbs);
1619: return(0);
1620: }
1622: PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1623: {
1624: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1625: IS isrow=a->row;
1626: PetscErrorCode ierr;
1627: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1628: const MatScalar *aa=a->a,*v;
1629: PetscScalar *x,xk,*t;
1630: const PetscScalar *b;
1631: PetscInt nz,k;
1634: VecGetArrayRead(bb,&b);
1635: VecGetArray(xx,&x);
1636: t = a->solve_work;
1637: ISGetIndices(isrow,&rp);
1639: /* solve U^T*D*y = perm(b) by forward substitution */
1640: for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1641: for (k=0; k<mbs; k++) {
1642: v = aa + ai[k] + 1;
1643: vj = aj + ai[k] + 1;
1644: xk = t[k];
1645: nz = ai[k+1] - ai[k] - 1;
1646: while (nz--) t[*vj++] += (*v++) * xk;
1647: t[k] = xk*aa[ai[k]]; /* aa[k] = 1/D(k) */
1648: }
1650: /* solve U*perm(x) = y by back substitution */
1651: for (k=mbs-1; k>=0; k--) {
1652: v = aa + ai[k] + 1;
1653: vj = aj + ai[k] + 1;
1654: nz = ai[k+1] - ai[k] - 1;
1655: while (nz--) t[k] += (*v++) * t[*vj++];
1656: x[rp[k]] = t[k];
1657: }
1659: ISRestoreIndices(isrow,&rp);
1660: VecRestoreArrayRead(bb,&b);
1661: VecRestoreArray(xx,&x);
1662: PetscLogFlops(4.0*a->nz - 3*mbs);
1663: return(0);
1664: }
1666: PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1667: {
1668: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1669: IS isrow=a->row;
1670: PetscErrorCode ierr;
1671: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1672: const MatScalar *aa=a->a,*v;
1673: PetscReal diagk;
1674: PetscScalar *x,xk;
1675: const PetscScalar *b;
1676: PetscInt nz,k;
1679: /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1680: VecGetArrayRead(bb,&b);
1681: VecGetArray(xx,&x);
1682: ISGetIndices(isrow,&rp);
1684: for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1685: for (k=0; k<mbs; k++) {
1686: v = aa + ai[k];
1687: vj = aj + ai[k];
1688: xk = x[k];
1689: nz = ai[k+1] - ai[k] - 1;
1690: while (nz--) x[*vj++] += (*v++) * xk;
1692: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1693: if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1694: x[k] = xk*PetscSqrtReal(diagk);
1695: }
1696: ISRestoreIndices(isrow,&rp);
1697: VecRestoreArrayRead(bb,&b);
1698: VecRestoreArray(xx,&x);
1699: PetscLogFlops(2.0*a->nz - mbs);
1700: return(0);
1701: }
1703: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1704: {
1705: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1706: IS isrow=a->row;
1707: PetscErrorCode ierr;
1708: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1709: const MatScalar *aa=a->a,*v;
1710: PetscReal diagk;
1711: PetscScalar *x,xk;
1712: const PetscScalar *b;
1713: PetscInt nz,k;
1716: /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1717: VecGetArrayRead(bb,&b);
1718: VecGetArray(xx,&x);
1719: ISGetIndices(isrow,&rp);
1721: for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1722: for (k=0; k<mbs; k++) {
1723: v = aa + ai[k] + 1;
1724: vj = aj + ai[k] + 1;
1725: xk = x[k];
1726: nz = ai[k+1] - ai[k] - 1;
1727: while (nz--) x[*vj++] += (*v++) * xk;
1729: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1730: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1731: x[k] = xk*PetscSqrtReal(diagk);
1732: }
1733: ISRestoreIndices(isrow,&rp);
1734: VecRestoreArrayRead(bb,&b);
1735: VecRestoreArray(xx,&x);
1736: PetscLogFlops(2.0*a->nz - mbs);
1737: return(0);
1738: }
1740: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1741: {
1742: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1743: IS isrow=a->row;
1744: PetscErrorCode ierr;
1745: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1746: const MatScalar *aa=a->a,*v;
1747: PetscReal diagk;
1748: PetscScalar *x,*t;
1749: const PetscScalar *b;
1750: PetscInt nz,k;
1753: /* solve D^(1/2)*U*perm(x) = b by back substitution */
1754: VecGetArrayRead(bb,&b);
1755: VecGetArray(xx,&x);
1756: t = a->solve_work;
1757: ISGetIndices(isrow,&rp);
1759: for (k=mbs-1; k>=0; k--) {
1760: v = aa + ai[k];
1761: vj = aj + ai[k];
1762: diagk = PetscRealPart(aa[adiag[k]]);
1763: if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1764: t[k] = b[k] * PetscSqrtReal(diagk);
1765: nz = ai[k+1] - ai[k] - 1;
1766: while (nz--) t[k] += (*v++) * t[*vj++];
1767: x[rp[k]] = t[k];
1768: }
1769: ISRestoreIndices(isrow,&rp);
1770: VecRestoreArrayRead(bb,&b);
1771: VecRestoreArray(xx,&x);
1772: PetscLogFlops(2.0*a->nz - mbs);
1773: return(0);
1774: }
1776: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1777: {
1778: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1779: IS isrow=a->row;
1780: PetscErrorCode ierr;
1781: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1782: const MatScalar *aa=a->a,*v;
1783: PetscReal diagk;
1784: PetscScalar *x,*t;
1785: const PetscScalar *b;
1786: PetscInt nz,k;
1789: /* solve D^(1/2)*U*perm(x) = b by back substitution */
1790: VecGetArrayRead(bb,&b);
1791: VecGetArray(xx,&x);
1792: t = a->solve_work;
1793: ISGetIndices(isrow,&rp);
1795: for (k=mbs-1; k>=0; k--) {
1796: v = aa + ai[k] + 1;
1797: vj = aj + ai[k] + 1;
1798: diagk = PetscRealPart(aa[ai[k]]);
1799: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1800: t[k] = b[k] * PetscSqrtReal(diagk);
1801: nz = ai[k+1] - ai[k] - 1;
1802: while (nz--) t[k] += (*v++) * t[*vj++];
1803: x[rp[k]] = t[k];
1804: }
1805: ISRestoreIndices(isrow,&rp);
1806: VecRestoreArrayRead(bb,&b);
1807: VecRestoreArray(xx,&x);
1808: PetscLogFlops(2.0*a->nz - mbs);
1809: return(0);
1810: }
1812: PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1813: {
1814: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1818: if (A->rmap->bs == 1) {
1819: MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);
1820: } else {
1821: IS isrow=a->row;
1822: const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1823: const MatScalar *aa=a->a,*v;
1824: PetscScalar *x,*t;
1825: const PetscScalar *b;
1826: PetscInt nz,k,n,i,j;
1828: if (bb->n > a->solves_work_n) {
1829: PetscFree(a->solves_work);
1830: PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);
1831: a->solves_work_n = bb->n;
1832: }
1833: n = bb->n;
1834: VecGetArrayRead(bb->v,&b);
1835: VecGetArray(xx->v,&x);
1836: t = a->solves_work;
1838: ISGetIndices(isrow,&rp);
1840: /* solve U^T*D*y = perm(b) by forward substitution */
1841: for (k=0; k<mbs; k++) {
1842: for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */
1843: }
1844: for (k=0; k<mbs; k++) {
1845: v = aa + ai[k];
1846: vj = aj + ai[k];
1847: nz = ai[k+1] - ai[k] - 1;
1848: for (j=0; j<nz; j++) {
1849: for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1850: v++;vj++;
1851: }
1852: for (i=0; i<n; i++) t[n*k+i] *= aa[nz]; /* note: aa[nz] = 1/D(k) */
1853: }
1855: /* solve U*perm(x) = y by back substitution */
1856: for (k=mbs-1; k>=0; k--) {
1857: v = aa + ai[k] - 1;
1858: vj = aj + ai[k] - 1;
1859: nz = ai[k+1] - ai[k] - 1;
1860: for (j=0; j<nz; j++) {
1861: for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1862: v++;vj++;
1863: }
1864: for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1865: }
1867: ISRestoreIndices(isrow,&rp);
1868: VecRestoreArrayRead(bb->v,&b);
1869: VecRestoreArray(xx->v,&x);
1870: PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1871: }
1872: return(0);
1873: }
1875: PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx)
1876: {
1877: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1881: if (A->rmap->bs == 1) {
1882: MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);
1883: } else {
1884: IS isrow=a->row;
1885: const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1886: const MatScalar *aa=a->a,*v;
1887: PetscScalar *x,*t;
1888: const PetscScalar *b;
1889: PetscInt nz,k,n,i;
1891: if (bb->n > a->solves_work_n) {
1892: PetscFree(a->solves_work);
1893: PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);
1894: a->solves_work_n = bb->n;
1895: }
1896: n = bb->n;
1897: VecGetArrayRead(bb->v,&b);
1898: VecGetArray(xx->v,&x);
1899: t = a->solves_work;
1901: ISGetIndices(isrow,&rp);
1903: /* solve U^T*D*y = perm(b) by forward substitution */
1904: for (k=0; k<mbs; k++) {
1905: for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */
1906: }
1907: for (k=0; k<mbs; k++) {
1908: v = aa + ai[k];
1909: vj = aj + ai[k];
1910: nz = ai[k+1] - ai[k];
1911: while (nz--) {
1912: for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1913: v++;vj++;
1914: }
1915: for (i=0; i<n; i++) t[n*k+i] *= aa[k]; /* note: aa[k] = 1/D(k) */
1916: }
1918: /* solve U*perm(x) = y by back substitution */
1919: for (k=mbs-1; k>=0; k--) {
1920: v = aa + ai[k];
1921: vj = aj + ai[k];
1922: nz = ai[k+1] - ai[k];
1923: while (nz--) {
1924: for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1925: v++;vj++;
1926: }
1927: for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1928: }
1930: ISRestoreIndices(isrow,&rp);
1931: VecRestoreArrayRead(bb->v,&b);
1932: VecRestoreArray(xx->v,&x);
1933: PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1934: }
1935: return(0);
1936: }
1938: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1939: {
1940: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1941: PetscErrorCode ierr;
1942: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag;
1943: const MatScalar *aa=a->a,*v;
1944: const PetscScalar *b;
1945: PetscScalar *x,xi;
1946: PetscInt nz,i,j;
1949: VecGetArrayRead(bb,&b);
1950: VecGetArray(xx,&x);
1951: /* solve U^T*D*y = b by forward substitution */
1952: PetscArraycpy(x,b,mbs);
1953: for (i=0; i<mbs; i++) {
1954: v = aa + ai[i];
1955: vj = aj + ai[i];
1956: xi = x[i];
1957: nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
1958: for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi;
1959: x[i] = xi*v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
1960: }
1961: /* solve U*x = y by backward substitution */
1962: for (i=mbs-2; i>=0; i--) {
1963: xi = x[i];
1964: v = aa + adiag[i] - 1; /* end of row i, excluding diag */
1965: vj = aj + adiag[i] - 1;
1966: nz = ai[i+1] - ai[i] - 1;
1967: for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
1968: x[i] = xi;
1969: }
1970: VecRestoreArrayRead(bb,&b);
1971: VecRestoreArray(xx,&x);
1972: PetscLogFlops(4.0*a->nz - 3*mbs);
1973: return(0);
1974: }
1976: PetscErrorCode MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat B,Mat X)
1977: {
1978: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1979: PetscErrorCode ierr;
1980: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag;
1981: const MatScalar *aa=a->a,*v;
1982: const PetscScalar *b;
1983: PetscScalar *x,xi;
1984: PetscInt nz,i,j,neq,ldb,ldx;
1985: PetscBool isdense;
1988: if (!mbs) return(0);
1989: PetscObjectTypeCompare((PetscObject)B,MATSEQDENSE,&isdense);
1990: if (!isdense) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"B matrix must be a SeqDense matrix");
1991: if (X != B) {
1992: PetscObjectTypeCompare((PetscObject)X,MATSEQDENSE,&isdense);
1993: if (!isdense) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"X matrix must be a SeqDense matrix");
1994: }
1995: MatDenseGetArrayRead(B,&b);
1996: MatDenseGetLDA(B,&ldb);
1997: MatDenseGetArray(X,&x);
1998: MatDenseGetLDA(X,&ldx);
1999: for (neq=0; neq<B->cmap->n; neq++) {
2000: /* solve U^T*D*y = b by forward substitution */
2001: PetscArraycpy(x,b,mbs);
2002: for (i=0; i<mbs; i++) {
2003: v = aa + ai[i];
2004: vj = aj + ai[i];
2005: xi = x[i];
2006: nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
2007: for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi;
2008: x[i] = xi*v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
2009: }
2010: /* solve U*x = y by backward substitution */
2011: for (i=mbs-2; i>=0; i--) {
2012: xi = x[i];
2013: v = aa + adiag[i] - 1; /* end of row i, excluding diag */
2014: vj = aj + adiag[i] - 1;
2015: nz = ai[i+1] - ai[i] - 1;
2016: for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
2017: x[i] = xi;
2018: }
2019: b += ldb;
2020: x += ldx;
2021: }
2022: MatDenseRestoreArrayRead(B,&b);
2023: MatDenseRestoreArray(X,&x);
2024: PetscLogFlops(B->cmap->n*(4.0*a->nz - 3*mbs));
2025: return(0);
2026: }
2028: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2029: {
2030: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2031: PetscErrorCode ierr;
2032: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2033: const MatScalar *aa=a->a,*v;
2034: PetscScalar *x,xk;
2035: const PetscScalar *b;
2036: PetscInt nz,k;
2039: VecGetArrayRead(bb,&b);
2040: VecGetArray(xx,&x);
2042: /* solve U^T*D*y = b by forward substitution */
2043: PetscArraycpy(x,b,mbs);
2044: for (k=0; k<mbs; k++) {
2045: v = aa + ai[k] + 1;
2046: vj = aj + ai[k] + 1;
2047: xk = x[k];
2048: nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */
2049: while (nz--) x[*vj++] += (*v++) * xk;
2050: x[k] = xk*aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */
2051: }
2053: /* solve U*x = y by back substitution */
2054: for (k=mbs-2; k>=0; k--) {
2055: v = aa + ai[k] + 1;
2056: vj = aj + ai[k] + 1;
2057: xk = x[k];
2058: nz = ai[k+1] - ai[k] - 1;
2059: while (nz--) {
2060: xk += (*v++) * x[*vj++];
2061: }
2062: x[k] = xk;
2063: }
2065: VecRestoreArrayRead(bb,&b);
2066: VecRestoreArray(xx,&x);
2067: PetscLogFlops(4.0*a->nz - 3*mbs);
2068: return(0);
2069: }
2071: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2072: {
2073: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2074: PetscErrorCode ierr;
2075: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj;
2076: const MatScalar *aa=a->a,*v;
2077: PetscReal diagk;
2078: PetscScalar *x;
2079: const PetscScalar *b;
2080: PetscInt nz,k;
2083: /* solve U^T*D^(1/2)*x = b by forward substitution */
2084: VecGetArrayRead(bb,&b);
2085: VecGetArray(xx,&x);
2086: PetscArraycpy(x,b,mbs);
2087: for (k=0; k<mbs; k++) {
2088: v = aa + ai[k];
2089: vj = aj + ai[k];
2090: nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */
2091: while (nz--) x[*vj++] += (*v++) * x[k];
2092: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2093: if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[adiag[k]]),PetscImaginaryPart(aa[adiag[k]]));
2094: x[k] *= PetscSqrtReal(diagk);
2095: }
2096: VecRestoreArrayRead(bb,&b);
2097: VecRestoreArray(xx,&x);
2098: PetscLogFlops(2.0*a->nz - mbs);
2099: return(0);
2100: }
2102: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2103: {
2104: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2105: PetscErrorCode ierr;
2106: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2107: const MatScalar *aa=a->a,*v;
2108: PetscReal diagk;
2109: PetscScalar *x;
2110: const PetscScalar *b;
2111: PetscInt nz,k;
2114: /* solve U^T*D^(1/2)*x = b by forward substitution */
2115: VecGetArrayRead(bb,&b);
2116: VecGetArray(xx,&x);
2117: PetscArraycpy(x,b,mbs);
2118: for (k=0; k<mbs; k++) {
2119: v = aa + ai[k] + 1;
2120: vj = aj + ai[k] + 1;
2121: nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */
2122: while (nz--) x[*vj++] += (*v++) * x[k];
2123: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2124: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]]));
2125: x[k] *= PetscSqrtReal(diagk);
2126: }
2127: VecRestoreArrayRead(bb,&b);
2128: VecRestoreArray(xx,&x);
2129: PetscLogFlops(2.0*a->nz - mbs);
2130: return(0);
2131: }
2133: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2134: {
2135: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2136: PetscErrorCode ierr;
2137: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj;
2138: const MatScalar *aa=a->a,*v;
2139: PetscReal diagk;
2140: PetscScalar *x;
2141: const PetscScalar *b;
2142: PetscInt nz,k;
2145: /* solve D^(1/2)*U*x = b by back substitution */
2146: VecGetArrayRead(bb,&b);
2147: VecGetArray(xx,&x);
2149: for (k=mbs-1; k>=0; k--) {
2150: v = aa + ai[k];
2151: vj = aj + ai[k];
2152: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2153: if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2154: x[k] = PetscSqrtReal(diagk)*b[k];
2155: nz = ai[k+1] - ai[k] - 1;
2156: while (nz--) x[k] += (*v++) * x[*vj++];
2157: }
2158: VecRestoreArrayRead(bb,&b);
2159: VecRestoreArray(xx,&x);
2160: PetscLogFlops(2.0*a->nz - mbs);
2161: return(0);
2162: }
2164: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2165: {
2166: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2167: PetscErrorCode ierr;
2168: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2169: const MatScalar *aa=a->a,*v;
2170: PetscReal diagk;
2171: PetscScalar *x;
2172: const PetscScalar *b;
2173: PetscInt nz,k;
2176: /* solve D^(1/2)*U*x = b by back substitution */
2177: VecGetArrayRead(bb,&b);
2178: VecGetArray(xx,&x);
2180: for (k=mbs-1; k>=0; k--) {
2181: v = aa + ai[k] + 1;
2182: vj = aj + ai[k] + 1;
2183: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2184: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2185: x[k] = PetscSqrtReal(diagk)*b[k];
2186: nz = ai[k+1] - ai[k] - 1;
2187: while (nz--) x[k] += (*v++) * x[*vj++];
2188: }
2189: VecRestoreArrayRead(bb,&b);
2190: VecRestoreArray(xx,&x);
2191: PetscLogFlops(2.0*a->nz - mbs);
2192: return(0);
2193: }
2195: /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2196: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info)
2197: {
2198: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
2200: const PetscInt *rip,mbs = a->mbs,*ai,*aj;
2201: PetscInt *jutmp,bs = A->rmap->bs,i;
2202: PetscInt m,reallocs = 0,*levtmp;
2203: PetscInt *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
2204: PetscInt incrlev,*lev,shift,prow,nz;
2205: PetscReal f = info->fill,levels = info->levels;
2206: PetscBool perm_identity;
2209: /* check whether perm is the identity mapping */
2210: ISIdentity(perm,&perm_identity);
2212: if (perm_identity) {
2213: a->permute = PETSC_FALSE;
2214: ai = a->i; aj = a->j;
2215: } else { /* non-trivial permutation */
2216: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2217: }
2219: /* initialization */
2220: ISGetIndices(perm,&rip);
2221: umax = (PetscInt)(f*ai[mbs] + 1);
2222: PetscMalloc1(umax,&lev);
2223: umax += mbs + 1;
2224: shift = mbs + 1;
2225: PetscMalloc1(mbs+1,&iu);
2226: PetscMalloc1(umax,&ju);
2227: iu[0] = mbs + 1;
2228: juidx = mbs + 1;
2229: /* prowl: linked list for pivot row */
2230: PetscMalloc3(mbs,&prowl,mbs,&q,mbs,&levtmp);
2231: /* q: linked list for col index */
2233: for (i=0; i<mbs; i++) {
2234: prowl[i] = mbs;
2235: q[i] = 0;
2236: }
2238: /* for each row k */
2239: for (k=0; k<mbs; k++) {
2240: nzk = 0;
2241: q[k] = mbs;
2242: /* copy current row into linked list */
2243: nz = ai[rip[k]+1] - ai[rip[k]];
2244: j = ai[rip[k]];
2245: while (nz--) {
2246: vj = rip[aj[j++]];
2247: if (vj > k) {
2248: qm = k;
2249: do {
2250: m = qm; qm = q[m];
2251: } while (qm < vj);
2252: if (qm == vj) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n");
2253: nzk++;
2254: q[m] = vj;
2255: q[vj] = qm;
2256: levtmp[vj] = 0; /* initialize lev for nonzero element */
2257: }
2258: }
2260: /* modify nonzero structure of k-th row by computing fill-in
2261: for each row prow to be merged in */
2262: prow = k;
2263: prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
2265: while (prow < k) {
2266: /* merge row prow into k-th row */
2267: jmin = iu[prow] + 1;
2268: jmax = iu[prow+1];
2269: qm = k;
2270: for (j=jmin; j<jmax; j++) {
2271: incrlev = lev[j-shift] + 1;
2272: if (incrlev > levels) continue;
2273: vj = ju[j];
2274: do {
2275: m = qm; qm = q[m];
2276: } while (qm < vj);
2277: if (qm != vj) { /* a fill */
2278: nzk++; q[m] = vj; q[vj] = qm; qm = vj;
2279: levtmp[vj] = incrlev;
2280: } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2281: }
2282: prow = prowl[prow]; /* next pivot row */
2283: }
2285: /* add k to row list for first nonzero element in k-th row */
2286: if (nzk > 1) {
2287: i = q[k]; /* col value of first nonzero element in k_th row of U */
2288: prowl[k] = prowl[i]; prowl[i] = k;
2289: }
2290: iu[k+1] = iu[k] + nzk;
2292: /* allocate more space to ju and lev if needed */
2293: if (iu[k+1] > umax) {
2294: /* estimate how much additional space we will need */
2295: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2296: /* just double the memory each time */
2297: maxadd = umax;
2298: if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
2299: umax += maxadd;
2301: /* allocate a longer ju */
2302: PetscMalloc1(umax,&jutmp);
2303: PetscArraycpy(jutmp,ju,iu[k]);
2304: PetscFree(ju);
2305: ju = jutmp;
2307: PetscMalloc1(umax,&jutmp);
2308: PetscArraycpy(jutmp,lev,iu[k]-shift);
2309: PetscFree(lev);
2310: lev = jutmp;
2311: reallocs += 2; /* count how many times we realloc */
2312: }
2314: /* save nonzero structure of k-th row in ju */
2315: i=k;
2316: while (nzk--) {
2317: i = q[i];
2318: ju[juidx] = i;
2319: lev[juidx-shift] = levtmp[i];
2320: juidx++;
2321: }
2322: }
2324: #if defined(PETSC_USE_INFO)
2325: if (ai[mbs] != 0) {
2326: PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2327: PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)f,(double)af);
2328: PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2329: PetscInfo1(A,"PCFactorSetFill(pc,%g);\n",(double)af);
2330: PetscInfo(A,"for best performance.\n");
2331: } else {
2332: PetscInfo(A,"Empty matrix\n");
2333: }
2334: #endif
2336: ISRestoreIndices(perm,&rip);
2337: PetscFree3(prowl,q,levtmp);
2338: PetscFree(lev);
2340: /* put together the new matrix */
2341: MatSeqSBAIJSetPreallocation(B,bs,0,NULL);
2343: /* PetscLogObjectParent((PetscObject)B,(PetscObject)iperm); */
2344: b = (Mat_SeqSBAIJ*)(B)->data;
2345: PetscFree2(b->imax,b->ilen);
2347: b->singlemalloc = PETSC_FALSE;
2348: b->free_a = PETSC_TRUE;
2349: b->free_ij = PETSC_TRUE;
2350: /* the next line frees the default space generated by the Create() */
2351: PetscFree3(b->a,b->j,b->i);
2352: PetscMalloc1((iu[mbs]+1)*a->bs2,&b->a);
2353: b->j = ju;
2354: b->i = iu;
2355: b->diag = NULL;
2356: b->ilen = NULL;
2357: b->imax = NULL;
2359: ISDestroy(&b->row);
2360: ISDestroy(&b->icol);
2361: b->row = perm;
2362: b->icol = perm;
2363: PetscObjectReference((PetscObject)perm);
2364: PetscObjectReference((PetscObject)perm);
2365: PetscMalloc1(bs*mbs+bs,&b->solve_work);
2366: /* In b structure: Free imax, ilen, old a, old j.
2367: Allocate idnew, solve_work, new a, new j */
2368: PetscLogObjectMemory((PetscObject)B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
2369: b->maxnz = b->nz = iu[mbs];
2371: (B)->info.factor_mallocs = reallocs;
2372: (B)->info.fill_ratio_given = f;
2373: if (ai[mbs] != 0) {
2374: (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2375: } else {
2376: (B)->info.fill_ratio_needed = 0.0;
2377: }
2378: MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);
2379: return(0);
2380: }
2382: /*
2383: See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2384: */
2385: #include <petscbt.h>
2386: #include <../src/mat/utils/freespace.h>
2387: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2388: {
2389: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
2390: PetscErrorCode ierr;
2391: PetscBool perm_identity,free_ij = PETSC_TRUE,missing;
2392: PetscInt bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j;
2393: const PetscInt *rip;
2394: PetscInt reallocs=0,i,*ui,*udiag,*cols;
2395: PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2396: PetscInt nlnk,*lnk,*lnk_lvl=NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr;
2397: PetscReal fill =info->fill,levels=info->levels;
2398: PetscFreeSpaceList free_space =NULL,current_space=NULL;
2399: PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2400: PetscBT lnkbt;
2403: if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
2404: MatMissingDiagonal(A,&missing,&d);
2405: if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
2406: if (bs > 1) {
2407: MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);
2408: return(0);
2409: }
2411: /* check whether perm is the identity mapping */
2412: ISIdentity(perm,&perm_identity);
2413: if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2414: a->permute = PETSC_FALSE;
2416: PetscMalloc1(am+1,&ui);
2417: PetscMalloc1(am+1,&udiag);
2418: ui[0] = 0;
2420: /* ICC(0) without matrix ordering: simply rearrange column indices */
2421: if (!levels) {
2422: /* reuse the column pointers and row offsets for memory savings */
2423: for (i=0; i<am; i++) {
2424: ncols = ai[i+1] - ai[i];
2425: ui[i+1] = ui[i] + ncols;
2426: udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */
2427: }
2428: PetscMalloc1(ui[am]+1,&uj);
2429: cols = uj;
2430: for (i=0; i<am; i++) {
2431: aj = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2432: ncols = ai[i+1] - ai[i] -1;
2433: for (j=0; j<ncols; j++) *cols++ = aj[j];
2434: *cols++ = i; /* diagonal is located as the last entry of U(i,:) */
2435: }
2436: } else { /* case: levels>0 */
2437: ISGetIndices(perm,&rip);
2439: /* initialization */
2440: /* jl: linked list for storing indices of the pivot rows
2441: il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2442: PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);
2443: for (i=0; i<am; i++) {
2444: jl[i] = am; il[i] = 0;
2445: }
2447: /* create and initialize a linked list for storing column indices of the active row k */
2448: nlnk = am + 1;
2449: PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);
2451: /* initial FreeSpace size is fill*(ai[am]+1) */
2452: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);
2454: current_space = free_space;
2456: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);
2458: current_space_lvl = free_space_lvl;
2460: for (k=0; k<am; k++) { /* for each active row k */
2461: /* initialize lnk by the column indices of row k */
2462: nzk = 0;
2463: ncols = ai[k+1] - ai[k];
2464: if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k);
2465: cols = aj+ai[k];
2466: PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
2467: nzk += nlnk;
2469: /* update lnk by computing fill-in for each pivot row to be merged in */
2470: prow = jl[k]; /* 1st pivot row */
2472: while (prow < k) {
2473: nextprow = jl[prow];
2475: /* merge prow into k-th row */
2476: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2477: jmax = ui[prow+1];
2478: ncols = jmax-jmin;
2479: i = jmin - ui[prow];
2481: cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2482: uj = uj_lvl_ptr[prow] + i; /* levels of cols */
2483: j = *(uj - 1);
2484: PetscICCLLAddSorted(ncols,cols,levels,uj,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2485: nzk += nlnk;
2487: /* update il and jl for prow */
2488: if (jmin < jmax) {
2489: il[prow] = jmin;
2491: j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2492: }
2493: prow = nextprow;
2494: }
2496: /* if free space is not available, make more free space */
2497: if (current_space->local_remaining<nzk) {
2498: i = am - k + 1; /* num of unfactored rows */
2499: i = PetscIntMultTruncate(i,PetscMin(nzk, i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2500: PetscFreeSpaceGet(i,¤t_space);
2501: PetscFreeSpaceGet(i,¤t_space_lvl);
2502: reallocs++;
2503: }
2505: /* copy data into free_space and free_space_lvl, then initialize lnk */
2506: if (nzk == 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Empty row %D in ICC matrix factor",k);
2507: PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);
2509: /* add the k-th row into il and jl */
2510: if (nzk > 1) {
2511: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2512: jl[k] = jl[i]; jl[i] = k;
2513: il[k] = ui[k] + 1;
2514: }
2515: uj_ptr[k] = current_space->array;
2516: uj_lvl_ptr[k] = current_space_lvl->array;
2518: current_space->array += nzk;
2519: current_space->local_used += nzk;
2520: current_space->local_remaining -= nzk;
2521: current_space_lvl->array += nzk;
2522: current_space_lvl->local_used += nzk;
2523: current_space_lvl->local_remaining -= nzk;
2525: ui[k+1] = ui[k] + nzk;
2526: }
2528: ISRestoreIndices(perm,&rip);
2529: PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);
2531: /* destroy list of free space and other temporary array(s) */
2532: PetscMalloc1(ui[am]+1,&uj);
2533: PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag); /* store matrix factor */
2534: PetscIncompleteLLDestroy(lnk,lnkbt);
2535: PetscFreeSpaceDestroy(free_space_lvl);
2537: } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2539: /* put together the new matrix in MATSEQSBAIJ format */
2540: MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);
2542: b = (Mat_SeqSBAIJ*)(fact)->data;
2543: PetscFree2(b->imax,b->ilen);
2545: b->singlemalloc = PETSC_FALSE;
2546: b->free_a = PETSC_TRUE;
2547: b->free_ij = free_ij;
2549: PetscMalloc1(ui[am]+1,&b->a);
2551: b->j = uj;
2552: b->i = ui;
2553: b->diag = udiag;
2554: b->free_diag = PETSC_TRUE;
2555: b->ilen = NULL;
2556: b->imax = NULL;
2557: b->row = perm;
2558: b->col = perm;
2560: PetscObjectReference((PetscObject)perm);
2561: PetscObjectReference((PetscObject)perm);
2563: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2565: PetscMalloc1(am+1,&b->solve_work);
2566: PetscLogObjectMemory((PetscObject)fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));
2568: b->maxnz = b->nz = ui[am];
2570: fact->info.factor_mallocs = reallocs;
2571: fact->info.fill_ratio_given = fill;
2572: if (ai[am] != 0) {
2573: fact->info.fill_ratio_needed = ((PetscReal)ui[am])/ai[am];
2574: } else {
2575: fact->info.fill_ratio_needed = 0.0;
2576: }
2577: #if defined(PETSC_USE_INFO)
2578: if (ai[am] != 0) {
2579: PetscReal af = fact->info.fill_ratio_needed;
2580: PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);
2581: PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2582: PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);
2583: } else {
2584: PetscInfo(A,"Empty matrix\n");
2585: }
2586: #endif
2587: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2588: return(0);
2589: }
2591: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2592: {
2593: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2594: Mat_SeqSBAIJ *b;
2595: PetscErrorCode ierr;
2596: PetscBool perm_identity,free_ij = PETSC_TRUE;
2597: PetscInt bs=A->rmap->bs,am=a->mbs;
2598: const PetscInt *cols,*rip,*ai=a->i,*aj=a->j;
2599: PetscInt reallocs=0,i,*ui;
2600: PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2601: PetscInt nlnk,*lnk,*lnk_lvl=NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
2602: PetscReal fill =info->fill,levels=info->levels,ratio_needed;
2603: PetscFreeSpaceList free_space =NULL,current_space=NULL;
2604: PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2605: PetscBT lnkbt;
2608: /*
2609: This code originally uses Modified Sparse Row (MSR) storage
2610: (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2611: Then it is rewritten so the factor B takes seqsbaij format. However the associated
2612: MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2613: thus the original code in MSR format is still used for these cases.
2614: The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2615: MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2616: */
2617: if (bs > 1) {
2618: MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);
2619: return(0);
2620: }
2622: /* check whether perm is the identity mapping */
2623: ISIdentity(perm,&perm_identity);
2624: if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2625: a->permute = PETSC_FALSE;
2627: /* special case that simply copies fill pattern */
2628: if (!levels) {
2629: /* reuse the column pointers and row offsets for memory savings */
2630: ui = a->i;
2631: uj = a->j;
2632: free_ij = PETSC_FALSE;
2633: ratio_needed = 1.0;
2634: } else { /* case: levels>0 */
2635: ISGetIndices(perm,&rip);
2637: /* initialization */
2638: PetscMalloc1(am+1,&ui);
2639: ui[0] = 0;
2641: /* jl: linked list for storing indices of the pivot rows
2642: il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2643: PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);
2644: for (i=0; i<am; i++) {
2645: jl[i] = am; il[i] = 0;
2646: }
2648: /* create and initialize a linked list for storing column indices of the active row k */
2649: nlnk = am + 1;
2650: PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);
2652: /* initial FreeSpace size is fill*(ai[am]+1) */
2653: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);
2655: current_space = free_space;
2657: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);
2659: current_space_lvl = free_space_lvl;
2661: for (k=0; k<am; k++) { /* for each active row k */
2662: /* initialize lnk by the column indices of row rip[k] */
2663: nzk = 0;
2664: ncols = ai[rip[k]+1] - ai[rip[k]];
2665: cols = aj+ai[rip[k]];
2666: PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
2667: nzk += nlnk;
2669: /* update lnk by computing fill-in for each pivot row to be merged in */
2670: prow = jl[k]; /* 1st pivot row */
2672: while (prow < k) {
2673: nextprow = jl[prow];
2675: /* merge prow into k-th row */
2676: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2677: jmax = ui[prow+1];
2678: ncols = jmax-jmin;
2679: i = jmin - ui[prow];
2680: cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2681: j = *(uj_lvl_ptr[prow] + i - 1);
2682: cols_lvl = uj_lvl_ptr[prow]+i;
2683: PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2684: nzk += nlnk;
2686: /* update il and jl for prow */
2687: if (jmin < jmax) {
2688: il[prow] = jmin;
2689: j = *cols;
2690: jl[prow] = jl[j];
2691: jl[j] = prow;
2692: }
2693: prow = nextprow;
2694: }
2696: /* if free space is not available, make more free space */
2697: if (current_space->local_remaining<nzk) {
2698: i = am - k + 1; /* num of unfactored rows */
2699: i = PetscMin(PetscIntMultTruncate(i,nzk), PetscIntMultTruncate(i,i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2700: PetscFreeSpaceGet(i,¤t_space);
2701: PetscFreeSpaceGet(i,¤t_space_lvl);
2702: reallocs++;
2703: }
2705: /* copy data into free_space and free_space_lvl, then initialize lnk */
2706: PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);
2708: /* add the k-th row into il and jl */
2709: if (nzk-1 > 0) {
2710: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2711: jl[k] = jl[i]; jl[i] = k;
2712: il[k] = ui[k] + 1;
2713: }
2714: uj_ptr[k] = current_space->array;
2715: uj_lvl_ptr[k] = current_space_lvl->array;
2717: current_space->array += nzk;
2718: current_space->local_used += nzk;
2719: current_space->local_remaining -= nzk;
2720: current_space_lvl->array += nzk;
2721: current_space_lvl->local_used += nzk;
2722: current_space_lvl->local_remaining -= nzk;
2724: ui[k+1] = ui[k] + nzk;
2725: }
2727: ISRestoreIndices(perm,&rip);
2728: PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);
2730: /* destroy list of free space and other temporary array(s) */
2731: PetscMalloc1(ui[am]+1,&uj);
2732: PetscFreeSpaceContiguous(&free_space,uj);
2733: PetscIncompleteLLDestroy(lnk,lnkbt);
2734: PetscFreeSpaceDestroy(free_space_lvl);
2735: if (ai[am] != 0) {
2736: ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2737: } else {
2738: ratio_needed = 0.0;
2739: }
2740: } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2742: /* put together the new matrix in MATSEQSBAIJ format */
2743: MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);
2745: b = (Mat_SeqSBAIJ*)(fact)->data;
2747: PetscFree2(b->imax,b->ilen);
2749: b->singlemalloc = PETSC_FALSE;
2750: b->free_a = PETSC_TRUE;
2751: b->free_ij = free_ij;
2753: PetscMalloc1(ui[am]+1,&b->a);
2755: b->j = uj;
2756: b->i = ui;
2757: b->diag = NULL;
2758: b->ilen = NULL;
2759: b->imax = NULL;
2760: b->row = perm;
2761: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2763: PetscObjectReference((PetscObject)perm);
2764: b->icol = perm;
2765: PetscObjectReference((PetscObject)perm);
2766: PetscMalloc1(am+1,&b->solve_work);
2768: b->maxnz = b->nz = ui[am];
2770: fact->info.factor_mallocs = reallocs;
2771: fact->info.fill_ratio_given = fill;
2772: fact->info.fill_ratio_needed = ratio_needed;
2773: #if defined(PETSC_USE_INFO)
2774: if (ai[am] != 0) {
2775: PetscReal af = fact->info.fill_ratio_needed;
2776: PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);
2777: PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2778: PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);
2779: } else {
2780: PetscInfo(A,"Empty matrix\n");
2781: }
2782: #endif
2783: if (perm_identity) {
2784: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2785: } else {
2786: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2787: }
2788: return(0);
2789: }