Actual source code: sbaijfact2.c
petsc-3.7.3 2016-08-01
2: /*
3: Factorization code for SBAIJ format.
4: */
6: #include <../src/mat/impls/sbaij/seq/sbaij.h>
7: #include <../src/mat/impls/baij/seq/baij.h>
8: #include <petsc/private/kernels/blockinvert.h>
12: PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
13: {
14: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
15: IS isrow=a->row;
16: PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j;
17: PetscErrorCode ierr;
18: const PetscInt *r;
19: PetscInt nz,*vj,k,idx,k1;
20: PetscInt bs =A->rmap->bs,bs2 = a->bs2;
21: const MatScalar *aa=a->a,*v,*diag;
22: PetscScalar *x,*xk,*xj,*xk_tmp,*t;
23: const PetscScalar *b;
26: VecGetArrayRead(bb,&b);
27: VecGetArray(xx,&x);
28: t = a->solve_work;
29: ISGetIndices(isrow,&r);
30: PetscMalloc1(bs,&xk_tmp);
32: /* solve U^T * D * y = b by forward substitution */
33: xk = t;
34: for (k=0; k<mbs; k++) { /* t <- perm(b) */
35: idx = bs*r[k];
36: for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
37: }
38: for (k=0; k<mbs; k++) {
39: v = aa + bs2*ai[k];
40: xk = t + k*bs; /* Dk*xk = k-th block of x */
41: PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
42: nz = ai[k+1] - ai[k];
43: vj = aj + ai[k];
44: xj = t + (*vj)*bs; /* *vj-th block of x, *vj>k */
45: while (nz--) {
46: /* x(:) += U(k,:)^T*(Dk*xk) */
47: PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
48: vj++; xj = t + (*vj)*bs;
49: v += bs2;
50: }
51: /* xk = inv(Dk)*(Dk*xk) */
52: diag = aa+k*bs2; /* ptr to inv(Dk) */
53: PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
54: }
56: /* solve U*x = y by back substitution */
57: for (k=mbs-1; k>=0; k--) {
58: v = aa + bs2*ai[k];
59: xk = t + k*bs; /* xk */
60: nz = ai[k+1] - ai[k];
61: vj = aj + ai[k];
62: xj = t + (*vj)*bs;
63: while (nz--) {
64: /* xk += U(k,:)*x(:) */
65: PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
66: vj++;
67: v += bs2; xj = t + (*vj)*bs;
68: }
69: idx = bs*r[k];
70: for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
71: }
73: PetscFree(xk_tmp);
74: ISRestoreIndices(isrow,&r);
75: VecRestoreArrayRead(bb,&b);
76: VecRestoreArray(xx,&x);
77: PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);
78: return(0);
79: }
83: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
84: {
86: SETERRQ(PETSC_COMM_SELF,1,"not implemented yet");
87: return(0);
88: }
92: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
93: {
95: SETERRQ(PETSC_COMM_SELF,1,"not implemented yet");
96: return(0);
97: }
101: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
102: {
103: PetscErrorCode ierr;
104: PetscInt nz,k;
105: const PetscInt *vj,bs2 = bs*bs;
106: const MatScalar *v,*diag;
107: PetscScalar *xk,*xj,*xk_tmp;
110: PetscMalloc1(bs,&xk_tmp);
111: for (k=0; k<mbs; k++) {
112: v = aa + bs2*ai[k];
113: xk = x + k*bs; /* Dk*xk = k-th block of x */
114: PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
115: nz = ai[k+1] - ai[k];
116: vj = aj + ai[k];
117: xj = x + (*vj)*bs; /* *vj-th block of x, *vj>k */
118: while (nz--) {
119: /* x(:) += U(k,:)^T*(Dk*xk) */
120: PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
121: vj++; xj = x + (*vj)*bs;
122: v += bs2;
123: }
124: /* xk = inv(Dk)*(Dk*xk) */
125: diag = aa+k*bs2; /* ptr to inv(Dk) */
126: PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
127: }
128: PetscFree(xk_tmp);
129: return(0);
130: }
134: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
135: {
136: PetscInt nz,k;
137: const PetscInt *vj,bs2 = bs*bs;
138: const MatScalar *v;
139: PetscScalar *xk,*xj;
142: for (k=mbs-1; k>=0; k--) {
143: v = aa + bs2*ai[k];
144: xk = x + k*bs; /* xk */
145: nz = ai[k+1] - ai[k];
146: vj = aj + ai[k];
147: xj = x + (*vj)*bs;
148: while (nz--) {
149: /* xk += U(k,:)*x(:) */
150: PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
151: vj++;
152: v += bs2; xj = x + (*vj)*bs;
153: }
154: }
155: return(0);
156: }
160: PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
161: {
162: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
163: PetscErrorCode ierr;
164: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
165: PetscInt bs =A->rmap->bs;
166: const MatScalar *aa=a->a;
167: PetscScalar *x;
168: const PetscScalar *b;
171: VecGetArrayRead(bb,&b);
172: VecGetArray(xx,&x);
174: /* solve U^T * D * y = b by forward substitution */
175: PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar)); /* x <- b */
176: MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
178: /* solve U*x = y by back substitution */
179: MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
181: VecRestoreArrayRead(bb,&b);
182: VecRestoreArray(xx,&x);
183: PetscLogFlops(4.0*a->bs2*a->nz - (bs+2.0*a->bs2)*mbs);
184: return(0);
185: }
189: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
190: {
191: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
192: PetscErrorCode ierr;
193: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
194: PetscInt bs =A->rmap->bs;
195: const MatScalar *aa=a->a;
196: const PetscScalar *b;
197: PetscScalar *x;
200: VecGetArrayRead(bb,&b);
201: VecGetArray(xx,&x);
202: PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar)); /* x <- b */
203: MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
204: VecRestoreArrayRead(bb,&b);
205: VecRestoreArray(xx,&x);
206: PetscLogFlops(2.0*a->bs2*a->nz - bs*mbs);
207: return(0);
208: }
212: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
213: {
214: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
215: PetscErrorCode ierr;
216: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
217: PetscInt bs =A->rmap->bs;
218: const MatScalar *aa=a->a;
219: const PetscScalar *b;
220: PetscScalar *x;
223: VecGetArrayRead(bb,&b);
224: VecGetArray(xx,&x);
225: PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));
226: MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
227: VecRestoreArrayRead(bb,&b);
228: VecRestoreArray(xx,&x);
229: PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
230: return(0);
231: }
235: PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
236: {
237: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
238: IS isrow=a->row;
239: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,*r,*vj;
240: PetscErrorCode ierr;
241: PetscInt nz,k,idx;
242: const MatScalar *aa=a->a,*v,*d;
243: PetscScalar *x,x0,x1,x2,x3,x4,x5,x6,*t,*tp;
244: const PetscScalar *b;
247: VecGetArrayRead(bb,&b);
248: VecGetArray(xx,&x);
249: t = a->solve_work;
250: ISGetIndices(isrow,&r);
252: /* solve U^T * D * y = b by forward substitution */
253: tp = t;
254: for (k=0; k<mbs; k++) { /* t <- perm(b) */
255: idx = 7*r[k];
256: tp[0] = b[idx];
257: tp[1] = b[idx+1];
258: tp[2] = b[idx+2];
259: tp[3] = b[idx+3];
260: tp[4] = b[idx+4];
261: tp[5] = b[idx+5];
262: tp[6] = b[idx+6];
263: tp += 7;
264: }
266: for (k=0; k<mbs; k++) {
267: v = aa + 49*ai[k];
268: vj = aj + ai[k];
269: tp = t + k*7;
270: x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
271: nz = ai[k+1] - ai[k];
272: tp = t + (*vj)*7;
273: while (nz--) {
274: tp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
275: tp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
276: tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
277: tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
278: tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
279: tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
280: tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
281: vj++;
282: tp = t + (*vj)*7;
283: v += 49;
284: }
286: /* xk = inv(Dk)*(Dk*xk) */
287: d = aa+k*49; /* ptr to inv(Dk) */
288: tp = t + k*7;
289: tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
290: tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
291: tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
292: tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
293: tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
294: tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
295: tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
296: }
298: /* solve U*x = y by back substitution */
299: for (k=mbs-1; k>=0; k--) {
300: v = aa + 49*ai[k];
301: vj = aj + ai[k];
302: tp = t + k*7;
303: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6]; /* xk */
304: nz = ai[k+1] - ai[k];
306: tp = t + (*vj)*7;
307: while (nz--) {
308: /* xk += U(k,:)*x(:) */
309: x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
310: x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
311: x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
312: x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
313: x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
314: x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
315: x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
316: vj++;
317: tp = t + (*vj)*7;
318: v += 49;
319: }
320: tp = t + k*7;
321: tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
322: idx = 7*r[k];
323: x[idx] = x0;
324: x[idx+1] = x1;
325: x[idx+2] = x2;
326: x[idx+3] = x3;
327: x[idx+4] = x4;
328: x[idx+5] = x5;
329: x[idx+6] = x6;
330: }
332: ISRestoreIndices(isrow,&r);
333: VecRestoreArrayRead(bb,&b);
334: VecRestoreArray(xx,&x);
335: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
336: return(0);
337: }
341: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
342: {
343: const MatScalar *v,*d;
344: PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6;
345: PetscInt nz,k;
346: const PetscInt *vj;
349: for (k=0; k<mbs; k++) {
350: v = aa + 49*ai[k];
351: xp = x + k*7;
352: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
353: nz = ai[k+1] - ai[k];
354: vj = aj + ai[k];
355: xp = x + (*vj)*7;
356: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
357: PetscPrefetchBlock(v+49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
358: while (nz--) {
359: /* x(:) += U(k,:)^T*(Dk*xk) */
360: xp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
361: xp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
362: xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
363: xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
364: xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
365: xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
366: xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
367: vj++;
368: xp = x + (*vj)*7;
369: v += 49;
370: }
371: /* xk = inv(Dk)*(Dk*xk) */
372: d = aa+k*49; /* ptr to inv(Dk) */
373: xp = x + k*7;
374: xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
375: xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
376: xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
377: xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
378: xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
379: xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
380: xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
381: }
382: return(0);
383: }
387: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
388: {
389: const MatScalar *v;
390: PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6;
391: PetscInt nz,k;
392: const PetscInt *vj;
395: for (k=mbs-1; k>=0; k--) {
396: v = aa + 49*ai[k];
397: xp = x + k*7;
398: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
399: nz = ai[k+1] - ai[k];
400: vj = aj + ai[k];
401: xp = x + (*vj)*7;
402: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
403: PetscPrefetchBlock(v-49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
404: while (nz--) {
405: /* xk += U(k,:)*x(:) */
406: x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
407: x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
408: x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
409: x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
410: x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
411: x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
412: x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
413: vj++;
414: v += 49; xp = x + (*vj)*7;
415: }
416: xp = x + k*7;
417: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
418: }
419: return(0);
420: }
424: PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
425: {
426: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
427: PetscErrorCode ierr;
428: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
429: const MatScalar *aa=a->a;
430: PetscScalar *x;
431: const PetscScalar *b;
434: VecGetArrayRead(bb,&b);
435: VecGetArray(xx,&x);
437: /* solve U^T * D * y = b by forward substitution */
438: PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar)); /* x <- b */
439: MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
441: /* solve U*x = y by back substitution */
442: MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
444: VecRestoreArrayRead(bb,&b);
445: VecRestoreArray(xx,&x);
446: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
447: return(0);
448: }
452: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
453: {
454: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
455: PetscErrorCode ierr;
456: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
457: const MatScalar *aa=a->a;
458: PetscScalar *x;
459: const PetscScalar *b;
462: VecGetArrayRead(bb,&b);
463: VecGetArray(xx,&x);
464: PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));
465: MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
466: VecRestoreArrayRead(bb,&b);
467: VecRestoreArray(xx,&x);
468: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
469: return(0);
470: }
474: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
475: {
476: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
477: PetscErrorCode ierr;
478: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
479: const MatScalar *aa=a->a;
480: PetscScalar *x;
481: const PetscScalar *b;
484: VecGetArrayRead(bb,&b);
485: VecGetArray(xx,&x);
486: PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));
487: MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
488: VecRestoreArrayRead(bb,&b);
489: VecRestoreArray(xx,&x);
490: PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
491: return(0);
492: }
496: PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
497: {
498: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
499: IS isrow=a->row;
500: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,*r,*vj;
501: PetscErrorCode ierr;
502: PetscInt nz,k,idx;
503: const MatScalar *aa=a->a,*v,*d;
504: PetscScalar *x,x0,x1,x2,x3,x4,x5,*t,*tp;
505: const PetscScalar *b;
508: VecGetArrayRead(bb,&b);
509: VecGetArray(xx,&x);
510: t = a->solve_work;
511: ISGetIndices(isrow,&r);
513: /* solve U^T * D * y = b by forward substitution */
514: tp = t;
515: for (k=0; k<mbs; k++) { /* t <- perm(b) */
516: idx = 6*r[k];
517: tp[0] = b[idx];
518: tp[1] = b[idx+1];
519: tp[2] = b[idx+2];
520: tp[3] = b[idx+3];
521: tp[4] = b[idx+4];
522: tp[5] = b[idx+5];
523: tp += 6;
524: }
526: for (k=0; k<mbs; k++) {
527: v = aa + 36*ai[k];
528: vj = aj + ai[k];
529: tp = t + k*6;
530: x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
531: nz = ai[k+1] - ai[k];
532: tp = t + (*vj)*6;
533: while (nz--) {
534: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
535: tp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
536: tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
537: tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
538: tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
539: tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
540: vj++;
541: tp = t + (*vj)*6;
542: v += 36;
543: }
545: /* xk = inv(Dk)*(Dk*xk) */
546: d = aa+k*36; /* ptr to inv(Dk) */
547: tp = t + k*6;
548: tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
549: tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
550: tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
551: tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
552: tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
553: tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
554: }
556: /* solve U*x = y by back substitution */
557: for (k=mbs-1; k>=0; k--) {
558: v = aa + 36*ai[k];
559: vj = aj + ai[k];
560: tp = t + k*6;
561: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */
562: nz = ai[k+1] - ai[k];
564: tp = t + (*vj)*6;
565: while (nz--) {
566: /* xk += U(k,:)*x(:) */
567: x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
568: x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
569: x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
570: x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
571: x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
572: x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
573: vj++;
574: tp = t + (*vj)*6;
575: v += 36;
576: }
577: tp = t + k*6;
578: tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
579: idx = 6*r[k];
580: x[idx] = x0;
581: x[idx+1] = x1;
582: x[idx+2] = x2;
583: x[idx+3] = x3;
584: x[idx+4] = x4;
585: x[idx+5] = x5;
586: }
588: ISRestoreIndices(isrow,&r);
589: VecRestoreArrayRead(bb,&b);
590: VecRestoreArray(xx,&x);
591: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
592: return(0);
593: }
597: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
598: {
599: const MatScalar *v,*d;
600: PetscScalar *xp,x0,x1,x2,x3,x4,x5;
601: PetscInt nz,k;
602: const PetscInt *vj;
605: for (k=0; k<mbs; k++) {
606: v = aa + 36*ai[k];
607: xp = x + k*6;
608: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
609: nz = ai[k+1] - ai[k];
610: vj = aj + ai[k];
611: xp = x + (*vj)*6;
612: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
613: PetscPrefetchBlock(v+36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
614: while (nz--) {
615: /* x(:) += U(k,:)^T*(Dk*xk) */
616: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
617: xp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
618: xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
619: xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
620: xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
621: xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
622: vj++;
623: xp = x + (*vj)*6;
624: v += 36;
625: }
626: /* xk = inv(Dk)*(Dk*xk) */
627: d = aa+k*36; /* ptr to inv(Dk) */
628: xp = x + k*6;
629: xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
630: xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
631: xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
632: xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
633: xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
634: xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
635: }
636: return(0);
637: }
640: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
641: {
642: const MatScalar *v;
643: PetscScalar *xp,x0,x1,x2,x3,x4,x5;
644: PetscInt nz,k;
645: const PetscInt *vj;
648: for (k=mbs-1; k>=0; k--) {
649: v = aa + 36*ai[k];
650: xp = x + k*6;
651: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
652: nz = ai[k+1] - ai[k];
653: vj = aj + ai[k];
654: xp = x + (*vj)*6;
655: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
656: PetscPrefetchBlock(v-36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
657: while (nz--) {
658: /* xk += U(k,:)*x(:) */
659: x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
660: x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
661: x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
662: x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
663: x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
664: x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
665: vj++;
666: v += 36; xp = x + (*vj)*6;
667: }
668: xp = x + k*6;
669: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
670: }
671: return(0);
672: }
677: PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
678: {
679: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
680: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
681: const MatScalar *aa=a->a;
682: PetscScalar *x;
683: const PetscScalar *b;
684: PetscErrorCode ierr;
687: VecGetArrayRead(bb,&b);
688: VecGetArray(xx,&x);
690: /* solve U^T * D * y = b by forward substitution */
691: PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
692: MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
694: /* solve U*x = y by back substitution */
695: MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
697: VecRestoreArrayRead(bb,&b);
698: VecRestoreArray(xx,&x);
699: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
700: return(0);
701: }
705: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
706: {
707: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
708: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
709: const MatScalar *aa=a->a;
710: PetscScalar *x;
711: const PetscScalar *b;
712: PetscErrorCode ierr;
715: VecGetArrayRead(bb,&b);
716: VecGetArray(xx,&x);
717: PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
718: MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
719: VecRestoreArrayRead(bb,&b);
720: VecRestoreArray(xx,&x);
721: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
722: return(0);
723: }
727: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
728: {
729: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
730: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
731: const MatScalar *aa=a->a;
732: PetscScalar *x;
733: const PetscScalar *b;
734: PetscErrorCode ierr;
737: VecGetArrayRead(bb,&b);
738: VecGetArray(xx,&x);
739: PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
740: MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
741: VecRestoreArrayRead(bb,&b);
742: VecRestoreArray(xx,&x);
743: PetscLogFlops(2.0*a->bs2*(a->nz - mbs));
744: return(0);
745: }
749: PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
750: {
751: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
752: IS isrow=a->row;
753: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j;
754: PetscErrorCode ierr;
755: const PetscInt *r,*vj;
756: PetscInt nz,k,idx;
757: const MatScalar *aa=a->a,*v,*diag;
758: PetscScalar *x,x0,x1,x2,x3,x4,*t,*tp;
759: const PetscScalar *b;
762: VecGetArrayRead(bb,&b);
763: VecGetArray(xx,&x);
764: t = a->solve_work;
765: ISGetIndices(isrow,&r);
767: /* solve U^T * D * y = b by forward substitution */
768: tp = t;
769: for (k=0; k<mbs; k++) { /* t <- perm(b) */
770: idx = 5*r[k];
771: tp[0] = b[idx];
772: tp[1] = b[idx+1];
773: tp[2] = b[idx+2];
774: tp[3] = b[idx+3];
775: tp[4] = b[idx+4];
776: tp += 5;
777: }
779: for (k=0; k<mbs; k++) {
780: v = aa + 25*ai[k];
781: vj = aj + ai[k];
782: tp = t + k*5;
783: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
784: nz = ai[k+1] - ai[k];
786: tp = t + (*vj)*5;
787: while (nz--) {
788: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
789: tp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
790: tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
791: tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
792: tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
793: vj++;
794: tp = t + (*vj)*5;
795: v += 25;
796: }
798: /* xk = inv(Dk)*(Dk*xk) */
799: diag = aa+k*25; /* ptr to inv(Dk) */
800: tp = t + k*5;
801: tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
802: tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
803: tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
804: tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
805: tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
806: }
808: /* solve U*x = y by back substitution */
809: for (k=mbs-1; k>=0; k--) {
810: v = aa + 25*ai[k];
811: vj = aj + ai[k];
812: tp = t + k*5;
813: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; /* xk */
814: nz = ai[k+1] - ai[k];
816: tp = t + (*vj)*5;
817: while (nz--) {
818: /* xk += U(k,:)*x(:) */
819: x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
820: x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
821: x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
822: x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
823: x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
824: vj++;
825: tp = t + (*vj)*5;
826: v += 25;
827: }
828: tp = t + k*5;
829: tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
830: idx = 5*r[k];
831: x[idx] = x0;
832: x[idx+1] = x1;
833: x[idx+2] = x2;
834: x[idx+3] = x3;
835: x[idx+4] = x4;
836: }
838: ISRestoreIndices(isrow,&r);
839: VecRestoreArrayRead(bb,&b);
840: VecRestoreArray(xx,&x);
841: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
842: return(0);
843: }
847: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
848: {
849: const MatScalar *v,*diag;
850: PetscScalar *xp,x0,x1,x2,x3,x4;
851: PetscInt nz,k;
852: const PetscInt *vj;
855: for (k=0; k<mbs; k++) {
856: v = aa + 25*ai[k];
857: xp = x + k*5;
858: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
859: nz = ai[k+1] - ai[k];
860: vj = aj + ai[k];
861: xp = x + (*vj)*5;
862: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
863: PetscPrefetchBlock(v+25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
864: while (nz--) {
865: /* x(:) += U(k,:)^T*(Dk*xk) */
866: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
867: xp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
868: xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
869: xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
870: xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
871: vj++;
872: xp = x + (*vj)*5;
873: v += 25;
874: }
875: /* xk = inv(Dk)*(Dk*xk) */
876: diag = aa+k*25; /* ptr to inv(Dk) */
877: xp = x + k*5;
878: xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
879: xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
880: xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
881: xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
882: xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
883: }
884: return(0);
885: }
889: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
890: {
891: const MatScalar *v;
892: PetscScalar *xp,x0,x1,x2,x3,x4;
893: PetscInt nz,k;
894: const PetscInt *vj;
897: for (k=mbs-1; k>=0; k--) {
898: v = aa + 25*ai[k];
899: xp = x + k*5;
900: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; /* xk */
901: nz = ai[k+1] - ai[k];
902: vj = aj + ai[k];
903: xp = x + (*vj)*5;
904: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
905: PetscPrefetchBlock(v-25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
906: while (nz--) {
907: /* xk += U(k,:)*x(:) */
908: x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
909: x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
910: x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
911: x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
912: x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
913: vj++;
914: v += 25; xp = x + (*vj)*5;
915: }
916: xp = x + k*5;
917: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
918: }
919: return(0);
920: }
924: PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
925: {
926: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
927: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
928: const MatScalar *aa=a->a;
929: PetscScalar *x;
930: const PetscScalar *b;
931: PetscErrorCode ierr;
934: VecGetArrayRead(bb,&b);
935: VecGetArray(xx,&x);
937: /* solve U^T * D * y = b by forward substitution */
938: PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
939: MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
941: /* solve U*x = y by back substitution */
942: MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
944: VecRestoreArrayRead(bb,&b);
945: VecRestoreArray(xx,&x);
946: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
947: return(0);
948: }
952: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
953: {
954: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
955: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
956: const MatScalar *aa=a->a;
957: PetscScalar *x;
958: const PetscScalar *b;
959: PetscErrorCode ierr;
962: VecGetArrayRead(bb,&b);
963: VecGetArray(xx,&x);
964: PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
965: MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
966: VecRestoreArrayRead(bb,&b);
967: VecRestoreArray(xx,&x);
968: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
969: return(0);
970: }
974: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
975: {
976: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
977: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
978: const MatScalar *aa=a->a;
979: PetscScalar *x;
980: const PetscScalar *b;
981: PetscErrorCode ierr;
984: VecGetArrayRead(bb,&b);
985: VecGetArray(xx,&x);
986: PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));
987: MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
988: VecRestoreArrayRead(bb,&b);
989: VecRestoreArray(xx,&x);
990: PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
991: return(0);
992: }
996: PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
997: {
998: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
999: IS isrow=a->row;
1000: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j;
1001: PetscErrorCode ierr;
1002: const PetscInt *r,*vj;
1003: PetscInt nz,k,idx;
1004: const MatScalar *aa=a->a,*v,*diag;
1005: PetscScalar *x,x0,x1,x2,x3,*t,*tp;
1006: const PetscScalar *b;
1009: VecGetArrayRead(bb,&b);
1010: VecGetArray(xx,&x);
1011: t = a->solve_work;
1012: ISGetIndices(isrow,&r);
1014: /* solve U^T * D * y = b by forward substitution */
1015: tp = t;
1016: for (k=0; k<mbs; k++) { /* t <- perm(b) */
1017: idx = 4*r[k];
1018: tp[0] = b[idx];
1019: tp[1] = b[idx+1];
1020: tp[2] = b[idx+2];
1021: tp[3] = b[idx+3];
1022: tp += 4;
1023: }
1025: for (k=0; k<mbs; k++) {
1026: v = aa + 16*ai[k];
1027: vj = aj + ai[k];
1028: tp = t + k*4;
1029: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
1030: nz = ai[k+1] - ai[k];
1032: tp = t + (*vj)*4;
1033: while (nz--) {
1034: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1035: tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1036: tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1037: tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1038: vj++;
1039: tp = t + (*vj)*4;
1040: v += 16;
1041: }
1043: /* xk = inv(Dk)*(Dk*xk) */
1044: diag = aa+k*16; /* ptr to inv(Dk) */
1045: tp = t + k*4;
1046: tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1047: tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1048: tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1049: tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1050: }
1052: /* solve U*x = y by back substitution */
1053: for (k=mbs-1; k>=0; k--) {
1054: v = aa + 16*ai[k];
1055: vj = aj + ai[k];
1056: tp = t + k*4;
1057: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
1058: nz = ai[k+1] - ai[k];
1060: tp = t + (*vj)*4;
1061: while (nz--) {
1062: /* xk += U(k,:)*x(:) */
1063: x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
1064: x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
1065: x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
1066: x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
1067: vj++;
1068: tp = t + (*vj)*4;
1069: v += 16;
1070: }
1071: tp = t + k*4;
1072: tp[0] =x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
1073: idx = 4*r[k];
1074: x[idx] = x0;
1075: x[idx+1] = x1;
1076: x[idx+2] = x2;
1077: x[idx+3] = x3;
1078: }
1080: ISRestoreIndices(isrow,&r);
1081: VecRestoreArrayRead(bb,&b);
1082: VecRestoreArray(xx,&x);
1083: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1084: return(0);
1085: }
1089: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1090: {
1091: const MatScalar *v,*diag;
1092: PetscScalar *xp,x0,x1,x2,x3;
1093: PetscInt nz,k;
1094: const PetscInt *vj;
1097: for (k=0; k<mbs; k++) {
1098: v = aa + 16*ai[k];
1099: xp = x + k*4;
1100: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
1101: nz = ai[k+1] - ai[k];
1102: vj = aj + ai[k];
1103: xp = x + (*vj)*4;
1104: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1105: PetscPrefetchBlock(v+16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1106: while (nz--) {
1107: /* x(:) += U(k,:)^T*(Dk*xk) */
1108: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1109: xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1110: xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1111: xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1112: vj++;
1113: xp = x + (*vj)*4;
1114: v += 16;
1115: }
1116: /* xk = inv(Dk)*(Dk*xk) */
1117: diag = aa+k*16; /* ptr to inv(Dk) */
1118: xp = x + k*4;
1119: xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1120: xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1121: xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1122: xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1123: }
1124: return(0);
1125: }
1129: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1130: {
1131: const MatScalar *v;
1132: PetscScalar *xp,x0,x1,x2,x3;
1133: PetscInt nz,k;
1134: const PetscInt *vj;
1137: for (k=mbs-1; k>=0; k--) {
1138: v = aa + 16*ai[k];
1139: xp = x + k*4;
1140: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
1141: nz = ai[k+1] - ai[k];
1142: vj = aj + ai[k];
1143: xp = x + (*vj)*4;
1144: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1145: PetscPrefetchBlock(v-16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1146: while (nz--) {
1147: /* xk += U(k,:)*x(:) */
1148: x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
1149: x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
1150: x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
1151: x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
1152: vj++;
1153: v += 16; xp = x + (*vj)*4;
1154: }
1155: xp = x + k*4;
1156: xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
1157: }
1158: return(0);
1159: }
1163: PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1164: {
1165: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1166: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1167: const MatScalar *aa=a->a;
1168: PetscScalar *x;
1169: const PetscScalar *b;
1170: PetscErrorCode ierr;
1173: VecGetArrayRead(bb,&b);
1174: VecGetArray(xx,&x);
1176: /* solve U^T * D * y = b by forward substitution */
1177: PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
1178: MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1180: /* solve U*x = y by back substitution */
1181: MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1182: VecRestoreArrayRead(bb,&b);
1183: VecRestoreArray(xx,&x);
1184: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1185: return(0);
1186: }
1190: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1191: {
1192: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1193: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1194: const MatScalar *aa=a->a;
1195: PetscScalar *x;
1196: const PetscScalar *b;
1197: PetscErrorCode ierr;
1200: VecGetArrayRead(bb,&b);
1201: VecGetArray(xx,&x);
1202: PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
1203: MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1204: VecRestoreArrayRead(bb,&b);
1205: VecRestoreArray(xx,&x);
1206: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
1207: return(0);
1208: }
1212: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1213: {
1214: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1215: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1216: const MatScalar *aa=a->a;
1217: PetscScalar *x;
1218: const PetscScalar *b;
1219: PetscErrorCode ierr;
1222: VecGetArrayRead(bb,&b);
1223: VecGetArray(xx,&x);
1224: PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));
1225: MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1226: VecRestoreArrayRead(bb,&b);
1227: VecRestoreArray(xx,&x);
1228: PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
1229: return(0);
1230: }
1234: PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1235: {
1236: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1237: IS isrow=a->row;
1238: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j;
1239: PetscErrorCode ierr;
1240: const PetscInt *r;
1241: PetscInt nz,k,idx;
1242: const PetscInt *vj;
1243: const MatScalar *aa=a->a,*v,*diag;
1244: PetscScalar *x,x0,x1,x2,*t,*tp;
1245: const PetscScalar *b;
1248: VecGetArrayRead(bb,&b);
1249: VecGetArray(xx,&x);
1250: t = a->solve_work;
1251: ISGetIndices(isrow,&r);
1253: /* solve U^T * D * y = b by forward substitution */
1254: tp = t;
1255: for (k=0; k<mbs; k++) { /* t <- perm(b) */
1256: idx = 3*r[k];
1257: tp[0] = b[idx];
1258: tp[1] = b[idx+1];
1259: tp[2] = b[idx+2];
1260: tp += 3;
1261: }
1263: for (k=0; k<mbs; k++) {
1264: v = aa + 9*ai[k];
1265: vj = aj + ai[k];
1266: tp = t + k*3;
1267: x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
1268: nz = ai[k+1] - ai[k];
1270: tp = t + (*vj)*3;
1271: while (nz--) {
1272: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1273: tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1274: tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1275: vj++;
1276: tp = t + (*vj)*3;
1277: v += 9;
1278: }
1280: /* xk = inv(Dk)*(Dk*xk) */
1281: diag = aa+k*9; /* ptr to inv(Dk) */
1282: tp = t + k*3;
1283: tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1284: tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1285: tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1286: }
1288: /* solve U*x = y by back substitution */
1289: for (k=mbs-1; k>=0; k--) {
1290: v = aa + 9*ai[k];
1291: vj = aj + ai[k];
1292: tp = t + k*3;
1293: x0 = tp[0]; x1 = tp[1]; x2 = tp[2]; /* xk */
1294: nz = ai[k+1] - ai[k];
1296: tp = t + (*vj)*3;
1297: while (nz--) {
1298: /* xk += U(k,:)*x(:) */
1299: x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
1300: x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
1301: x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
1302: vj++;
1303: tp = t + (*vj)*3;
1304: v += 9;
1305: }
1306: tp = t + k*3;
1307: tp[0] = x0; tp[1] = x1; tp[2] = x2;
1308: idx = 3*r[k];
1309: x[idx] = x0;
1310: x[idx+1] = x1;
1311: x[idx+2] = x2;
1312: }
1314: ISRestoreIndices(isrow,&r);
1315: VecRestoreArrayRead(bb,&b);
1316: VecRestoreArray(xx,&x);
1317: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1318: return(0);
1319: }
1323: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1324: {
1325: const MatScalar *v,*diag;
1326: PetscScalar *xp,x0,x1,x2;
1327: PetscInt nz,k;
1328: const PetscInt *vj;
1331: for (k=0; k<mbs; k++) {
1332: v = aa + 9*ai[k];
1333: xp = x + k*3;
1334: x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1335: nz = ai[k+1] - ai[k];
1336: vj = aj + ai[k];
1337: xp = x + (*vj)*3;
1338: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1339: PetscPrefetchBlock(v+9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1340: while (nz--) {
1341: /* x(:) += U(k,:)^T*(Dk*xk) */
1342: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1343: xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1344: xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1345: vj++;
1346: xp = x + (*vj)*3;
1347: v += 9;
1348: }
1349: /* xk = inv(Dk)*(Dk*xk) */
1350: diag = aa+k*9; /* ptr to inv(Dk) */
1351: xp = x + k*3;
1352: xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1353: xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1354: xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1355: }
1356: return(0);
1357: }
1361: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1362: {
1363: const MatScalar *v;
1364: PetscScalar *xp,x0,x1,x2;
1365: PetscInt nz,k;
1366: const PetscInt *vj;
1369: for (k=mbs-1; k>=0; k--) {
1370: v = aa + 9*ai[k];
1371: xp = x + k*3;
1372: x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* xk */
1373: nz = ai[k+1] - ai[k];
1374: vj = aj + ai[k];
1375: xp = x + (*vj)*3;
1376: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1377: PetscPrefetchBlock(v-9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1378: while (nz--) {
1379: /* xk += U(k,:)*x(:) */
1380: x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1381: x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1382: x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1383: vj++;
1384: v += 9; xp = x + (*vj)*3;
1385: }
1386: xp = x + k*3;
1387: xp[0] = x0; xp[1] = x1; xp[2] = x2;
1388: }
1389: return(0);
1390: }
1394: PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1395: {
1396: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1397: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1398: const MatScalar *aa=a->a;
1399: PetscScalar *x;
1400: const PetscScalar *b;
1401: PetscErrorCode ierr;
1404: VecGetArrayRead(bb,&b);
1405: VecGetArray(xx,&x);
1407: /* solve U^T * D * y = b by forward substitution */
1408: PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1409: MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1411: /* solve U*x = y by back substitution */
1412: MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1414: VecRestoreArrayRead(bb,&b);
1415: VecRestoreArray(xx,&x);
1416: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1417: return(0);
1418: }
1422: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1423: {
1424: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1425: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1426: const MatScalar *aa=a->a;
1427: PetscScalar *x;
1428: const PetscScalar *b;
1429: PetscErrorCode ierr;
1432: VecGetArrayRead(bb,&b);
1433: VecGetArray(xx,&x);
1434: PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1435: MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1436: VecRestoreArrayRead(bb,&b);
1437: VecRestoreArray(xx,&x);
1438: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
1439: return(0);
1440: }
1444: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1445: {
1446: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1447: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1448: const MatScalar *aa=a->a;
1449: PetscScalar *x;
1450: const PetscScalar *b;
1451: PetscErrorCode ierr;
1454: VecGetArrayRead(bb,&b);
1455: VecGetArray(xx,&x);
1456: PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1457: MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1458: VecRestoreArrayRead(bb,&b);
1459: VecRestoreArray(xx,&x);
1460: PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
1461: return(0);
1462: }
1466: PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1467: {
1468: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1469: IS isrow=a->row;
1470: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j;
1471: PetscErrorCode ierr;
1472: const PetscInt *r,*vj;
1473: PetscInt nz,k,k2,idx;
1474: const MatScalar *aa=a->a,*v,*diag;
1475: PetscScalar *x,x0,x1,*t;
1476: const PetscScalar *b;
1479: VecGetArrayRead(bb,&b);
1480: VecGetArray(xx,&x);
1481: t = a->solve_work;
1482: ISGetIndices(isrow,&r);
1484: /* solve U^T * D * y = perm(b) by forward substitution */
1485: for (k=0; k<mbs; k++) { /* t <- perm(b) */
1486: idx = 2*r[k];
1487: t[k*2] = b[idx];
1488: t[k*2+1] = b[idx+1];
1489: }
1490: for (k=0; k<mbs; k++) {
1491: v = aa + 4*ai[k];
1492: vj = aj + ai[k];
1493: k2 = k*2;
1494: x0 = t[k2]; x1 = t[k2+1];
1495: nz = ai[k+1] - ai[k];
1496: while (nz--) {
1497: t[(*vj)*2] += v[0]*x0 + v[1]*x1;
1498: t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1499: vj++; v += 4;
1500: }
1501: diag = aa+k*4; /* ptr to inv(Dk) */
1502: t[k2] = diag[0]*x0 + diag[2]*x1;
1503: t[k2+1] = diag[1]*x0 + diag[3]*x1;
1504: }
1506: /* solve U*x = y by back substitution */
1507: for (k=mbs-1; k>=0; k--) {
1508: v = aa + 4*ai[k];
1509: vj = aj + ai[k];
1510: k2 = k*2;
1511: x0 = t[k2]; x1 = t[k2+1];
1512: nz = ai[k+1] - ai[k];
1513: while (nz--) {
1514: x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1515: x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1516: vj++;
1517: v += 4;
1518: }
1519: t[k2] = x0;
1520: t[k2+1] = x1;
1521: idx = 2*r[k];
1522: x[idx] = x0;
1523: x[idx+1] = x1;
1524: }
1526: ISRestoreIndices(isrow,&r);
1527: VecRestoreArrayRead(bb,&b);
1528: VecRestoreArray(xx,&x);
1529: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1530: return(0);
1531: }
1535: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1536: {
1537: const MatScalar *v,*diag;
1538: PetscScalar x0,x1;
1539: PetscInt nz,k,k2;
1540: const PetscInt *vj;
1541:
1543: for (k=0; k<mbs; k++) {
1544: v = aa + 4*ai[k];
1545: vj = aj + ai[k];
1546: k2 = k*2;
1547: x0 = x[k2]; x1 = x[k2+1]; /* Dk*xk = k-th block of x */
1548: nz = ai[k+1] - ai[k];
1549: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1550: PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1551: while (nz--) {
1552: /* x(:) += U(k,:)^T*(Dk*xk) */
1553: x[(*vj)*2] += v[0]*x0 + v[1]*x1;
1554: x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1555: vj++; v += 4;
1556: }
1557: /* xk = inv(Dk)*(Dk*xk) */
1558: diag = aa+k*4; /* ptr to inv(Dk) */
1559: x[k2] = diag[0]*x0 + diag[2]*x1;
1560: x[k2+1] = diag[1]*x0 + diag[3]*x1;
1561: }
1562: return(0);
1563: }
1567: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1568: {
1569: const MatScalar *v;
1570: PetscScalar x0,x1;
1571: PetscInt nz,k,k2;
1572: const PetscInt *vj;
1575: for (k=mbs-1; k>=0; k--) {
1576: v = aa + 4*ai[k];
1577: vj = aj + ai[k];
1578: k2 = k*2;
1579: x0 = x[k2]; x1 = x[k2+1]; /* xk */
1580: nz = ai[k+1] - ai[k];
1581: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1582: PetscPrefetchBlock(v-4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1583: while (nz--) {
1584: /* xk += U(k,:)*x(:) */
1585: x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1586: x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1587: vj++;
1588: v += 4;
1589: }
1590: x[k2] = x0;
1591: x[k2+1] = x1;
1592: }
1593: return(0);
1594: }
1598: PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1599: {
1600: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1601: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1602: const MatScalar *aa=a->a;
1603: PetscScalar *x;
1604: const PetscScalar *b;
1605: PetscErrorCode ierr;
1608: VecGetArrayRead(bb,&b);
1609: VecGetArray(xx,&x);
1611: /* solve U^T * D * y = b by forward substitution */
1612: PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1613: MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1615: /* solve U*x = y by back substitution */
1616: MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1618: VecRestoreArrayRead(bb,&b);
1619: VecRestoreArray(xx,&x);
1620: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1621: return(0);
1622: }
1626: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1627: {
1628: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1629: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1630: const MatScalar *aa=a->a;
1631: PetscScalar *x;
1632: const PetscScalar *b;
1633: PetscErrorCode ierr;
1636: VecGetArrayRead(bb,&b);
1637: VecGetArray(xx,&x);
1638: PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1639: MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1640: VecRestoreArrayRead(bb,&b);
1641: VecRestoreArray(xx,&x);
1642: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
1643: return(0);
1644: }
1648: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1649: {
1650: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1651: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1652: const MatScalar *aa=a->a;
1653: PetscScalar *x;
1654: const PetscScalar *b;
1655: PetscErrorCode ierr;
1658: VecGetArrayRead(bb,&b);
1659: VecGetArray(xx,&x);
1660: PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1661: MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1662: VecRestoreArrayRead(bb,&b);
1663: VecRestoreArray(xx,&x);
1664: PetscLogFlops(2.0*a->bs2*(a->nz - mbs));
1665: return(0);
1666: }
1670: PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1671: {
1672: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1673: IS isrow=a->row;
1674: PetscErrorCode ierr;
1675: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1676: const MatScalar *aa=a->a,*v;
1677: const PetscScalar *b;
1678: PetscScalar *x,xk,*t;
1679: PetscInt nz,k,j;
1682: VecGetArrayRead(bb,&b);
1683: VecGetArray(xx,&x);
1684: t = a->solve_work;
1685: ISGetIndices(isrow,&rp);
1687: /* solve U^T*D*y = perm(b) by forward substitution */
1688: for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1689: for (k=0; k<mbs; k++) {
1690: v = aa + ai[k];
1691: vj = aj + ai[k];
1692: xk = t[k];
1693: nz = ai[k+1] - ai[k] - 1;
1694: for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk;
1695: t[k] = xk*v[nz]; /* v[nz] = 1/D(k) */
1696: }
1698: /* solve U*perm(x) = y by back substitution */
1699: for (k=mbs-1; k>=0; k--) {
1700: v = aa + adiag[k] - 1;
1701: vj = aj + adiag[k] - 1;
1702: nz = ai[k+1] - ai[k] - 1;
1703: for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]];
1704: x[rp[k]] = t[k];
1705: }
1707: ISRestoreIndices(isrow,&rp);
1708: VecRestoreArrayRead(bb,&b);
1709: VecRestoreArray(xx,&x);
1710: PetscLogFlops(4.0*a->nz - 3.0*mbs);
1711: return(0);
1712: }
1716: PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1717: {
1718: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1719: IS isrow=a->row;
1720: PetscErrorCode ierr;
1721: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1722: const MatScalar *aa=a->a,*v;
1723: PetscScalar *x,xk,*t;
1724: const PetscScalar *b;
1725: PetscInt nz,k;
1728: VecGetArrayRead(bb,&b);
1729: VecGetArray(xx,&x);
1730: t = a->solve_work;
1731: ISGetIndices(isrow,&rp);
1733: /* solve U^T*D*y = perm(b) by forward substitution */
1734: for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1735: for (k=0; k<mbs; k++) {
1736: v = aa + ai[k] + 1;
1737: vj = aj + ai[k] + 1;
1738: xk = t[k];
1739: nz = ai[k+1] - ai[k] - 1;
1740: while (nz--) t[*vj++] += (*v++) * xk;
1741: t[k] = xk*aa[ai[k]]; /* aa[k] = 1/D(k) */
1742: }
1744: /* solve U*perm(x) = y by back substitution */
1745: for (k=mbs-1; k>=0; k--) {
1746: v = aa + ai[k] + 1;
1747: vj = aj + ai[k] + 1;
1748: nz = ai[k+1] - ai[k] - 1;
1749: while (nz--) t[k] += (*v++) * t[*vj++];
1750: x[rp[k]] = t[k];
1751: }
1753: ISRestoreIndices(isrow,&rp);
1754: VecRestoreArrayRead(bb,&b);
1755: VecRestoreArray(xx,&x);
1756: PetscLogFlops(4.0*a->nz - 3*mbs);
1757: return(0);
1758: }
1762: PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1763: {
1764: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1765: IS isrow=a->row;
1766: PetscErrorCode ierr;
1767: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1768: const MatScalar *aa=a->a,*v;
1769: PetscReal diagk;
1770: PetscScalar *x,xk;
1771: const PetscScalar *b;
1772: PetscInt nz,k;
1775: /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1776: VecGetArrayRead(bb,&b);
1777: VecGetArray(xx,&x);
1778: ISGetIndices(isrow,&rp);
1780: for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1781: for (k=0; k<mbs; k++) {
1782: v = aa + ai[k];
1783: vj = aj + ai[k];
1784: xk = x[k];
1785: nz = ai[k+1] - ai[k] - 1;
1786: while (nz--) x[*vj++] += (*v++) * xk;
1788: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1789: if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1790: x[k] = xk*PetscSqrtReal(diagk);
1791: }
1792: ISRestoreIndices(isrow,&rp);
1793: VecRestoreArrayRead(bb,&b);
1794: VecRestoreArray(xx,&x);
1795: PetscLogFlops(2.0*a->nz - mbs);
1796: return(0);
1797: }
1801: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1802: {
1803: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1804: IS isrow=a->row;
1805: PetscErrorCode ierr;
1806: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1807: const MatScalar *aa=a->a,*v;
1808: PetscReal diagk;
1809: PetscScalar *x,xk;
1810: const PetscScalar *b;
1811: PetscInt nz,k;
1814: /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1815: VecGetArrayRead(bb,&b);
1816: VecGetArray(xx,&x);
1817: ISGetIndices(isrow,&rp);
1819: for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1820: for (k=0; k<mbs; k++) {
1821: v = aa + ai[k] + 1;
1822: vj = aj + ai[k] + 1;
1823: xk = x[k];
1824: nz = ai[k+1] - ai[k] - 1;
1825: while (nz--) x[*vj++] += (*v++) * xk;
1827: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1828: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1829: x[k] = xk*PetscSqrtReal(diagk);
1830: }
1831: ISRestoreIndices(isrow,&rp);
1832: VecRestoreArrayRead(bb,&b);
1833: VecRestoreArray(xx,&x);
1834: PetscLogFlops(2.0*a->nz - mbs);
1835: return(0);
1836: }
1840: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1841: {
1842: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1843: IS isrow=a->row;
1844: PetscErrorCode ierr;
1845: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1846: const MatScalar *aa=a->a,*v;
1847: PetscReal diagk;
1848: PetscScalar *x,*t;
1849: const PetscScalar *b;
1850: PetscInt nz,k;
1853: /* solve D^(1/2)*U*perm(x) = b by back substitution */
1854: VecGetArrayRead(bb,&b);
1855: VecGetArray(xx,&x);
1856: t = a->solve_work;
1857: ISGetIndices(isrow,&rp);
1859: for (k=mbs-1; k>=0; k--) {
1860: v = aa + ai[k];
1861: vj = aj + ai[k];
1862: diagk = PetscRealPart(aa[adiag[k]]);
1863: if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1864: t[k] = b[k] * PetscSqrtReal(diagk);
1865: nz = ai[k+1] - ai[k] - 1;
1866: while (nz--) t[k] += (*v++) * t[*vj++];
1867: x[rp[k]] = t[k];
1868: }
1869: ISRestoreIndices(isrow,&rp);
1870: VecRestoreArrayRead(bb,&b);
1871: VecRestoreArray(xx,&x);
1872: PetscLogFlops(2.0*a->nz - mbs);
1873: return(0);
1874: }
1878: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1879: {
1880: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1881: IS isrow=a->row;
1882: PetscErrorCode ierr;
1883: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1884: const MatScalar *aa=a->a,*v;
1885: PetscReal diagk;
1886: PetscScalar *x,*t;
1887: const PetscScalar *b;
1888: PetscInt nz,k;
1891: /* solve D^(1/2)*U*perm(x) = b by back substitution */
1892: VecGetArrayRead(bb,&b);
1893: VecGetArray(xx,&x);
1894: t = a->solve_work;
1895: ISGetIndices(isrow,&rp);
1897: for (k=mbs-1; k>=0; k--) {
1898: v = aa + ai[k] + 1;
1899: vj = aj + ai[k] + 1;
1900: diagk = PetscRealPart(aa[ai[k]]);
1901: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1902: t[k] = b[k] * PetscSqrtReal(diagk);
1903: nz = ai[k+1] - ai[k] - 1;
1904: while (nz--) t[k] += (*v++) * t[*vj++];
1905: x[rp[k]] = t[k];
1906: }
1907: ISRestoreIndices(isrow,&rp);
1908: VecRestoreArrayRead(bb,&b);
1909: VecRestoreArray(xx,&x);
1910: PetscLogFlops(2.0*a->nz - mbs);
1911: return(0);
1912: }
1916: PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1917: {
1918: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1922: if (A->rmap->bs == 1) {
1923: MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);
1924: } else {
1925: IS isrow=a->row;
1926: const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1927: const MatScalar *aa=a->a,*v;
1928: PetscScalar *x,*t;
1929: const PetscScalar *b;
1930: PetscInt nz,k,n,i,j;
1932: if (bb->n > a->solves_work_n) {
1933: PetscFree(a->solves_work);
1934: PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);
1935: a->solves_work_n = bb->n;
1936: }
1937: n = bb->n;
1938: VecGetArrayRead(bb->v,&b);
1939: VecGetArray(xx->v,&x);
1940: t = a->solves_work;
1942: ISGetIndices(isrow,&rp);
1944: /* solve U^T*D*y = perm(b) by forward substitution */
1945: for (k=0; k<mbs; k++) {
1946: for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */
1947: }
1948: for (k=0; k<mbs; k++) {
1949: v = aa + ai[k];
1950: vj = aj + ai[k];
1951: nz = ai[k+1] - ai[k] - 1;
1952: for (j=0; j<nz; j++) {
1953: for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1954: v++;vj++;
1955: }
1956: for (i=0; i<n; i++) t[n*k+i] *= aa[nz]; /* note: aa[nz] = 1/D(k) */
1957: }
1959: /* solve U*perm(x) = y by back substitution */
1960: for (k=mbs-1; k>=0; k--) {
1961: v = aa + ai[k] - 1;
1962: vj = aj + ai[k] - 1;
1963: nz = ai[k+1] - ai[k] - 1;
1964: for (j=0; j<nz; j++) {
1965: for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1966: v++;vj++;
1967: }
1968: for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1969: }
1971: ISRestoreIndices(isrow,&rp);
1972: VecRestoreArrayRead(bb->v,&b);
1973: VecRestoreArray(xx->v,&x);
1974: PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1975: }
1976: return(0);
1977: }
1981: PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx)
1982: {
1983: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1987: if (A->rmap->bs == 1) {
1988: MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);
1989: } else {
1990: IS isrow=a->row;
1991: const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1992: const MatScalar *aa=a->a,*v;
1993: PetscScalar *x,*t;
1994: const PetscScalar *b;
1995: PetscInt nz,k,n,i;
1997: if (bb->n > a->solves_work_n) {
1998: PetscFree(a->solves_work);
1999: PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);
2000: a->solves_work_n = bb->n;
2001: }
2002: n = bb->n;
2003: VecGetArrayRead(bb->v,&b);
2004: VecGetArray(xx->v,&x);
2005: t = a->solves_work;
2007: ISGetIndices(isrow,&rp);
2009: /* solve U^T*D*y = perm(b) by forward substitution */
2010: for (k=0; k<mbs; k++) {
2011: for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */
2012: }
2013: for (k=0; k<mbs; k++) {
2014: v = aa + ai[k];
2015: vj = aj + ai[k];
2016: nz = ai[k+1] - ai[k];
2017: while (nz--) {
2018: for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
2019: v++;vj++;
2020: }
2021: for (i=0; i<n; i++) t[n*k+i] *= aa[k]; /* note: aa[k] = 1/D(k) */
2022: }
2024: /* solve U*perm(x) = y by back substitution */
2025: for (k=mbs-1; k>=0; k--) {
2026: v = aa + ai[k];
2027: vj = aj + ai[k];
2028: nz = ai[k+1] - ai[k];
2029: while (nz--) {
2030: for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
2031: v++;vj++;
2032: }
2033: for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
2034: }
2036: ISRestoreIndices(isrow,&rp);
2037: VecRestoreArrayRead(bb->v,&b);
2038: VecRestoreArray(xx->v,&x);
2039: PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
2040: }
2041: return(0);
2042: }
2046: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2047: {
2048: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2049: PetscErrorCode ierr;
2050: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag = a->diag;
2051: const MatScalar *aa=a->a,*v;
2052: const PetscScalar *b;
2053: PetscScalar *x,xi;
2054: PetscInt nz,i,j;
2057: VecGetArrayRead(bb,&b);
2058: VecGetArray(xx,&x);
2060: /* solve U^T*D*y = b by forward substitution */
2061: PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
2062: for (i=0; i<mbs; i++) {
2063: v = aa + ai[i];
2064: vj = aj + ai[i];
2065: xi = x[i];
2066: nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
2067: for (j=0; j<nz; j++) x[vj[j]] += v[j]* xi;
2068: x[i] = xi*v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
2069: }
2071: /* solve U*x = y by backward substitution */
2072: for (i=mbs-2; i>=0; i--) {
2073: xi = x[i];
2074: v = aa + adiag[i] - 1; /* end of row i, excluding diag */
2075: vj = aj + adiag[i] - 1;
2076: nz = ai[i+1] - ai[i] - 1;
2077: for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
2078: x[i] = xi;
2079: }
2081: VecRestoreArrayRead(bb,&b);
2082: VecRestoreArray(xx,&x);
2083: PetscLogFlops(4.0*a->nz - 3*mbs);
2084: return(0);
2085: }
2089: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2090: {
2091: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2092: PetscErrorCode ierr;
2093: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2094: const MatScalar *aa=a->a,*v;
2095: PetscScalar *x,xk;
2096: const PetscScalar *b;
2097: PetscInt nz,k;
2100: VecGetArrayRead(bb,&b);
2101: VecGetArray(xx,&x);
2103: /* solve U^T*D*y = b by forward substitution */
2104: PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
2105: for (k=0; k<mbs; k++) {
2106: v = aa + ai[k] + 1;
2107: vj = aj + ai[k] + 1;
2108: xk = x[k];
2109: nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */
2110: while (nz--) x[*vj++] += (*v++) * xk;
2111: x[k] = xk*aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */
2112: }
2114: /* solve U*x = y by back substitution */
2115: for (k=mbs-2; k>=0; k--) {
2116: v = aa + ai[k] + 1;
2117: vj = aj + ai[k] + 1;
2118: xk = x[k];
2119: nz = ai[k+1] - ai[k] - 1;
2120: while (nz--) {
2121: xk += (*v++) * x[*vj++];
2122: }
2123: x[k] = xk;
2124: }
2126: VecRestoreArrayRead(bb,&b);
2127: VecRestoreArray(xx,&x);
2128: PetscLogFlops(4.0*a->nz - 3*mbs);
2129: return(0);
2130: }
2134: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2135: {
2136: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2137: PetscErrorCode ierr;
2138: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag,*vj;
2139: const MatScalar *aa=a->a,*v;
2140: PetscReal diagk;
2141: PetscScalar *x;
2142: const PetscScalar *b;
2143: PetscInt nz,k;
2146: /* solve U^T*D^(1/2)*x = b by forward substitution */
2147: VecGetArrayRead(bb,&b);
2148: VecGetArray(xx,&x);
2149: PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
2150: for (k=0; k<mbs; k++) {
2151: v = aa + ai[k];
2152: vj = aj + ai[k];
2153: nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */
2154: while (nz--) x[*vj++] += (*v++) * x[k];
2155: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2156: if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[adiag[k]]),PetscImaginaryPart(aa[adiag[k]]));
2157: x[k] *= PetscSqrtReal(diagk);
2158: }
2159: VecRestoreArrayRead(bb,&b);
2160: VecRestoreArray(xx,&x);
2161: PetscLogFlops(2.0*a->nz - mbs);
2162: return(0);
2163: }
2167: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2168: {
2169: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2170: PetscErrorCode ierr;
2171: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2172: const MatScalar *aa=a->a,*v;
2173: PetscReal diagk;
2174: PetscScalar *x;
2175: const PetscScalar *b;
2176: PetscInt nz,k;
2179: /* solve U^T*D^(1/2)*x = b by forward substitution */
2180: VecGetArrayRead(bb,&b);
2181: VecGetArray(xx,&x);
2182: PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
2183: for (k=0; k<mbs; k++) {
2184: v = aa + ai[k] + 1;
2185: vj = aj + ai[k] + 1;
2186: nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */
2187: while (nz--) x[*vj++] += (*v++) * x[k];
2188: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2189: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]]));
2190: x[k] *= PetscSqrtReal(diagk);
2191: }
2192: VecRestoreArrayRead(bb,&b);
2193: VecRestoreArray(xx,&x);
2194: PetscLogFlops(2.0*a->nz - mbs);
2195: return(0);
2196: }
2200: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2201: {
2202: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2203: PetscErrorCode ierr;
2204: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag,*vj;
2205: const MatScalar *aa=a->a,*v;
2206: PetscReal diagk;
2207: PetscScalar *x;
2208: const PetscScalar *b;
2209: PetscInt nz,k;
2212: /* solve D^(1/2)*U*x = b by back substitution */
2213: VecGetArrayRead(bb,&b);
2214: VecGetArray(xx,&x);
2216: for (k=mbs-1; k>=0; k--) {
2217: v = aa + ai[k];
2218: vj = aj + ai[k];
2219: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2220: if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2221: x[k] = PetscSqrtReal(diagk)*b[k];
2222: nz = ai[k+1] - ai[k] - 1;
2223: while (nz--) x[k] += (*v++) * x[*vj++];
2224: }
2225: VecRestoreArrayRead(bb,&b);
2226: VecRestoreArray(xx,&x);
2227: PetscLogFlops(2.0*a->nz - mbs);
2228: return(0);
2229: }
2233: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2234: {
2235: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2236: PetscErrorCode ierr;
2237: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2238: const MatScalar *aa=a->a,*v;
2239: PetscReal diagk;
2240: PetscScalar *x;
2241: const PetscScalar *b;
2242: PetscInt nz,k;
2245: /* solve D^(1/2)*U*x = b by back substitution */
2246: VecGetArrayRead(bb,&b);
2247: VecGetArray(xx,&x);
2249: for (k=mbs-1; k>=0; k--) {
2250: v = aa + ai[k] + 1;
2251: vj = aj + ai[k] + 1;
2252: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2253: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2254: x[k] = PetscSqrtReal(diagk)*b[k];
2255: nz = ai[k+1] - ai[k] - 1;
2256: while (nz--) x[k] += (*v++) * x[*vj++];
2257: }
2258: VecRestoreArrayRead(bb,&b);
2259: VecRestoreArray(xx,&x);
2260: PetscLogFlops(2.0*a->nz - mbs);
2261: return(0);
2262: }
2264: /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2267: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info)
2268: {
2269: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
2271: const PetscInt *rip,mbs = a->mbs,*ai,*aj;
2272: PetscInt *jutmp,bs = A->rmap->bs,i;
2273: PetscInt m,reallocs = 0,*levtmp;
2274: PetscInt *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
2275: PetscInt incrlev,*lev,shift,prow,nz;
2276: PetscReal f = info->fill,levels = info->levels;
2277: PetscBool perm_identity;
2280: /* check whether perm is the identity mapping */
2281: ISIdentity(perm,&perm_identity);
2283: if (perm_identity) {
2284: a->permute = PETSC_FALSE;
2285: ai = a->i; aj = a->j;
2286: } else { /* non-trivial permutation */
2287: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2288: a->permute = PETSC_TRUE;
2289: MatReorderingSeqSBAIJ(A, perm);
2290: ai = a->inew; aj = a->jnew;
2291: }
2293: /* initialization */
2294: ISGetIndices(perm,&rip);
2295: umax = (PetscInt)(f*ai[mbs] + 1);
2296: PetscMalloc1(umax,&lev);
2297: umax += mbs + 1;
2298: shift = mbs + 1;
2299: PetscMalloc1(mbs+1,&iu);
2300: PetscMalloc1(umax,&ju);
2301: iu[0] = mbs + 1;
2302: juidx = mbs + 1;
2303: /* prowl: linked list for pivot row */
2304: PetscMalloc3(mbs,&prowl,mbs,&q,mbs,&levtmp);
2305: /* q: linked list for col index */
2307: for (i=0; i<mbs; i++) {
2308: prowl[i] = mbs;
2309: q[i] = 0;
2310: }
2312: /* for each row k */
2313: for (k=0; k<mbs; k++) {
2314: nzk = 0;
2315: q[k] = mbs;
2316: /* copy current row into linked list */
2317: nz = ai[rip[k]+1] - ai[rip[k]];
2318: j = ai[rip[k]];
2319: while (nz--) {
2320: vj = rip[aj[j++]];
2321: if (vj > k) {
2322: qm = k;
2323: do {
2324: m = qm; qm = q[m];
2325: } while (qm < vj);
2326: if (qm == vj) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n");
2327: nzk++;
2328: q[m] = vj;
2329: q[vj] = qm;
2330: levtmp[vj] = 0; /* initialize lev for nonzero element */
2331: }
2332: }
2334: /* modify nonzero structure of k-th row by computing fill-in
2335: for each row prow to be merged in */
2336: prow = k;
2337: prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
2339: while (prow < k) {
2340: /* merge row prow into k-th row */
2341: jmin = iu[prow] + 1;
2342: jmax = iu[prow+1];
2343: qm = k;
2344: for (j=jmin; j<jmax; j++) {
2345: incrlev = lev[j-shift] + 1;
2346: if (incrlev > levels) continue;
2347: vj = ju[j];
2348: do {
2349: m = qm; qm = q[m];
2350: } while (qm < vj);
2351: if (qm != vj) { /* a fill */
2352: nzk++; q[m] = vj; q[vj] = qm; qm = vj;
2353: levtmp[vj] = incrlev;
2354: } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2355: }
2356: prow = prowl[prow]; /* next pivot row */
2357: }
2359: /* add k to row list for first nonzero element in k-th row */
2360: if (nzk > 1) {
2361: i = q[k]; /* col value of first nonzero element in k_th row of U */
2362: prowl[k] = prowl[i]; prowl[i] = k;
2363: }
2364: iu[k+1] = iu[k] + nzk;
2366: /* allocate more space to ju and lev if needed */
2367: if (iu[k+1] > umax) {
2368: /* estimate how much additional space we will need */
2369: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2370: /* just double the memory each time */
2371: maxadd = umax;
2372: if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
2373: umax += maxadd;
2375: /* allocate a longer ju */
2376: PetscMalloc1(umax,&jutmp);
2377: PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));
2378: PetscFree(ju);
2379: ju = jutmp;
2381: PetscMalloc1(umax,&jutmp);
2382: PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(PetscInt));
2383: PetscFree(lev);
2384: lev = jutmp;
2385: reallocs += 2; /* count how many times we realloc */
2386: }
2388: /* save nonzero structure of k-th row in ju */
2389: i=k;
2390: while (nzk--) {
2391: i = q[i];
2392: ju[juidx] = i;
2393: lev[juidx-shift] = levtmp[i];
2394: juidx++;
2395: }
2396: }
2398: #if defined(PETSC_USE_INFO)
2399: if (ai[mbs] != 0) {
2400: PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2401: PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)f,(double)af);
2402: PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2403: PetscInfo1(A,"PCFactorSetFill(pc,%g);\n",(double)af);
2404: PetscInfo(A,"for best performance.\n");
2405: } else {
2406: PetscInfo(A,"Empty matrix.\n");
2407: }
2408: #endif
2410: ISRestoreIndices(perm,&rip);
2411: PetscFree3(prowl,q,levtmp);
2412: PetscFree(lev);
2414: /* put together the new matrix */
2415: MatSeqSBAIJSetPreallocation_SeqSBAIJ(B,bs,0,NULL);
2417: /* PetscLogObjectParent((PetscObject)B,(PetscObject)iperm); */
2418: b = (Mat_SeqSBAIJ*)(B)->data;
2419: PetscFree2(b->imax,b->ilen);
2421: b->singlemalloc = PETSC_FALSE;
2422: b->free_a = PETSC_TRUE;
2423: b->free_ij = PETSC_TRUE;
2424: /* the next line frees the default space generated by the Create() */
2425: PetscFree3(b->a,b->j,b->i);
2426: PetscMalloc1((iu[mbs]+1)*a->bs2,&b->a);
2427: b->j = ju;
2428: b->i = iu;
2429: b->diag = 0;
2430: b->ilen = 0;
2431: b->imax = 0;
2433: ISDestroy(&b->row);
2434: ISDestroy(&b->icol);
2435: b->row = perm;
2436: b->icol = perm;
2437: PetscObjectReference((PetscObject)perm);
2438: PetscObjectReference((PetscObject)perm);
2439: PetscMalloc1(bs*mbs+bs,&b->solve_work);
2440: /* In b structure: Free imax, ilen, old a, old j.
2441: Allocate idnew, solve_work, new a, new j */
2442: PetscLogObjectMemory((PetscObject)B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
2443: b->maxnz = b->nz = iu[mbs];
2445: (B)->info.factor_mallocs = reallocs;
2446: (B)->info.fill_ratio_given = f;
2447: if (ai[mbs] != 0) {
2448: (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2449: } else {
2450: (B)->info.fill_ratio_needed = 0.0;
2451: }
2452: MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);
2453: return(0);
2454: }
2456: /*
2457: See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2458: */
2459: #include <petscbt.h>
2460: #include <../src/mat/utils/freespace.h>
2463: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2464: {
2465: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
2466: PetscErrorCode ierr;
2467: PetscBool perm_identity,free_ij = PETSC_TRUE,missing;
2468: PetscInt bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j;
2469: const PetscInt *rip;
2470: PetscInt reallocs=0,i,*ui,*udiag,*cols;
2471: PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2472: PetscInt nlnk,*lnk,*lnk_lvl=NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr;
2473: PetscReal fill =info->fill,levels=info->levels;
2474: PetscFreeSpaceList free_space =NULL,current_space=NULL;
2475: PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2476: PetscBT lnkbt;
2479: if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
2480: MatMissingDiagonal(A,&missing,&d);
2481: if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
2482: if (bs > 1) {
2483: MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);
2484: return(0);
2485: }
2487: /* check whether perm is the identity mapping */
2488: ISIdentity(perm,&perm_identity);
2489: if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2490: a->permute = PETSC_FALSE;
2492: PetscMalloc1(am+1,&ui);
2493: PetscMalloc1(am+1,&udiag);
2494: ui[0] = 0;
2496: /* ICC(0) without matrix ordering: simply rearrange column indices */
2497: if (!levels) {
2498: /* reuse the column pointers and row offsets for memory savings */
2499: for (i=0; i<am; i++) {
2500: ncols = ai[i+1] - ai[i];
2501: ui[i+1] = ui[i] + ncols;
2502: udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */
2503: }
2504: PetscMalloc1(ui[am]+1,&uj);
2505: cols = uj;
2506: for (i=0; i<am; i++) {
2507: aj = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2508: ncols = ai[i+1] - ai[i] -1;
2509: for (j=0; j<ncols; j++) *cols++ = aj[j];
2510: *cols++ = i; /* diagoanl is located as the last entry of U(i,:) */
2511: }
2512: } else { /* case: levels>0 */
2513: ISGetIndices(perm,&rip);
2515: /* initialization */
2516: /* jl: linked list for storing indices of the pivot rows
2517: il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2518: PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);
2519: for (i=0; i<am; i++) {
2520: jl[i] = am; il[i] = 0;
2521: }
2523: /* create and initialize a linked list for storing column indices of the active row k */
2524: nlnk = am + 1;
2525: PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);
2527: /* initial FreeSpace size is fill*(ai[am]+1) */
2528: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);
2530: current_space = free_space;
2532: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);
2534: current_space_lvl = free_space_lvl;
2536: for (k=0; k<am; k++) { /* for each active row k */
2537: /* initialize lnk by the column indices of row k */
2538: nzk = 0;
2539: ncols = ai[k+1] - ai[k];
2540: if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k);
2541: cols = aj+ai[k];
2542: PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
2543: nzk += nlnk;
2545: /* update lnk by computing fill-in for each pivot row to be merged in */
2546: prow = jl[k]; /* 1st pivot row */
2548: while (prow < k) {
2549: nextprow = jl[prow];
2551: /* merge prow into k-th row */
2552: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2553: jmax = ui[prow+1];
2554: ncols = jmax-jmin;
2555: i = jmin - ui[prow];
2557: cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2558: uj = uj_lvl_ptr[prow] + i; /* levels of cols */
2559: j = *(uj - 1);
2560: PetscICCLLAddSorted(ncols,cols,levels,uj,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2561: nzk += nlnk;
2563: /* update il and jl for prow */
2564: if (jmin < jmax) {
2565: il[prow] = jmin;
2567: j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2568: }
2569: prow = nextprow;
2570: }
2572: /* if free space is not available, make more free space */
2573: if (current_space->local_remaining<nzk) {
2574: i = am - k + 1; /* num of unfactored rows */
2575: i = PetscIntMultTruncate(i,PetscMin(nzk, i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2576: PetscFreeSpaceGet(i,¤t_space);
2577: PetscFreeSpaceGet(i,¤t_space_lvl);
2578: reallocs++;
2579: }
2581: /* copy data into free_space and free_space_lvl, then initialize lnk */
2582: if (nzk == 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Empty row %D in ICC matrix factor",k);
2583: PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);
2585: /* add the k-th row into il and jl */
2586: if (nzk > 1) {
2587: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2588: jl[k] = jl[i]; jl[i] = k;
2589: il[k] = ui[k] + 1;
2590: }
2591: uj_ptr[k] = current_space->array;
2592: uj_lvl_ptr[k] = current_space_lvl->array;
2594: current_space->array += nzk;
2595: current_space->local_used += nzk;
2596: current_space->local_remaining -= nzk;
2597: current_space_lvl->array += nzk;
2598: current_space_lvl->local_used += nzk;
2599: current_space_lvl->local_remaining -= nzk;
2601: ui[k+1] = ui[k] + nzk;
2602: }
2604: ISRestoreIndices(perm,&rip);
2605: PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);
2607: /* destroy list of free space and other temporary array(s) */
2608: PetscMalloc1(ui[am]+1,&uj);
2609: PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag); /* store matrix factor */
2610: PetscIncompleteLLDestroy(lnk,lnkbt);
2611: PetscFreeSpaceDestroy(free_space_lvl);
2613: } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2615: /* put together the new matrix in MATSEQSBAIJ format */
2616: MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,NULL);
2618: b = (Mat_SeqSBAIJ*)(fact)->data;
2619: PetscFree2(b->imax,b->ilen);
2621: b->singlemalloc = PETSC_FALSE;
2622: b->free_a = PETSC_TRUE;
2623: b->free_ij = free_ij;
2625: PetscMalloc1(ui[am]+1,&b->a);
2627: b->j = uj;
2628: b->i = ui;
2629: b->diag = udiag;
2630: b->free_diag = PETSC_TRUE;
2631: b->ilen = 0;
2632: b->imax = 0;
2633: b->row = perm;
2634: b->col = perm;
2636: PetscObjectReference((PetscObject)perm);
2637: PetscObjectReference((PetscObject)perm);
2639: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2641: PetscMalloc1(am+1,&b->solve_work);
2642: PetscLogObjectMemory((PetscObject)fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));
2644: b->maxnz = b->nz = ui[am];
2646: fact->info.factor_mallocs = reallocs;
2647: fact->info.fill_ratio_given = fill;
2648: if (ai[am] != 0) {
2649: fact->info.fill_ratio_needed = ((PetscReal)ui[am])/ai[am];
2650: } else {
2651: fact->info.fill_ratio_needed = 0.0;
2652: }
2653: #if defined(PETSC_USE_INFO)
2654: if (ai[am] != 0) {
2655: PetscReal af = fact->info.fill_ratio_needed;
2656: PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);
2657: PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2658: PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);
2659: } else {
2660: PetscInfo(A,"Empty matrix.\n");
2661: }
2662: #endif
2663: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2664: return(0);
2665: }
2669: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2670: {
2671: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2672: Mat_SeqSBAIJ *b;
2673: PetscErrorCode ierr;
2674: PetscBool perm_identity,free_ij = PETSC_TRUE;
2675: PetscInt bs=A->rmap->bs,am=a->mbs;
2676: const PetscInt *cols,*rip,*ai=a->i,*aj=a->j;
2677: PetscInt reallocs=0,i,*ui;
2678: PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2679: PetscInt nlnk,*lnk,*lnk_lvl=NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
2680: PetscReal fill =info->fill,levels=info->levels,ratio_needed;
2681: PetscFreeSpaceList free_space =NULL,current_space=NULL;
2682: PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2683: PetscBT lnkbt;
2686: /*
2687: This code originally uses Modified Sparse Row (MSR) storage
2688: (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2689: Then it is rewritten so the factor B takes seqsbaij format. However the associated
2690: MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2691: thus the original code in MSR format is still used for these cases.
2692: The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2693: MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2694: */
2695: if (bs > 1) {
2696: MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);
2697: return(0);
2698: }
2700: /* check whether perm is the identity mapping */
2701: ISIdentity(perm,&perm_identity);
2702: if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2703: a->permute = PETSC_FALSE;
2705: /* special case that simply copies fill pattern */
2706: if (!levels) {
2707: /* reuse the column pointers and row offsets for memory savings */
2708: ui = a->i;
2709: uj = a->j;
2710: free_ij = PETSC_FALSE;
2711: ratio_needed = 1.0;
2712: } else { /* case: levels>0 */
2713: ISGetIndices(perm,&rip);
2715: /* initialization */
2716: PetscMalloc1(am+1,&ui);
2717: ui[0] = 0;
2719: /* jl: linked list for storing indices of the pivot rows
2720: il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2721: PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);
2722: for (i=0; i<am; i++) {
2723: jl[i] = am; il[i] = 0;
2724: }
2726: /* create and initialize a linked list for storing column indices of the active row k */
2727: nlnk = am + 1;
2728: PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);
2730: /* initial FreeSpace size is fill*(ai[am]+1) */
2731: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);
2733: current_space = free_space;
2735: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);
2737: current_space_lvl = free_space_lvl;
2739: for (k=0; k<am; k++) { /* for each active row k */
2740: /* initialize lnk by the column indices of row rip[k] */
2741: nzk = 0;
2742: ncols = ai[rip[k]+1] - ai[rip[k]];
2743: cols = aj+ai[rip[k]];
2744: PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
2745: nzk += nlnk;
2747: /* update lnk by computing fill-in for each pivot row to be merged in */
2748: prow = jl[k]; /* 1st pivot row */
2750: while (prow < k) {
2751: nextprow = jl[prow];
2753: /* merge prow into k-th row */
2754: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2755: jmax = ui[prow+1];
2756: ncols = jmax-jmin;
2757: i = jmin - ui[prow];
2758: cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2759: j = *(uj_lvl_ptr[prow] + i - 1);
2760: cols_lvl = uj_lvl_ptr[prow]+i;
2761: PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2762: nzk += nlnk;
2764: /* update il and jl for prow */
2765: if (jmin < jmax) {
2766: il[prow] = jmin;
2767: j = *cols;
2768: jl[prow] = jl[j];
2769: jl[j] = prow;
2770: }
2771: prow = nextprow;
2772: }
2774: /* if free space is not available, make more free space */
2775: if (current_space->local_remaining<nzk) {
2776: i = am - k + 1; /* num of unfactored rows */
2777: i = PetscMin(PetscIntMultTruncate(i,nzk), PetscIntMultTruncate(i,i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2778: PetscFreeSpaceGet(i,¤t_space);
2779: PetscFreeSpaceGet(i,¤t_space_lvl);
2780: reallocs++;
2781: }
2783: /* copy data into free_space and free_space_lvl, then initialize lnk */
2784: PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);
2786: /* add the k-th row into il and jl */
2787: if (nzk-1 > 0) {
2788: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2789: jl[k] = jl[i]; jl[i] = k;
2790: il[k] = ui[k] + 1;
2791: }
2792: uj_ptr[k] = current_space->array;
2793: uj_lvl_ptr[k] = current_space_lvl->array;
2795: current_space->array += nzk;
2796: current_space->local_used += nzk;
2797: current_space->local_remaining -= nzk;
2798: current_space_lvl->array += nzk;
2799: current_space_lvl->local_used += nzk;
2800: current_space_lvl->local_remaining -= nzk;
2802: ui[k+1] = ui[k] + nzk;
2803: }
2805: ISRestoreIndices(perm,&rip);
2806: PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);
2808: /* destroy list of free space and other temporary array(s) */
2809: PetscMalloc1(ui[am]+1,&uj);
2810: PetscFreeSpaceContiguous(&free_space,uj);
2811: PetscIncompleteLLDestroy(lnk,lnkbt);
2812: PetscFreeSpaceDestroy(free_space_lvl);
2813: if (ai[am] != 0) {
2814: ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2815: } else {
2816: ratio_needed = 0.0;
2817: }
2818: } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2820: /* put together the new matrix in MATSEQSBAIJ format */
2821: MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,NULL);
2823: b = (Mat_SeqSBAIJ*)(fact)->data;
2825: PetscFree2(b->imax,b->ilen);
2827: b->singlemalloc = PETSC_FALSE;
2828: b->free_a = PETSC_TRUE;
2829: b->free_ij = free_ij;
2831: PetscMalloc1(ui[am]+1,&b->a);
2833: b->j = uj;
2834: b->i = ui;
2835: b->diag = 0;
2836: b->ilen = 0;
2837: b->imax = 0;
2838: b->row = perm;
2839: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2841: PetscObjectReference((PetscObject)perm);
2842: b->icol = perm;
2843: PetscObjectReference((PetscObject)perm);
2844: PetscMalloc1(am+1,&b->solve_work);
2846: b->maxnz = b->nz = ui[am];
2848: fact->info.factor_mallocs = reallocs;
2849: fact->info.fill_ratio_given = fill;
2850: fact->info.fill_ratio_needed = ratio_needed;
2851: #if defined(PETSC_USE_INFO)
2852: if (ai[am] != 0) {
2853: PetscReal af = fact->info.fill_ratio_needed;
2854: PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);
2855: PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2856: PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);
2857: } else {
2858: PetscInfo(A,"Empty matrix.\n");
2859: }
2860: #endif
2861: if (perm_identity) {
2862: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2863: } else {
2864: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2865: }
2866: return(0);
2867: }